linux/kernel/events/core.c
<<
>>
Prefs
   1/*
   2 * Performance events core code:
   3 *
   4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   6 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
   7 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8 *
   9 * For licensing details see kernel-base/COPYING
  10 */
  11
  12#include <linux/fs.h>
  13#include <linux/mm.h>
  14#include <linux/cpu.h>
  15#include <linux/smp.h>
  16#include <linux/idr.h>
  17#include <linux/file.h>
  18#include <linux/poll.h>
  19#include <linux/slab.h>
  20#include <linux/hash.h>
  21#include <linux/tick.h>
  22#include <linux/sysfs.h>
  23#include <linux/dcache.h>
  24#include <linux/percpu.h>
  25#include <linux/ptrace.h>
  26#include <linux/reboot.h>
  27#include <linux/vmstat.h>
  28#include <linux/device.h>
  29#include <linux/export.h>
  30#include <linux/vmalloc.h>
  31#include <linux/hardirq.h>
  32#include <linux/rculist.h>
  33#include <linux/uaccess.h>
  34#include <linux/syscalls.h>
  35#include <linux/anon_inodes.h>
  36#include <linux/kernel_stat.h>
  37#include <linux/cgroup.h>
  38#include <linux/perf_event.h>
  39#include <linux/trace_events.h>
  40#include <linux/hw_breakpoint.h>
  41#include <linux/mm_types.h>
  42#include <linux/module.h>
  43#include <linux/mman.h>
  44#include <linux/compat.h>
  45#include <linux/bpf.h>
  46#include <linux/filter.h>
  47
  48#include "internal.h"
  49
  50#include <asm/irq_regs.h>
  51
  52static struct workqueue_struct *perf_wq;
  53
  54typedef int (*remote_function_f)(void *);
  55
  56struct remote_function_call {
  57        struct task_struct      *p;
  58        remote_function_f       func;
  59        void                    *info;
  60        int                     ret;
  61};
  62
  63static void remote_function(void *data)
  64{
  65        struct remote_function_call *tfc = data;
  66        struct task_struct *p = tfc->p;
  67
  68        if (p) {
  69                tfc->ret = -EAGAIN;
  70                if (task_cpu(p) != smp_processor_id() || !task_curr(p))
  71                        return;
  72        }
  73
  74        tfc->ret = tfc->func(tfc->info);
  75}
  76
  77/**
  78 * task_function_call - call a function on the cpu on which a task runs
  79 * @p:          the task to evaluate
  80 * @func:       the function to be called
  81 * @info:       the function call argument
  82 *
  83 * Calls the function @func when the task is currently running. This might
  84 * be on the current CPU, which just calls the function directly
  85 *
  86 * returns: @func return value, or
  87 *          -ESRCH  - when the process isn't running
  88 *          -EAGAIN - when the process moved away
  89 */
  90static int
  91task_function_call(struct task_struct *p, remote_function_f func, void *info)
  92{
  93        struct remote_function_call data = {
  94                .p      = p,
  95                .func   = func,
  96                .info   = info,
  97                .ret    = -ESRCH, /* No such (running) process */
  98        };
  99
 100        if (task_curr(p))
 101                smp_call_function_single(task_cpu(p), remote_function, &data, 1);
 102
 103        return data.ret;
 104}
 105
 106/**
 107 * cpu_function_call - call a function on the cpu
 108 * @func:       the function to be called
 109 * @info:       the function call argument
 110 *
 111 * Calls the function @func on the remote cpu.
 112 *
 113 * returns: @func return value or -ENXIO when the cpu is offline
 114 */
 115static int cpu_function_call(int cpu, remote_function_f func, void *info)
 116{
 117        struct remote_function_call data = {
 118                .p      = NULL,
 119                .func   = func,
 120                .info   = info,
 121                .ret    = -ENXIO, /* No such CPU */
 122        };
 123
 124        smp_call_function_single(cpu, remote_function, &data, 1);
 125
 126        return data.ret;
 127}
 128
 129#define EVENT_OWNER_KERNEL ((void *) -1)
 130
 131static bool is_kernel_event(struct perf_event *event)
 132{
 133        return event->owner == EVENT_OWNER_KERNEL;
 134}
 135
 136#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 137                       PERF_FLAG_FD_OUTPUT  |\
 138                       PERF_FLAG_PID_CGROUP |\
 139                       PERF_FLAG_FD_CLOEXEC)
 140
 141/*
 142 * branch priv levels that need permission checks
 143 */
 144#define PERF_SAMPLE_BRANCH_PERM_PLM \
 145        (PERF_SAMPLE_BRANCH_KERNEL |\
 146         PERF_SAMPLE_BRANCH_HV)
 147
 148enum event_type_t {
 149        EVENT_FLEXIBLE = 0x1,
 150        EVENT_PINNED = 0x2,
 151        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 152};
 153
 154/*
 155 * perf_sched_events : >0 events exist
 156 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 157 */
 158struct static_key_deferred perf_sched_events __read_mostly;
 159static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 160static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 161
 162static atomic_t nr_mmap_events __read_mostly;
 163static atomic_t nr_comm_events __read_mostly;
 164static atomic_t nr_task_events __read_mostly;
 165static atomic_t nr_freq_events __read_mostly;
 166static atomic_t nr_switch_events __read_mostly;
 167
 168static LIST_HEAD(pmus);
 169static DEFINE_MUTEX(pmus_lock);
 170static struct srcu_struct pmus_srcu;
 171
 172/*
 173 * perf event paranoia level:
 174 *  -1 - not paranoid at all
 175 *   0 - disallow raw tracepoint access for unpriv
 176 *   1 - disallow cpu events for unpriv
 177 *   2 - disallow kernel profiling for unpriv
 178 */
 179int sysctl_perf_event_paranoid __read_mostly = 1;
 180
 181/* Minimum for 512 kiB + 1 user control page */
 182int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 183
 184/*
 185 * max perf event sample rate
 186 */
 187#define DEFAULT_MAX_SAMPLE_RATE         100000
 188#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 189#define DEFAULT_CPU_TIME_MAX_PERCENT    25
 190
 191int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 192
 193static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 194static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
 195
 196static int perf_sample_allowed_ns __read_mostly =
 197        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 198
 199static void update_perf_cpu_limits(void)
 200{
 201        u64 tmp = perf_sample_period_ns;
 202
 203        tmp *= sysctl_perf_cpu_time_max_percent;
 204        do_div(tmp, 100);
 205        ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 206}
 207
 208static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 209
 210int perf_proc_update_handler(struct ctl_table *table, int write,
 211                void __user *buffer, size_t *lenp,
 212                loff_t *ppos)
 213{
 214        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 215
 216        if (ret || !write)
 217                return ret;
 218
 219        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 220        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 221        update_perf_cpu_limits();
 222
 223        return 0;
 224}
 225
 226int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 227
 228int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 229                                void __user *buffer, size_t *lenp,
 230                                loff_t *ppos)
 231{
 232        int ret = proc_dointvec(table, write, buffer, lenp, ppos);
 233
 234        if (ret || !write)
 235                return ret;
 236
 237        update_perf_cpu_limits();
 238
 239        return 0;
 240}
 241
 242/*
 243 * perf samples are done in some very critical code paths (NMIs).
 244 * If they take too much CPU time, the system can lock up and not
 245 * get any real work done.  This will drop the sample rate when
 246 * we detect that events are taking too long.
 247 */
 248#define NR_ACCUMULATED_SAMPLES 128
 249static DEFINE_PER_CPU(u64, running_sample_length);
 250
 251static void perf_duration_warn(struct irq_work *w)
 252{
 253        u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 254        u64 avg_local_sample_len;
 255        u64 local_samples_len;
 256
 257        local_samples_len = __this_cpu_read(running_sample_length);
 258        avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 259
 260        printk_ratelimited(KERN_WARNING
 261                        "perf interrupt took too long (%lld > %lld), lowering "
 262                        "kernel.perf_event_max_sample_rate to %d\n",
 263                        avg_local_sample_len, allowed_ns >> 1,
 264                        sysctl_perf_event_sample_rate);
 265}
 266
 267static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 268
 269void perf_sample_event_took(u64 sample_len_ns)
 270{
 271        u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 272        u64 avg_local_sample_len;
 273        u64 local_samples_len;
 274
 275        if (allowed_ns == 0)
 276                return;
 277
 278        /* decay the counter by 1 average sample */
 279        local_samples_len = __this_cpu_read(running_sample_length);
 280        local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
 281        local_samples_len += sample_len_ns;
 282        __this_cpu_write(running_sample_length, local_samples_len);
 283
 284        /*
 285         * note: this will be biased artifically low until we have
 286         * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
 287         * from having to maintain a count.
 288         */
 289        avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 290
 291        if (avg_local_sample_len <= allowed_ns)
 292                return;
 293
 294        if (max_samples_per_tick <= 1)
 295                return;
 296
 297        max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
 298        sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 299        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 300
 301        update_perf_cpu_limits();
 302
 303        if (!irq_work_queue(&perf_duration_work)) {
 304                early_printk("perf interrupt took too long (%lld > %lld), lowering "
 305                             "kernel.perf_event_max_sample_rate to %d\n",
 306                             avg_local_sample_len, allowed_ns >> 1,
 307                             sysctl_perf_event_sample_rate);
 308        }
 309}
 310
 311static atomic64_t perf_event_id;
 312
 313static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 314                              enum event_type_t event_type);
 315
 316static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 317                             enum event_type_t event_type,
 318                             struct task_struct *task);
 319
 320static void update_context_time(struct perf_event_context *ctx);
 321static u64 perf_event_time(struct perf_event *event);
 322
 323void __weak perf_event_print_debug(void)        { }
 324
 325extern __weak const char *perf_pmu_name(void)
 326{
 327        return "pmu";
 328}
 329
 330static inline u64 perf_clock(void)
 331{
 332        return local_clock();
 333}
 334
 335static inline u64 perf_event_clock(struct perf_event *event)
 336{
 337        return event->clock();
 338}
 339
 340static inline struct perf_cpu_context *
 341__get_cpu_context(struct perf_event_context *ctx)
 342{
 343        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 344}
 345
 346static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 347                          struct perf_event_context *ctx)
 348{
 349        raw_spin_lock(&cpuctx->ctx.lock);
 350        if (ctx)
 351                raw_spin_lock(&ctx->lock);
 352}
 353
 354static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 355                            struct perf_event_context *ctx)
 356{
 357        if (ctx)
 358                raw_spin_unlock(&ctx->lock);
 359        raw_spin_unlock(&cpuctx->ctx.lock);
 360}
 361
 362#ifdef CONFIG_CGROUP_PERF
 363
 364static inline bool
 365perf_cgroup_match(struct perf_event *event)
 366{
 367        struct perf_event_context *ctx = event->ctx;
 368        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 369
 370        /* @event doesn't care about cgroup */
 371        if (!event->cgrp)
 372                return true;
 373
 374        /* wants specific cgroup scope but @cpuctx isn't associated with any */
 375        if (!cpuctx->cgrp)
 376                return false;
 377
 378        /*
 379         * Cgroup scoping is recursive.  An event enabled for a cgroup is
 380         * also enabled for all its descendant cgroups.  If @cpuctx's
 381         * cgroup is a descendant of @event's (the test covers identity
 382         * case), it's a match.
 383         */
 384        return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 385                                    event->cgrp->css.cgroup);
 386}
 387
 388static inline void perf_detach_cgroup(struct perf_event *event)
 389{
 390        css_put(&event->cgrp->css);
 391        event->cgrp = NULL;
 392}
 393
 394static inline int is_cgroup_event(struct perf_event *event)
 395{
 396        return event->cgrp != NULL;
 397}
 398
 399static inline u64 perf_cgroup_event_time(struct perf_event *event)
 400{
 401        struct perf_cgroup_info *t;
 402
 403        t = per_cpu_ptr(event->cgrp->info, event->cpu);
 404        return t->time;
 405}
 406
 407static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 408{
 409        struct perf_cgroup_info *info;
 410        u64 now;
 411
 412        now = perf_clock();
 413
 414        info = this_cpu_ptr(cgrp->info);
 415
 416        info->time += now - info->timestamp;
 417        info->timestamp = now;
 418}
 419
 420static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 421{
 422        struct perf_cgroup *cgrp_out = cpuctx->cgrp;
 423        if (cgrp_out)
 424                __update_cgrp_time(cgrp_out);
 425}
 426
 427static inline void update_cgrp_time_from_event(struct perf_event *event)
 428{
 429        struct perf_cgroup *cgrp;
 430
 431        /*
 432         * ensure we access cgroup data only when needed and
 433         * when we know the cgroup is pinned (css_get)
 434         */
 435        if (!is_cgroup_event(event))
 436                return;
 437
 438        cgrp = perf_cgroup_from_task(current, event->ctx);
 439        /*
 440         * Do not update time when cgroup is not active
 441         */
 442        if (cgrp == event->cgrp)
 443                __update_cgrp_time(event->cgrp);
 444}
 445
 446static inline void
 447perf_cgroup_set_timestamp(struct task_struct *task,
 448                          struct perf_event_context *ctx)
 449{
 450        struct perf_cgroup *cgrp;
 451        struct perf_cgroup_info *info;
 452
 453        /*
 454         * ctx->lock held by caller
 455         * ensure we do not access cgroup data
 456         * unless we have the cgroup pinned (css_get)
 457         */
 458        if (!task || !ctx->nr_cgroups)
 459                return;
 460
 461        cgrp = perf_cgroup_from_task(task, ctx);
 462        info = this_cpu_ptr(cgrp->info);
 463        info->timestamp = ctx->timestamp;
 464}
 465
 466#define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 467#define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 468
 469/*
 470 * reschedule events based on the cgroup constraint of task.
 471 *
 472 * mode SWOUT : schedule out everything
 473 * mode SWIN : schedule in based on cgroup for next
 474 */
 475static void perf_cgroup_switch(struct task_struct *task, int mode)
 476{
 477        struct perf_cpu_context *cpuctx;
 478        struct pmu *pmu;
 479        unsigned long flags;
 480
 481        /*
 482         * disable interrupts to avoid geting nr_cgroup
 483         * changes via __perf_event_disable(). Also
 484         * avoids preemption.
 485         */
 486        local_irq_save(flags);
 487
 488        /*
 489         * we reschedule only in the presence of cgroup
 490         * constrained events.
 491         */
 492
 493        list_for_each_entry_rcu(pmu, &pmus, entry) {
 494                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 495                if (cpuctx->unique_pmu != pmu)
 496                        continue; /* ensure we process each cpuctx once */
 497
 498                /*
 499                 * perf_cgroup_events says at least one
 500                 * context on this CPU has cgroup events.
 501                 *
 502                 * ctx->nr_cgroups reports the number of cgroup
 503                 * events for a context.
 504                 */
 505                if (cpuctx->ctx.nr_cgroups > 0) {
 506                        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 507                        perf_pmu_disable(cpuctx->ctx.pmu);
 508
 509                        if (mode & PERF_CGROUP_SWOUT) {
 510                                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 511                                /*
 512                                 * must not be done before ctxswout due
 513                                 * to event_filter_match() in event_sched_out()
 514                                 */
 515                                cpuctx->cgrp = NULL;
 516                        }
 517
 518                        if (mode & PERF_CGROUP_SWIN) {
 519                                WARN_ON_ONCE(cpuctx->cgrp);
 520                                /*
 521                                 * set cgrp before ctxsw in to allow
 522                                 * event_filter_match() to not have to pass
 523                                 * task around
 524                                 * we pass the cpuctx->ctx to perf_cgroup_from_task()
 525                                 * because cgorup events are only per-cpu
 526                                 */
 527                                cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
 528                                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 529                        }
 530                        perf_pmu_enable(cpuctx->ctx.pmu);
 531                        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 532                }
 533        }
 534
 535        local_irq_restore(flags);
 536}
 537
 538static inline void perf_cgroup_sched_out(struct task_struct *task,
 539                                         struct task_struct *next)
 540{
 541        struct perf_cgroup *cgrp1;
 542        struct perf_cgroup *cgrp2 = NULL;
 543
 544        rcu_read_lock();
 545        /*
 546         * we come here when we know perf_cgroup_events > 0
 547         * we do not need to pass the ctx here because we know
 548         * we are holding the rcu lock
 549         */
 550        cgrp1 = perf_cgroup_from_task(task, NULL);
 551
 552        /*
 553         * next is NULL when called from perf_event_enable_on_exec()
 554         * that will systematically cause a cgroup_switch()
 555         */
 556        if (next)
 557                cgrp2 = perf_cgroup_from_task(next, NULL);
 558
 559        /*
 560         * only schedule out current cgroup events if we know
 561         * that we are switching to a different cgroup. Otherwise,
 562         * do no touch the cgroup events.
 563         */
 564        if (cgrp1 != cgrp2)
 565                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 566
 567        rcu_read_unlock();
 568}
 569
 570static inline void perf_cgroup_sched_in(struct task_struct *prev,
 571                                        struct task_struct *task)
 572{
 573        struct perf_cgroup *cgrp1;
 574        struct perf_cgroup *cgrp2 = NULL;
 575
 576        rcu_read_lock();
 577        /*
 578         * we come here when we know perf_cgroup_events > 0
 579         * we do not need to pass the ctx here because we know
 580         * we are holding the rcu lock
 581         */
 582        cgrp1 = perf_cgroup_from_task(task, NULL);
 583
 584        /* prev can never be NULL */
 585        cgrp2 = perf_cgroup_from_task(prev, NULL);
 586
 587        /*
 588         * only need to schedule in cgroup events if we are changing
 589         * cgroup during ctxsw. Cgroup events were not scheduled
 590         * out of ctxsw out if that was not the case.
 591         */
 592        if (cgrp1 != cgrp2)
 593                perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 594
 595        rcu_read_unlock();
 596}
 597
 598static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 599                                      struct perf_event_attr *attr,
 600                                      struct perf_event *group_leader)
 601{
 602        struct perf_cgroup *cgrp;
 603        struct cgroup_subsys_state *css;
 604        struct fd f = fdget(fd);
 605        int ret = 0;
 606
 607        if (!f.file)
 608                return -EBADF;
 609
 610        css = css_tryget_online_from_dir(f.file->f_path.dentry,
 611                                         &perf_event_cgrp_subsys);
 612        if (IS_ERR(css)) {
 613                ret = PTR_ERR(css);
 614                goto out;
 615        }
 616
 617        cgrp = container_of(css, struct perf_cgroup, css);
 618        event->cgrp = cgrp;
 619
 620        /*
 621         * all events in a group must monitor
 622         * the same cgroup because a task belongs
 623         * to only one perf cgroup at a time
 624         */
 625        if (group_leader && group_leader->cgrp != cgrp) {
 626                perf_detach_cgroup(event);
 627                ret = -EINVAL;
 628        }
 629out:
 630        fdput(f);
 631        return ret;
 632}
 633
 634static inline void
 635perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 636{
 637        struct perf_cgroup_info *t;
 638        t = per_cpu_ptr(event->cgrp->info, event->cpu);
 639        event->shadow_ctx_time = now - t->timestamp;
 640}
 641
 642static inline void
 643perf_cgroup_defer_enabled(struct perf_event *event)
 644{
 645        /*
 646         * when the current task's perf cgroup does not match
 647         * the event's, we need to remember to call the
 648         * perf_mark_enable() function the first time a task with
 649         * a matching perf cgroup is scheduled in.
 650         */
 651        if (is_cgroup_event(event) && !perf_cgroup_match(event))
 652                event->cgrp_defer_enabled = 1;
 653}
 654
 655static inline void
 656perf_cgroup_mark_enabled(struct perf_event *event,
 657                         struct perf_event_context *ctx)
 658{
 659        struct perf_event *sub;
 660        u64 tstamp = perf_event_time(event);
 661
 662        if (!event->cgrp_defer_enabled)
 663                return;
 664
 665        event->cgrp_defer_enabled = 0;
 666
 667        event->tstamp_enabled = tstamp - event->total_time_enabled;
 668        list_for_each_entry(sub, &event->sibling_list, group_entry) {
 669                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 670                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 671                        sub->cgrp_defer_enabled = 0;
 672                }
 673        }
 674}
 675#else /* !CONFIG_CGROUP_PERF */
 676
 677static inline bool
 678perf_cgroup_match(struct perf_event *event)
 679{
 680        return true;
 681}
 682
 683static inline void perf_detach_cgroup(struct perf_event *event)
 684{}
 685
 686static inline int is_cgroup_event(struct perf_event *event)
 687{
 688        return 0;
 689}
 690
 691static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
 692{
 693        return 0;
 694}
 695
 696static inline void update_cgrp_time_from_event(struct perf_event *event)
 697{
 698}
 699
 700static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 701{
 702}
 703
 704static inline void perf_cgroup_sched_out(struct task_struct *task,
 705                                         struct task_struct *next)
 706{
 707}
 708
 709static inline void perf_cgroup_sched_in(struct task_struct *prev,
 710                                        struct task_struct *task)
 711{
 712}
 713
 714static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 715                                      struct perf_event_attr *attr,
 716                                      struct perf_event *group_leader)
 717{
 718        return -EINVAL;
 719}
 720
 721static inline void
 722perf_cgroup_set_timestamp(struct task_struct *task,
 723                          struct perf_event_context *ctx)
 724{
 725}
 726
 727void
 728perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
 729{
 730}
 731
 732static inline void
 733perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 734{
 735}
 736
 737static inline u64 perf_cgroup_event_time(struct perf_event *event)
 738{
 739        return 0;
 740}
 741
 742static inline void
 743perf_cgroup_defer_enabled(struct perf_event *event)
 744{
 745}
 746
 747static inline void
 748perf_cgroup_mark_enabled(struct perf_event *event,
 749                         struct perf_event_context *ctx)
 750{
 751}
 752#endif
 753
 754/*
 755 * set default to be dependent on timer tick just
 756 * like original code
 757 */
 758#define PERF_CPU_HRTIMER (1000 / HZ)
 759/*
 760 * function must be called with interrupts disbled
 761 */
 762static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 763{
 764        struct perf_cpu_context *cpuctx;
 765        int rotations = 0;
 766
 767        WARN_ON(!irqs_disabled());
 768
 769        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 770        rotations = perf_rotate_context(cpuctx);
 771
 772        raw_spin_lock(&cpuctx->hrtimer_lock);
 773        if (rotations)
 774                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
 775        else
 776                cpuctx->hrtimer_active = 0;
 777        raw_spin_unlock(&cpuctx->hrtimer_lock);
 778
 779        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 780}
 781
 782static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 783{
 784        struct hrtimer *timer = &cpuctx->hrtimer;
 785        struct pmu *pmu = cpuctx->ctx.pmu;
 786        u64 interval;
 787
 788        /* no multiplexing needed for SW PMU */
 789        if (pmu->task_ctx_nr == perf_sw_context)
 790                return;
 791
 792        /*
 793         * check default is sane, if not set then force to
 794         * default interval (1/tick)
 795         */
 796        interval = pmu->hrtimer_interval_ms;
 797        if (interval < 1)
 798                interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 799
 800        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 801
 802        raw_spin_lock_init(&cpuctx->hrtimer_lock);
 803        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 804        timer->function = perf_mux_hrtimer_handler;
 805}
 806
 807static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 808{
 809        struct hrtimer *timer = &cpuctx->hrtimer;
 810        struct pmu *pmu = cpuctx->ctx.pmu;
 811        unsigned long flags;
 812
 813        /* not for SW PMU */
 814        if (pmu->task_ctx_nr == perf_sw_context)
 815                return 0;
 816
 817        raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
 818        if (!cpuctx->hrtimer_active) {
 819                cpuctx->hrtimer_active = 1;
 820                hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
 821                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 822        }
 823        raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 824
 825        return 0;
 826}
 827
 828void perf_pmu_disable(struct pmu *pmu)
 829{
 830        int *count = this_cpu_ptr(pmu->pmu_disable_count);
 831        if (!(*count)++)
 832                pmu->pmu_disable(pmu);
 833}
 834
 835void perf_pmu_enable(struct pmu *pmu)
 836{
 837        int *count = this_cpu_ptr(pmu->pmu_disable_count);
 838        if (!--(*count))
 839                pmu->pmu_enable(pmu);
 840}
 841
 842static DEFINE_PER_CPU(struct list_head, active_ctx_list);
 843
 844/*
 845 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 846 * perf_event_task_tick() are fully serialized because they're strictly cpu
 847 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 848 * disabled, while perf_event_task_tick is called from IRQ context.
 849 */
 850static void perf_event_ctx_activate(struct perf_event_context *ctx)
 851{
 852        struct list_head *head = this_cpu_ptr(&active_ctx_list);
 853
 854        WARN_ON(!irqs_disabled());
 855
 856        WARN_ON(!list_empty(&ctx->active_ctx_list));
 857
 858        list_add(&ctx->active_ctx_list, head);
 859}
 860
 861static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
 862{
 863        WARN_ON(!irqs_disabled());
 864
 865        WARN_ON(list_empty(&ctx->active_ctx_list));
 866
 867        list_del_init(&ctx->active_ctx_list);
 868}
 869
 870static void get_ctx(struct perf_event_context *ctx)
 871{
 872        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 873}
 874
 875static void free_ctx(struct rcu_head *head)
 876{
 877        struct perf_event_context *ctx;
 878
 879        ctx = container_of(head, struct perf_event_context, rcu_head);
 880        kfree(ctx->task_ctx_data);
 881        kfree(ctx);
 882}
 883
 884static void put_ctx(struct perf_event_context *ctx)
 885{
 886        if (atomic_dec_and_test(&ctx->refcount)) {
 887                if (ctx->parent_ctx)
 888                        put_ctx(ctx->parent_ctx);
 889                if (ctx->task)
 890                        put_task_struct(ctx->task);
 891                call_rcu(&ctx->rcu_head, free_ctx);
 892        }
 893}
 894
 895/*
 896 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 897 * perf_pmu_migrate_context() we need some magic.
 898 *
 899 * Those places that change perf_event::ctx will hold both
 900 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 901 *
 902 * Lock ordering is by mutex address. There are two other sites where
 903 * perf_event_context::mutex nests and those are:
 904 *
 905 *  - perf_event_exit_task_context()    [ child , 0 ]
 906 *      __perf_event_exit_task()
 907 *        sync_child_event()
 908 *          put_event()                 [ parent, 1 ]
 909 *
 910 *  - perf_event_init_context()         [ parent, 0 ]
 911 *      inherit_task_group()
 912 *        inherit_group()
 913 *          inherit_event()
 914 *            perf_event_alloc()
 915 *              perf_init_event()
 916 *                perf_try_init_event() [ child , 1 ]
 917 *
 918 * While it appears there is an obvious deadlock here -- the parent and child
 919 * nesting levels are inverted between the two. This is in fact safe because
 920 * life-time rules separate them. That is an exiting task cannot fork, and a
 921 * spawning task cannot (yet) exit.
 922 *
 923 * But remember that that these are parent<->child context relations, and
 924 * migration does not affect children, therefore these two orderings should not
 925 * interact.
 926 *
 927 * The change in perf_event::ctx does not affect children (as claimed above)
 928 * because the sys_perf_event_open() case will install a new event and break
 929 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 930 * concerned with cpuctx and that doesn't have children.
 931 *
 932 * The places that change perf_event::ctx will issue:
 933 *
 934 *   perf_remove_from_context();
 935 *   synchronize_rcu();
 936 *   perf_install_in_context();
 937 *
 938 * to affect the change. The remove_from_context() + synchronize_rcu() should
 939 * quiesce the event, after which we can install it in the new location. This
 940 * means that only external vectors (perf_fops, prctl) can perturb the event
 941 * while in transit. Therefore all such accessors should also acquire
 942 * perf_event_context::mutex to serialize against this.
 943 *
 944 * However; because event->ctx can change while we're waiting to acquire
 945 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 946 * function.
 947 *
 948 * Lock order:
 949 *      task_struct::perf_event_mutex
 950 *        perf_event_context::mutex
 951 *          perf_event_context::lock
 952 *          perf_event::child_mutex;
 953 *          perf_event::mmap_mutex
 954 *          mmap_sem
 955 */
 956static struct perf_event_context *
 957perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
 958{
 959        struct perf_event_context *ctx;
 960
 961again:
 962        rcu_read_lock();
 963        ctx = ACCESS_ONCE(event->ctx);
 964        if (!atomic_inc_not_zero(&ctx->refcount)) {
 965                rcu_read_unlock();
 966                goto again;
 967        }
 968        rcu_read_unlock();
 969
 970        mutex_lock_nested(&ctx->mutex, nesting);
 971        if (event->ctx != ctx) {
 972                mutex_unlock(&ctx->mutex);
 973                put_ctx(ctx);
 974                goto again;
 975        }
 976
 977        return ctx;
 978}
 979
 980static inline struct perf_event_context *
 981perf_event_ctx_lock(struct perf_event *event)
 982{
 983        return perf_event_ctx_lock_nested(event, 0);
 984}
 985
 986static void perf_event_ctx_unlock(struct perf_event *event,
 987                                  struct perf_event_context *ctx)
 988{
 989        mutex_unlock(&ctx->mutex);
 990        put_ctx(ctx);
 991}
 992
 993/*
 994 * This must be done under the ctx->lock, such as to serialize against
 995 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 996 * calling scheduler related locks and ctx->lock nests inside those.
 997 */
 998static __must_check struct perf_event_context *
 999unclone_ctx(struct perf_event_context *ctx)
1000{
1001        struct perf_event_context *parent_ctx = ctx->parent_ctx;
1002
1003        lockdep_assert_held(&ctx->lock);
1004
1005        if (parent_ctx)
1006                ctx->parent_ctx = NULL;
1007        ctx->generation++;
1008
1009        return parent_ctx;
1010}
1011
1012static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1013{
1014        /*
1015         * only top level events have the pid namespace they were created in
1016         */
1017        if (event->parent)
1018                event = event->parent;
1019
1020        return task_tgid_nr_ns(p, event->ns);
1021}
1022
1023static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1024{
1025        /*
1026         * only top level events have the pid namespace they were created in
1027         */
1028        if (event->parent)
1029                event = event->parent;
1030
1031        return task_pid_nr_ns(p, event->ns);
1032}
1033
1034/*
1035 * If we inherit events we want to return the parent event id
1036 * to userspace.
1037 */
1038static u64 primary_event_id(struct perf_event *event)
1039{
1040        u64 id = event->id;
1041
1042        if (event->parent)
1043                id = event->parent->id;
1044
1045        return id;
1046}
1047
1048/*
1049 * Get the perf_event_context for a task and lock it.
1050 * This has to cope with with the fact that until it is locked,
1051 * the context could get moved to another task.
1052 */
1053static struct perf_event_context *
1054perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1055{
1056        struct perf_event_context *ctx;
1057
1058retry:
1059        /*
1060         * One of the few rules of preemptible RCU is that one cannot do
1061         * rcu_read_unlock() while holding a scheduler (or nested) lock when
1062         * part of the read side critical section was irqs-enabled -- see
1063         * rcu_read_unlock_special().
1064         *
1065         * Since ctx->lock nests under rq->lock we must ensure the entire read
1066         * side critical section has interrupts disabled.
1067         */
1068        local_irq_save(*flags);
1069        rcu_read_lock();
1070        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1071        if (ctx) {
1072                /*
1073                 * If this context is a clone of another, it might
1074                 * get swapped for another underneath us by
1075                 * perf_event_task_sched_out, though the
1076                 * rcu_read_lock() protects us from any context
1077                 * getting freed.  Lock the context and check if it
1078                 * got swapped before we could get the lock, and retry
1079                 * if so.  If we locked the right context, then it
1080                 * can't get swapped on us any more.
1081                 */
1082                raw_spin_lock(&ctx->lock);
1083                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1084                        raw_spin_unlock(&ctx->lock);
1085                        rcu_read_unlock();
1086                        local_irq_restore(*flags);
1087                        goto retry;
1088                }
1089
1090                if (!atomic_inc_not_zero(&ctx->refcount)) {
1091                        raw_spin_unlock(&ctx->lock);
1092                        ctx = NULL;
1093                }
1094        }
1095        rcu_read_unlock();
1096        if (!ctx)
1097                local_irq_restore(*flags);
1098        return ctx;
1099}
1100
1101/*
1102 * Get the context for a task and increment its pin_count so it
1103 * can't get swapped to another task.  This also increments its
1104 * reference count so that the context can't get freed.
1105 */
1106static struct perf_event_context *
1107perf_pin_task_context(struct task_struct *task, int ctxn)
1108{
1109        struct perf_event_context *ctx;
1110        unsigned long flags;
1111
1112        ctx = perf_lock_task_context(task, ctxn, &flags);
1113        if (ctx) {
1114                ++ctx->pin_count;
1115                raw_spin_unlock_irqrestore(&ctx->lock, flags);
1116        }
1117        return ctx;
1118}
1119
1120static void perf_unpin_context(struct perf_event_context *ctx)
1121{
1122        unsigned long flags;
1123
1124        raw_spin_lock_irqsave(&ctx->lock, flags);
1125        --ctx->pin_count;
1126        raw_spin_unlock_irqrestore(&ctx->lock, flags);
1127}
1128
1129/*
1130 * Update the record of the current time in a context.
1131 */
1132static void update_context_time(struct perf_event_context *ctx)
1133{
1134        u64 now = perf_clock();
1135
1136        ctx->time += now - ctx->timestamp;
1137        ctx->timestamp = now;
1138}
1139
1140static u64 perf_event_time(struct perf_event *event)
1141{
1142        struct perf_event_context *ctx = event->ctx;
1143
1144        if (is_cgroup_event(event))
1145                return perf_cgroup_event_time(event);
1146
1147        return ctx ? ctx->time : 0;
1148}
1149
1150/*
1151 * Update the total_time_enabled and total_time_running fields for a event.
1152 * The caller of this function needs to hold the ctx->lock.
1153 */
1154static void update_event_times(struct perf_event *event)
1155{
1156        struct perf_event_context *ctx = event->ctx;
1157        u64 run_end;
1158
1159        if (event->state < PERF_EVENT_STATE_INACTIVE ||
1160            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1161                return;
1162        /*
1163         * in cgroup mode, time_enabled represents
1164         * the time the event was enabled AND active
1165         * tasks were in the monitored cgroup. This is
1166         * independent of the activity of the context as
1167         * there may be a mix of cgroup and non-cgroup events.
1168         *
1169         * That is why we treat cgroup events differently
1170         * here.
1171         */
1172        if (is_cgroup_event(event))
1173                run_end = perf_cgroup_event_time(event);
1174        else if (ctx->is_active)
1175                run_end = ctx->time;
1176        else
1177                run_end = event->tstamp_stopped;
1178
1179        event->total_time_enabled = run_end - event->tstamp_enabled;
1180
1181        if (event->state == PERF_EVENT_STATE_INACTIVE)
1182                run_end = event->tstamp_stopped;
1183        else
1184                run_end = perf_event_time(event);
1185
1186        event->total_time_running = run_end - event->tstamp_running;
1187
1188}
1189
1190/*
1191 * Update total_time_enabled and total_time_running for all events in a group.
1192 */
1193static void update_group_times(struct perf_event *leader)
1194{
1195        struct perf_event *event;
1196
1197        update_event_times(leader);
1198        list_for_each_entry(event, &leader->sibling_list, group_entry)
1199                update_event_times(event);
1200}
1201
1202static struct list_head *
1203ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1204{
1205        if (event->attr.pinned)
1206                return &ctx->pinned_groups;
1207        else
1208                return &ctx->flexible_groups;
1209}
1210
1211/*
1212 * Add a event from the lists for its context.
1213 * Must be called with ctx->mutex and ctx->lock held.
1214 */
1215static void
1216list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1217{
1218        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1219        event->attach_state |= PERF_ATTACH_CONTEXT;
1220
1221        /*
1222         * If we're a stand alone event or group leader, we go to the context
1223         * list, group events are kept attached to the group so that
1224         * perf_group_detach can, at all times, locate all siblings.
1225         */
1226        if (event->group_leader == event) {
1227                struct list_head *list;
1228
1229                if (is_software_event(event))
1230                        event->group_flags |= PERF_GROUP_SOFTWARE;
1231
1232                list = ctx_group_list(event, ctx);
1233                list_add_tail(&event->group_entry, list);
1234        }
1235
1236        if (is_cgroup_event(event))
1237                ctx->nr_cgroups++;
1238
1239        list_add_rcu(&event->event_entry, &ctx->event_list);
1240        ctx->nr_events++;
1241        if (event->attr.inherit_stat)
1242                ctx->nr_stat++;
1243
1244        ctx->generation++;
1245}
1246
1247/*
1248 * Initialize event state based on the perf_event_attr::disabled.
1249 */
1250static inline void perf_event__state_init(struct perf_event *event)
1251{
1252        event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1253                                              PERF_EVENT_STATE_INACTIVE;
1254}
1255
1256static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1257{
1258        int entry = sizeof(u64); /* value */
1259        int size = 0;
1260        int nr = 1;
1261
1262        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1263                size += sizeof(u64);
1264
1265        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1266                size += sizeof(u64);
1267
1268        if (event->attr.read_format & PERF_FORMAT_ID)
1269                entry += sizeof(u64);
1270
1271        if (event->attr.read_format & PERF_FORMAT_GROUP) {
1272                nr += nr_siblings;
1273                size += sizeof(u64);
1274        }
1275
1276        size += entry * nr;
1277        event->read_size = size;
1278}
1279
1280static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1281{
1282        struct perf_sample_data *data;
1283        u16 size = 0;
1284
1285        if (sample_type & PERF_SAMPLE_IP)
1286                size += sizeof(data->ip);
1287
1288        if (sample_type & PERF_SAMPLE_ADDR)
1289                size += sizeof(data->addr);
1290
1291        if (sample_type & PERF_SAMPLE_PERIOD)
1292                size += sizeof(data->period);
1293
1294        if (sample_type & PERF_SAMPLE_WEIGHT)
1295                size += sizeof(data->weight);
1296
1297        if (sample_type & PERF_SAMPLE_READ)
1298                size += event->read_size;
1299
1300        if (sample_type & PERF_SAMPLE_DATA_SRC)
1301                size += sizeof(data->data_src.val);
1302
1303        if (sample_type & PERF_SAMPLE_TRANSACTION)
1304                size += sizeof(data->txn);
1305
1306        event->header_size = size;
1307}
1308
1309/*
1310 * Called at perf_event creation and when events are attached/detached from a
1311 * group.
1312 */
1313static void perf_event__header_size(struct perf_event *event)
1314{
1315        __perf_event_read_size(event,
1316                               event->group_leader->nr_siblings);
1317        __perf_event_header_size(event, event->attr.sample_type);
1318}
1319
1320static void perf_event__id_header_size(struct perf_event *event)
1321{
1322        struct perf_sample_data *data;
1323        u64 sample_type = event->attr.sample_type;
1324        u16 size = 0;
1325
1326        if (sample_type & PERF_SAMPLE_TID)
1327                size += sizeof(data->tid_entry);
1328
1329        if (sample_type & PERF_SAMPLE_TIME)
1330                size += sizeof(data->time);
1331
1332        if (sample_type & PERF_SAMPLE_IDENTIFIER)
1333                size += sizeof(data->id);
1334
1335        if (sample_type & PERF_SAMPLE_ID)
1336                size += sizeof(data->id);
1337
1338        if (sample_type & PERF_SAMPLE_STREAM_ID)
1339                size += sizeof(data->stream_id);
1340
1341        if (sample_type & PERF_SAMPLE_CPU)
1342                size += sizeof(data->cpu_entry);
1343
1344        event->id_header_size = size;
1345}
1346
1347static bool perf_event_validate_size(struct perf_event *event)
1348{
1349        /*
1350         * The values computed here will be over-written when we actually
1351         * attach the event.
1352         */
1353        __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1354        __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1355        perf_event__id_header_size(event);
1356
1357        /*
1358         * Sum the lot; should not exceed the 64k limit we have on records.
1359         * Conservative limit to allow for callchains and other variable fields.
1360         */
1361        if (event->read_size + event->header_size +
1362            event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1363                return false;
1364
1365        return true;
1366}
1367
1368static void perf_group_attach(struct perf_event *event)
1369{
1370        struct perf_event *group_leader = event->group_leader, *pos;
1371
1372        /*
1373         * We can have double attach due to group movement in perf_event_open.
1374         */
1375        if (event->attach_state & PERF_ATTACH_GROUP)
1376                return;
1377
1378        event->attach_state |= PERF_ATTACH_GROUP;
1379
1380        if (group_leader == event)
1381                return;
1382
1383        WARN_ON_ONCE(group_leader->ctx != event->ctx);
1384
1385        if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1386                        !is_software_event(event))
1387                group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1388
1389        list_add_tail(&event->group_entry, &group_leader->sibling_list);
1390        group_leader->nr_siblings++;
1391
1392        perf_event__header_size(group_leader);
1393
1394        list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1395                perf_event__header_size(pos);
1396}
1397
1398/*
1399 * Remove a event from the lists for its context.
1400 * Must be called with ctx->mutex and ctx->lock held.
1401 */
1402static void
1403list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1404{
1405        struct perf_cpu_context *cpuctx;
1406
1407        WARN_ON_ONCE(event->ctx != ctx);
1408        lockdep_assert_held(&ctx->lock);
1409
1410        /*
1411         * We can have double detach due to exit/hot-unplug + close.
1412         */
1413        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1414                return;
1415
1416        event->attach_state &= ~PERF_ATTACH_CONTEXT;
1417
1418        if (is_cgroup_event(event)) {
1419                ctx->nr_cgroups--;
1420                cpuctx = __get_cpu_context(ctx);
1421                /*
1422                 * if there are no more cgroup events
1423                 * then cler cgrp to avoid stale pointer
1424                 * in update_cgrp_time_from_cpuctx()
1425                 */
1426                if (!ctx->nr_cgroups)
1427                        cpuctx->cgrp = NULL;
1428        }
1429
1430        ctx->nr_events--;
1431        if (event->attr.inherit_stat)
1432                ctx->nr_stat--;
1433
1434        list_del_rcu(&event->event_entry);
1435
1436        if (event->group_leader == event)
1437                list_del_init(&event->group_entry);
1438
1439        update_group_times(event);
1440
1441        /*
1442         * If event was in error state, then keep it
1443         * that way, otherwise bogus counts will be
1444         * returned on read(). The only way to get out
1445         * of error state is by explicit re-enabling
1446         * of the event
1447         */
1448        if (event->state > PERF_EVENT_STATE_OFF)
1449                event->state = PERF_EVENT_STATE_OFF;
1450
1451        ctx->generation++;
1452}
1453
1454static void perf_group_detach(struct perf_event *event)
1455{
1456        struct perf_event *sibling, *tmp;
1457        struct list_head *list = NULL;
1458
1459        /*
1460         * We can have double detach due to exit/hot-unplug + close.
1461         */
1462        if (!(event->attach_state & PERF_ATTACH_GROUP))
1463                return;
1464
1465        event->attach_state &= ~PERF_ATTACH_GROUP;
1466
1467        /*
1468         * If this is a sibling, remove it from its group.
1469         */
1470        if (event->group_leader != event) {
1471                list_del_init(&event->group_entry);
1472                event->group_leader->nr_siblings--;
1473                goto out;
1474        }
1475
1476        if (!list_empty(&event->group_entry))
1477                list = &event->group_entry;
1478
1479        /*
1480         * If this was a group event with sibling events then
1481         * upgrade the siblings to singleton events by adding them
1482         * to whatever list we are on.
1483         */
1484        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1485                if (list)
1486                        list_move_tail(&sibling->group_entry, list);
1487                sibling->group_leader = sibling;
1488
1489                /* Inherit group flags from the previous leader */
1490                sibling->group_flags = event->group_flags;
1491
1492                WARN_ON_ONCE(sibling->ctx != event->ctx);
1493        }
1494
1495out:
1496        perf_event__header_size(event->group_leader);
1497
1498        list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1499                perf_event__header_size(tmp);
1500}
1501
1502/*
1503 * User event without the task.
1504 */
1505static bool is_orphaned_event(struct perf_event *event)
1506{
1507        return event && !is_kernel_event(event) && !event->owner;
1508}
1509
1510/*
1511 * Event has a parent but parent's task finished and it's
1512 * alive only because of children holding refference.
1513 */
1514static bool is_orphaned_child(struct perf_event *event)
1515{
1516        return is_orphaned_event(event->parent);
1517}
1518
1519static void orphans_remove_work(struct work_struct *work);
1520
1521static void schedule_orphans_remove(struct perf_event_context *ctx)
1522{
1523        if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1524                return;
1525
1526        if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1527                get_ctx(ctx);
1528                ctx->orphans_remove_sched = true;
1529        }
1530}
1531
1532static int __init perf_workqueue_init(void)
1533{
1534        perf_wq = create_singlethread_workqueue("perf");
1535        WARN(!perf_wq, "failed to create perf workqueue\n");
1536        return perf_wq ? 0 : -1;
1537}
1538
1539core_initcall(perf_workqueue_init);
1540
1541static inline int pmu_filter_match(struct perf_event *event)
1542{
1543        struct pmu *pmu = event->pmu;
1544        return pmu->filter_match ? pmu->filter_match(event) : 1;
1545}
1546
1547static inline int
1548event_filter_match(struct perf_event *event)
1549{
1550        return (event->cpu == -1 || event->cpu == smp_processor_id())
1551            && perf_cgroup_match(event) && pmu_filter_match(event);
1552}
1553
1554static void
1555event_sched_out(struct perf_event *event,
1556                  struct perf_cpu_context *cpuctx,
1557                  struct perf_event_context *ctx)
1558{
1559        u64 tstamp = perf_event_time(event);
1560        u64 delta;
1561
1562        WARN_ON_ONCE(event->ctx != ctx);
1563        lockdep_assert_held(&ctx->lock);
1564
1565        /*
1566         * An event which could not be activated because of
1567         * filter mismatch still needs to have its timings
1568         * maintained, otherwise bogus information is return
1569         * via read() for time_enabled, time_running:
1570         */
1571        if (event->state == PERF_EVENT_STATE_INACTIVE
1572            && !event_filter_match(event)) {
1573                delta = tstamp - event->tstamp_stopped;
1574                event->tstamp_running += delta;
1575                event->tstamp_stopped = tstamp;
1576        }
1577
1578        if (event->state != PERF_EVENT_STATE_ACTIVE)
1579                return;
1580
1581        perf_pmu_disable(event->pmu);
1582
1583        event->state = PERF_EVENT_STATE_INACTIVE;
1584        if (event->pending_disable) {
1585                event->pending_disable = 0;
1586                event->state = PERF_EVENT_STATE_OFF;
1587        }
1588        event->tstamp_stopped = tstamp;
1589        event->pmu->del(event, 0);
1590        event->oncpu = -1;
1591
1592        if (!is_software_event(event))
1593                cpuctx->active_oncpu--;
1594        if (!--ctx->nr_active)
1595                perf_event_ctx_deactivate(ctx);
1596        if (event->attr.freq && event->attr.sample_freq)
1597                ctx->nr_freq--;
1598        if (event->attr.exclusive || !cpuctx->active_oncpu)
1599                cpuctx->exclusive = 0;
1600
1601        if (is_orphaned_child(event))
1602                schedule_orphans_remove(ctx);
1603
1604        perf_pmu_enable(event->pmu);
1605}
1606
1607static void
1608group_sched_out(struct perf_event *group_event,
1609                struct perf_cpu_context *cpuctx,
1610                struct perf_event_context *ctx)
1611{
1612        struct perf_event *event;
1613        int state = group_event->state;
1614
1615        event_sched_out(group_event, cpuctx, ctx);
1616
1617        /*
1618         * Schedule out siblings (if any):
1619         */
1620        list_for_each_entry(event, &group_event->sibling_list, group_entry)
1621                event_sched_out(event, cpuctx, ctx);
1622
1623        if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1624                cpuctx->exclusive = 0;
1625}
1626
1627struct remove_event {
1628        struct perf_event *event;
1629        bool detach_group;
1630};
1631
1632/*
1633 * Cross CPU call to remove a performance event
1634 *
1635 * We disable the event on the hardware level first. After that we
1636 * remove it from the context list.
1637 */
1638static int __perf_remove_from_context(void *info)
1639{
1640        struct remove_event *re = info;
1641        struct perf_event *event = re->event;
1642        struct perf_event_context *ctx = event->ctx;
1643        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1644
1645        raw_spin_lock(&ctx->lock);
1646        event_sched_out(event, cpuctx, ctx);
1647        if (re->detach_group)
1648                perf_group_detach(event);
1649        list_del_event(event, ctx);
1650        if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1651                ctx->is_active = 0;
1652                cpuctx->task_ctx = NULL;
1653        }
1654        raw_spin_unlock(&ctx->lock);
1655
1656        return 0;
1657}
1658
1659
1660/*
1661 * Remove the event from a task's (or a CPU's) list of events.
1662 *
1663 * CPU events are removed with a smp call. For task events we only
1664 * call when the task is on a CPU.
1665 *
1666 * If event->ctx is a cloned context, callers must make sure that
1667 * every task struct that event->ctx->task could possibly point to
1668 * remains valid.  This is OK when called from perf_release since
1669 * that only calls us on the top-level context, which can't be a clone.
1670 * When called from perf_event_exit_task, it's OK because the
1671 * context has been detached from its task.
1672 */
1673static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1674{
1675        struct perf_event_context *ctx = event->ctx;
1676        struct task_struct *task = ctx->task;
1677        struct remove_event re = {
1678                .event = event,
1679                .detach_group = detach_group,
1680        };
1681
1682        lockdep_assert_held(&ctx->mutex);
1683
1684        if (!task) {
1685                /*
1686                 * Per cpu events are removed via an smp call. The removal can
1687                 * fail if the CPU is currently offline, but in that case we
1688                 * already called __perf_remove_from_context from
1689                 * perf_event_exit_cpu.
1690                 */
1691                cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1692                return;
1693        }
1694
1695retry:
1696        if (!task_function_call(task, __perf_remove_from_context, &re))
1697                return;
1698
1699        raw_spin_lock_irq(&ctx->lock);
1700        /*
1701         * If we failed to find a running task, but find the context active now
1702         * that we've acquired the ctx->lock, retry.
1703         */
1704        if (ctx->is_active) {
1705                raw_spin_unlock_irq(&ctx->lock);
1706                /*
1707                 * Reload the task pointer, it might have been changed by
1708                 * a concurrent perf_event_context_sched_out().
1709                 */
1710                task = ctx->task;
1711                goto retry;
1712        }
1713
1714        /*
1715         * Since the task isn't running, its safe to remove the event, us
1716         * holding the ctx->lock ensures the task won't get scheduled in.
1717         */
1718        if (detach_group)
1719                perf_group_detach(event);
1720        list_del_event(event, ctx);
1721        raw_spin_unlock_irq(&ctx->lock);
1722}
1723
1724/*
1725 * Cross CPU call to disable a performance event
1726 */
1727int __perf_event_disable(void *info)
1728{
1729        struct perf_event *event = info;
1730        struct perf_event_context *ctx = event->ctx;
1731        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1732
1733        /*
1734         * If this is a per-task event, need to check whether this
1735         * event's task is the current task on this cpu.
1736         *
1737         * Can trigger due to concurrent perf_event_context_sched_out()
1738         * flipping contexts around.
1739         */
1740        if (ctx->task && cpuctx->task_ctx != ctx)
1741                return -EINVAL;
1742
1743        raw_spin_lock(&ctx->lock);
1744
1745        /*
1746         * If the event is on, turn it off.
1747         * If it is in error state, leave it in error state.
1748         */
1749        if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1750                update_context_time(ctx);
1751                update_cgrp_time_from_event(event);
1752                update_group_times(event);
1753                if (event == event->group_leader)
1754                        group_sched_out(event, cpuctx, ctx);
1755                else
1756                        event_sched_out(event, cpuctx, ctx);
1757                event->state = PERF_EVENT_STATE_OFF;
1758        }
1759
1760        raw_spin_unlock(&ctx->lock);
1761
1762        return 0;
1763}
1764
1765/*
1766 * Disable a event.
1767 *
1768 * If event->ctx is a cloned context, callers must make sure that
1769 * every task struct that event->ctx->task could possibly point to
1770 * remains valid.  This condition is satisifed when called through
1771 * perf_event_for_each_child or perf_event_for_each because they
1772 * hold the top-level event's child_mutex, so any descendant that
1773 * goes to exit will block in sync_child_event.
1774 * When called from perf_pending_event it's OK because event->ctx
1775 * is the current context on this CPU and preemption is disabled,
1776 * hence we can't get into perf_event_task_sched_out for this context.
1777 */
1778static void _perf_event_disable(struct perf_event *event)
1779{
1780        struct perf_event_context *ctx = event->ctx;
1781        struct task_struct *task = ctx->task;
1782
1783        if (!task) {
1784                /*
1785                 * Disable the event on the cpu that it's on
1786                 */
1787                cpu_function_call(event->cpu, __perf_event_disable, event);
1788                return;
1789        }
1790
1791retry:
1792        if (!task_function_call(task, __perf_event_disable, event))
1793                return;
1794
1795        raw_spin_lock_irq(&ctx->lock);
1796        /*
1797         * If the event is still active, we need to retry the cross-call.
1798         */
1799        if (event->state == PERF_EVENT_STATE_ACTIVE) {
1800                raw_spin_unlock_irq(&ctx->lock);
1801                /*
1802                 * Reload the task pointer, it might have been changed by
1803                 * a concurrent perf_event_context_sched_out().
1804                 */
1805                task = ctx->task;
1806                goto retry;
1807        }
1808
1809        /*
1810         * Since we have the lock this context can't be scheduled
1811         * in, so we can change the state safely.
1812         */
1813        if (event->state == PERF_EVENT_STATE_INACTIVE) {
1814                update_group_times(event);
1815                event->state = PERF_EVENT_STATE_OFF;
1816        }
1817        raw_spin_unlock_irq(&ctx->lock);
1818}
1819
1820/*
1821 * Strictly speaking kernel users cannot create groups and therefore this
1822 * interface does not need the perf_event_ctx_lock() magic.
1823 */
1824void perf_event_disable(struct perf_event *event)
1825{
1826        struct perf_event_context *ctx;
1827
1828        ctx = perf_event_ctx_lock(event);
1829        _perf_event_disable(event);
1830        perf_event_ctx_unlock(event, ctx);
1831}
1832EXPORT_SYMBOL_GPL(perf_event_disable);
1833
1834static void perf_set_shadow_time(struct perf_event *event,
1835                                 struct perf_event_context *ctx,
1836                                 u64 tstamp)
1837{
1838        /*
1839         * use the correct time source for the time snapshot
1840         *
1841         * We could get by without this by leveraging the
1842         * fact that to get to this function, the caller
1843         * has most likely already called update_context_time()
1844         * and update_cgrp_time_xx() and thus both timestamp
1845         * are identical (or very close). Given that tstamp is,
1846         * already adjusted for cgroup, we could say that:
1847         *    tstamp - ctx->timestamp
1848         * is equivalent to
1849         *    tstamp - cgrp->timestamp.
1850         *
1851         * Then, in perf_output_read(), the calculation would
1852         * work with no changes because:
1853         * - event is guaranteed scheduled in
1854         * - no scheduled out in between
1855         * - thus the timestamp would be the same
1856         *
1857         * But this is a bit hairy.
1858         *
1859         * So instead, we have an explicit cgroup call to remain
1860         * within the time time source all along. We believe it
1861         * is cleaner and simpler to understand.
1862         */
1863        if (is_cgroup_event(event))
1864                perf_cgroup_set_shadow_time(event, tstamp);
1865        else
1866                event->shadow_ctx_time = tstamp - ctx->timestamp;
1867}
1868
1869#define MAX_INTERRUPTS (~0ULL)
1870
1871static void perf_log_throttle(struct perf_event *event, int enable);
1872static void perf_log_itrace_start(struct perf_event *event);
1873
1874static int
1875event_sched_in(struct perf_event *event,
1876                 struct perf_cpu_context *cpuctx,
1877                 struct perf_event_context *ctx)
1878{
1879        u64 tstamp = perf_event_time(event);
1880        int ret = 0;
1881
1882        lockdep_assert_held(&ctx->lock);
1883
1884        if (event->state <= PERF_EVENT_STATE_OFF)
1885                return 0;
1886
1887        event->state = PERF_EVENT_STATE_ACTIVE;
1888        event->oncpu = smp_processor_id();
1889
1890        /*
1891         * Unthrottle events, since we scheduled we might have missed several
1892         * ticks already, also for a heavily scheduling task there is little
1893         * guarantee it'll get a tick in a timely manner.
1894         */
1895        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1896                perf_log_throttle(event, 1);
1897                event->hw.interrupts = 0;
1898        }
1899
1900        /*
1901         * The new state must be visible before we turn it on in the hardware:
1902         */
1903        smp_wmb();
1904
1905        perf_pmu_disable(event->pmu);
1906
1907        perf_set_shadow_time(event, ctx, tstamp);
1908
1909        perf_log_itrace_start(event);
1910
1911        if (event->pmu->add(event, PERF_EF_START)) {
1912                event->state = PERF_EVENT_STATE_INACTIVE;
1913                event->oncpu = -1;
1914                ret = -EAGAIN;
1915                goto out;
1916        }
1917
1918        event->tstamp_running += tstamp - event->tstamp_stopped;
1919
1920        if (!is_software_event(event))
1921                cpuctx->active_oncpu++;
1922        if (!ctx->nr_active++)
1923                perf_event_ctx_activate(ctx);
1924        if (event->attr.freq && event->attr.sample_freq)
1925                ctx->nr_freq++;
1926
1927        if (event->attr.exclusive)
1928                cpuctx->exclusive = 1;
1929
1930        if (is_orphaned_child(event))
1931                schedule_orphans_remove(ctx);
1932
1933out:
1934        perf_pmu_enable(event->pmu);
1935
1936        return ret;
1937}
1938
1939static int
1940group_sched_in(struct perf_event *group_event,
1941               struct perf_cpu_context *cpuctx,
1942               struct perf_event_context *ctx)
1943{
1944        struct perf_event *event, *partial_group = NULL;
1945        struct pmu *pmu = ctx->pmu;
1946        u64 now = ctx->time;
1947        bool simulate = false;
1948
1949        if (group_event->state == PERF_EVENT_STATE_OFF)
1950                return 0;
1951
1952        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
1953
1954        if (event_sched_in(group_event, cpuctx, ctx)) {
1955                pmu->cancel_txn(pmu);
1956                perf_mux_hrtimer_restart(cpuctx);
1957                return -EAGAIN;
1958        }
1959
1960        /*
1961         * Schedule in siblings as one group (if any):
1962         */
1963        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1964                if (event_sched_in(event, cpuctx, ctx)) {
1965                        partial_group = event;
1966                        goto group_error;
1967                }
1968        }
1969
1970        if (!pmu->commit_txn(pmu))
1971                return 0;
1972
1973group_error:
1974        /*
1975         * Groups can be scheduled in as one unit only, so undo any
1976         * partial group before returning:
1977         * The events up to the failed event are scheduled out normally,
1978         * tstamp_stopped will be updated.
1979         *
1980         * The failed events and the remaining siblings need to have
1981         * their timings updated as if they had gone thru event_sched_in()
1982         * and event_sched_out(). This is required to get consistent timings
1983         * across the group. This also takes care of the case where the group
1984         * could never be scheduled by ensuring tstamp_stopped is set to mark
1985         * the time the event was actually stopped, such that time delta
1986         * calculation in update_event_times() is correct.
1987         */
1988        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1989                if (event == partial_group)
1990                        simulate = true;
1991
1992                if (simulate) {
1993                        event->tstamp_running += now - event->tstamp_stopped;
1994                        event->tstamp_stopped = now;
1995                } else {
1996                        event_sched_out(event, cpuctx, ctx);
1997                }
1998        }
1999        event_sched_out(group_event, cpuctx, ctx);
2000
2001        pmu->cancel_txn(pmu);
2002
2003        perf_mux_hrtimer_restart(cpuctx);
2004
2005        return -EAGAIN;
2006}
2007
2008/*
2009 * Work out whether we can put this event group on the CPU now.
2010 */
2011static int group_can_go_on(struct perf_event *event,
2012                           struct perf_cpu_context *cpuctx,
2013                           int can_add_hw)
2014{
2015        /*
2016         * Groups consisting entirely of software events can always go on.
2017         */
2018        if (event->group_flags & PERF_GROUP_SOFTWARE)
2019                return 1;
2020        /*
2021         * If an exclusive group is already on, no other hardware
2022         * events can go on.
2023         */
2024        if (cpuctx->exclusive)
2025                return 0;
2026        /*
2027         * If this group is exclusive and there are already
2028         * events on the CPU, it can't go on.
2029         */
2030        if (event->attr.exclusive && cpuctx->active_oncpu)
2031                return 0;
2032        /*
2033         * Otherwise, try to add it if all previous groups were able
2034         * to go on.
2035         */
2036        return can_add_hw;
2037}
2038
2039static void add_event_to_ctx(struct perf_event *event,
2040                               struct perf_event_context *ctx)
2041{
2042        u64 tstamp = perf_event_time(event);
2043
2044        list_add_event(event, ctx);
2045        perf_group_attach(event);
2046        event->tstamp_enabled = tstamp;
2047        event->tstamp_running = tstamp;
2048        event->tstamp_stopped = tstamp;
2049}
2050
2051static void task_ctx_sched_out(struct perf_event_context *ctx);
2052static void
2053ctx_sched_in(struct perf_event_context *ctx,
2054             struct perf_cpu_context *cpuctx,
2055             enum event_type_t event_type,
2056             struct task_struct *task);
2057
2058static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2059                                struct perf_event_context *ctx,
2060                                struct task_struct *task)
2061{
2062        cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2063        if (ctx)
2064                ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2065        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2066        if (ctx)
2067                ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2068}
2069
2070/*
2071 * Cross CPU call to install and enable a performance event
2072 *
2073 * Must be called with ctx->mutex held
2074 */
2075static int  __perf_install_in_context(void *info)
2076{
2077        struct perf_event *event = info;
2078        struct perf_event_context *ctx = event->ctx;
2079        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2080        struct perf_event_context *task_ctx = cpuctx->task_ctx;
2081        struct task_struct *task = current;
2082
2083        perf_ctx_lock(cpuctx, task_ctx);
2084        perf_pmu_disable(cpuctx->ctx.pmu);
2085
2086        /*
2087         * If there was an active task_ctx schedule it out.
2088         */
2089        if (task_ctx)
2090                task_ctx_sched_out(task_ctx);
2091
2092        /*
2093         * If the context we're installing events in is not the
2094         * active task_ctx, flip them.
2095         */
2096        if (ctx->task && task_ctx != ctx) {
2097                if (task_ctx)
2098                        raw_spin_unlock(&task_ctx->lock);
2099                raw_spin_lock(&ctx->lock);
2100                task_ctx = ctx;
2101        }
2102
2103        if (task_ctx) {
2104                cpuctx->task_ctx = task_ctx;
2105                task = task_ctx->task;
2106        }
2107
2108        cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2109
2110        update_context_time(ctx);
2111        /*
2112         * update cgrp time only if current cgrp
2113         * matches event->cgrp. Must be done before
2114         * calling add_event_to_ctx()
2115         */
2116        update_cgrp_time_from_event(event);
2117
2118        add_event_to_ctx(event, ctx);
2119
2120        /*
2121         * Schedule everything back in
2122         */
2123        perf_event_sched_in(cpuctx, task_ctx, task);
2124
2125        perf_pmu_enable(cpuctx->ctx.pmu);
2126        perf_ctx_unlock(cpuctx, task_ctx);
2127
2128        return 0;
2129}
2130
2131/*
2132 * Attach a performance event to a context
2133 *
2134 * First we add the event to the list with the hardware enable bit
2135 * in event->hw_config cleared.
2136 *
2137 * If the event is attached to a task which is on a CPU we use a smp
2138 * call to enable it in the task context. The task might have been
2139 * scheduled away, but we check this in the smp call again.
2140 */
2141static void
2142perf_install_in_context(struct perf_event_context *ctx,
2143                        struct perf_event *event,
2144                        int cpu)
2145{
2146        struct task_struct *task = ctx->task;
2147
2148        lockdep_assert_held(&ctx->mutex);
2149
2150        event->ctx = ctx;
2151        if (event->cpu != -1)
2152                event->cpu = cpu;
2153
2154        if (!task) {
2155                /*
2156                 * Per cpu events are installed via an smp call and
2157                 * the install is always successful.
2158                 */
2159                cpu_function_call(cpu, __perf_install_in_context, event);
2160                return;
2161        }
2162
2163retry:
2164        if (!task_function_call(task, __perf_install_in_context, event))
2165                return;
2166
2167        raw_spin_lock_irq(&ctx->lock);
2168        /*
2169         * If we failed to find a running task, but find the context active now
2170         * that we've acquired the ctx->lock, retry.
2171         */
2172        if (ctx->is_active) {
2173                raw_spin_unlock_irq(&ctx->lock);
2174                /*
2175                 * Reload the task pointer, it might have been changed by
2176                 * a concurrent perf_event_context_sched_out().
2177                 */
2178                task = ctx->task;
2179                goto retry;
2180        }
2181
2182        /*
2183         * Since the task isn't running, its safe to add the event, us holding
2184         * the ctx->lock ensures the task won't get scheduled in.
2185         */
2186        add_event_to_ctx(event, ctx);
2187        raw_spin_unlock_irq(&ctx->lock);
2188}
2189
2190/*
2191 * Put a event into inactive state and update time fields.
2192 * Enabling the leader of a group effectively enables all
2193 * the group members that aren't explicitly disabled, so we
2194 * have to update their ->tstamp_enabled also.
2195 * Note: this works for group members as well as group leaders
2196 * since the non-leader members' sibling_lists will be empty.
2197 */
2198static void __perf_event_mark_enabled(struct perf_event *event)
2199{
2200        struct perf_event *sub;
2201        u64 tstamp = perf_event_time(event);
2202
2203        event->state = PERF_EVENT_STATE_INACTIVE;
2204        event->tstamp_enabled = tstamp - event->total_time_enabled;
2205        list_for_each_entry(sub, &event->sibling_list, group_entry) {
2206                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2207                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2208        }
2209}
2210
2211/*
2212 * Cross CPU call to enable a performance event
2213 */
2214static int __perf_event_enable(void *info)
2215{
2216        struct perf_event *event = info;
2217        struct perf_event_context *ctx = event->ctx;
2218        struct perf_event *leader = event->group_leader;
2219        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2220        int err;
2221
2222        /*
2223         * There's a time window between 'ctx->is_active' check
2224         * in perf_event_enable function and this place having:
2225         *   - IRQs on
2226         *   - ctx->lock unlocked
2227         *
2228         * where the task could be killed and 'ctx' deactivated
2229         * by perf_event_exit_task.
2230         */
2231        if (!ctx->is_active)
2232                return -EINVAL;
2233
2234        raw_spin_lock(&ctx->lock);
2235        update_context_time(ctx);
2236
2237        if (event->state >= PERF_EVENT_STATE_INACTIVE)
2238                goto unlock;
2239
2240        /*
2241         * set current task's cgroup time reference point
2242         */
2243        perf_cgroup_set_timestamp(current, ctx);
2244
2245        __perf_event_mark_enabled(event);
2246
2247        if (!event_filter_match(event)) {
2248                if (is_cgroup_event(event))
2249                        perf_cgroup_defer_enabled(event);
2250                goto unlock;
2251        }
2252
2253        /*
2254         * If the event is in a group and isn't the group leader,
2255         * then don't put it on unless the group is on.
2256         */
2257        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2258                goto unlock;
2259
2260        if (!group_can_go_on(event, cpuctx, 1)) {
2261                err = -EEXIST;
2262        } else {
2263                if (event == leader)
2264                        err = group_sched_in(event, cpuctx, ctx);
2265                else
2266                        err = event_sched_in(event, cpuctx, ctx);
2267        }
2268
2269        if (err) {
2270                /*
2271                 * If this event can't go on and it's part of a
2272                 * group, then the whole group has to come off.
2273                 */
2274                if (leader != event) {
2275                        group_sched_out(leader, cpuctx, ctx);
2276                        perf_mux_hrtimer_restart(cpuctx);
2277                }
2278                if (leader->attr.pinned) {
2279                        update_group_times(leader);
2280                        leader->state = PERF_EVENT_STATE_ERROR;
2281                }
2282        }
2283
2284unlock:
2285        raw_spin_unlock(&ctx->lock);
2286
2287        return 0;
2288}
2289
2290/*
2291 * Enable a event.
2292 *
2293 * If event->ctx is a cloned context, callers must make sure that
2294 * every task struct that event->ctx->task could possibly point to
2295 * remains valid.  This condition is satisfied when called through
2296 * perf_event_for_each_child or perf_event_for_each as described
2297 * for perf_event_disable.
2298 */
2299static void _perf_event_enable(struct perf_event *event)
2300{
2301        struct perf_event_context *ctx = event->ctx;
2302        struct task_struct *task = ctx->task;
2303
2304        if (!task) {
2305                /*
2306                 * Enable the event on the cpu that it's on
2307                 */
2308                cpu_function_call(event->cpu, __perf_event_enable, event);
2309                return;
2310        }
2311
2312        raw_spin_lock_irq(&ctx->lock);
2313        if (event->state >= PERF_EVENT_STATE_INACTIVE)
2314                goto out;
2315
2316        /*
2317         * If the event is in error state, clear that first.
2318         * That way, if we see the event in error state below, we
2319         * know that it has gone back into error state, as distinct
2320         * from the task having been scheduled away before the
2321         * cross-call arrived.
2322         */
2323        if (event->state == PERF_EVENT_STATE_ERROR)
2324                event->state = PERF_EVENT_STATE_OFF;
2325
2326retry:
2327        if (!ctx->is_active) {
2328                __perf_event_mark_enabled(event);
2329                goto out;
2330        }
2331
2332        raw_spin_unlock_irq(&ctx->lock);
2333
2334        if (!task_function_call(task, __perf_event_enable, event))
2335                return;
2336
2337        raw_spin_lock_irq(&ctx->lock);
2338
2339        /*
2340         * If the context is active and the event is still off,
2341         * we need to retry the cross-call.
2342         */
2343        if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2344                /*
2345                 * task could have been flipped by a concurrent
2346                 * perf_event_context_sched_out()
2347                 */
2348                task = ctx->task;
2349                goto retry;
2350        }
2351
2352out:
2353        raw_spin_unlock_irq(&ctx->lock);
2354}
2355
2356/*
2357 * See perf_event_disable();
2358 */
2359void perf_event_enable(struct perf_event *event)
2360{
2361        struct perf_event_context *ctx;
2362
2363        ctx = perf_event_ctx_lock(event);
2364        _perf_event_enable(event);
2365        perf_event_ctx_unlock(event, ctx);
2366}
2367EXPORT_SYMBOL_GPL(perf_event_enable);
2368
2369static int _perf_event_refresh(struct perf_event *event, int refresh)
2370{
2371        /*
2372         * not supported on inherited events
2373         */
2374        if (event->attr.inherit || !is_sampling_event(event))
2375                return -EINVAL;
2376
2377        atomic_add(refresh, &event->event_limit);
2378        _perf_event_enable(event);
2379
2380        return 0;
2381}
2382
2383/*
2384 * See perf_event_disable()
2385 */
2386int perf_event_refresh(struct perf_event *event, int refresh)
2387{
2388        struct perf_event_context *ctx;
2389        int ret;
2390
2391        ctx = perf_event_ctx_lock(event);
2392        ret = _perf_event_refresh(event, refresh);
2393        perf_event_ctx_unlock(event, ctx);
2394
2395        return ret;
2396}
2397EXPORT_SYMBOL_GPL(perf_event_refresh);
2398
2399static void ctx_sched_out(struct perf_event_context *ctx,
2400                          struct perf_cpu_context *cpuctx,
2401                          enum event_type_t event_type)
2402{
2403        struct perf_event *event;
2404        int is_active = ctx->is_active;
2405
2406        ctx->is_active &= ~event_type;
2407        if (likely(!ctx->nr_events))
2408                return;
2409
2410        update_context_time(ctx);
2411        update_cgrp_time_from_cpuctx(cpuctx);
2412        if (!ctx->nr_active)
2413                return;
2414
2415        perf_pmu_disable(ctx->pmu);
2416        if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2417                list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2418                        group_sched_out(event, cpuctx, ctx);
2419        }
2420
2421        if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2422                list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2423                        group_sched_out(event, cpuctx, ctx);
2424        }
2425        perf_pmu_enable(ctx->pmu);
2426}
2427
2428/*
2429 * Test whether two contexts are equivalent, i.e. whether they have both been
2430 * cloned from the same version of the same context.
2431 *
2432 * Equivalence is measured using a generation number in the context that is
2433 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2434 * and list_del_event().
2435 */
2436static int context_equiv(struct perf_event_context *ctx1,
2437                         struct perf_event_context *ctx2)
2438{
2439        lockdep_assert_held(&ctx1->lock);
2440        lockdep_assert_held(&ctx2->lock);
2441
2442        /* Pinning disables the swap optimization */
2443        if (ctx1->pin_count || ctx2->pin_count)
2444                return 0;
2445
2446        /* If ctx1 is the parent of ctx2 */
2447        if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2448                return 1;
2449
2450        /* If ctx2 is the parent of ctx1 */
2451        if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2452                return 1;
2453
2454        /*
2455         * If ctx1 and ctx2 have the same parent; we flatten the parent
2456         * hierarchy, see perf_event_init_context().
2457         */
2458        if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2459                        ctx1->parent_gen == ctx2->parent_gen)
2460                return 1;
2461
2462        /* Unmatched */
2463        return 0;
2464}
2465
2466static void __perf_event_sync_stat(struct perf_event *event,
2467                                     struct perf_event *next_event)
2468{
2469        u64 value;
2470
2471        if (!event->attr.inherit_stat)
2472                return;
2473
2474        /*
2475         * Update the event value, we cannot use perf_event_read()
2476         * because we're in the middle of a context switch and have IRQs
2477         * disabled, which upsets smp_call_function_single(), however
2478         * we know the event must be on the current CPU, therefore we
2479         * don't need to use it.
2480         */
2481        switch (event->state) {
2482        case PERF_EVENT_STATE_ACTIVE:
2483                event->pmu->read(event);
2484                /* fall-through */
2485
2486        case PERF_EVENT_STATE_INACTIVE:
2487                update_event_times(event);
2488                break;
2489
2490        default:
2491                break;
2492        }
2493
2494        /*
2495         * In order to keep per-task stats reliable we need to flip the event
2496         * values when we flip the contexts.
2497         */
2498        value = local64_read(&next_event->count);
2499        value = local64_xchg(&event->count, value);
2500        local64_set(&next_event->count, value);
2501
2502        swap(event->total_time_enabled, next_event->total_time_enabled);
2503        swap(event->total_time_running, next_event->total_time_running);
2504
2505        /*
2506         * Since we swizzled the values, update the user visible data too.
2507         */
2508        perf_event_update_userpage(event);
2509        perf_event_update_userpage(next_event);
2510}
2511
2512static void perf_event_sync_stat(struct perf_event_context *ctx,
2513                                   struct perf_event_context *next_ctx)
2514{
2515        struct perf_event *event, *next_event;
2516
2517        if (!ctx->nr_stat)
2518                return;
2519
2520        update_context_time(ctx);
2521
2522        event = list_first_entry(&ctx->event_list,
2523                                   struct perf_event, event_entry);
2524
2525        next_event = list_first_entry(&next_ctx->event_list,
2526                                        struct perf_event, event_entry);
2527
2528        while (&event->event_entry != &ctx->event_list &&
2529               &next_event->event_entry != &next_ctx->event_list) {
2530
2531                __perf_event_sync_stat(event, next_event);
2532
2533                event = list_next_entry(event, event_entry);
2534                next_event = list_next_entry(next_event, event_entry);
2535        }
2536}
2537
2538static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2539                                         struct task_struct *next)
2540{
2541        struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2542        struct perf_event_context *next_ctx;
2543        struct perf_event_context *parent, *next_parent;
2544        struct perf_cpu_context *cpuctx;
2545        int do_switch = 1;
2546
2547        if (likely(!ctx))
2548                return;
2549
2550        cpuctx = __get_cpu_context(ctx);
2551        if (!cpuctx->task_ctx)
2552                return;
2553
2554        rcu_read_lock();
2555        next_ctx = next->perf_event_ctxp[ctxn];
2556        if (!next_ctx)
2557                goto unlock;
2558
2559        parent = rcu_dereference(ctx->parent_ctx);
2560        next_parent = rcu_dereference(next_ctx->parent_ctx);
2561
2562        /* If neither context have a parent context; they cannot be clones. */
2563        if (!parent && !next_parent)
2564                goto unlock;
2565
2566        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2567                /*
2568                 * Looks like the two contexts are clones, so we might be
2569                 * able to optimize the context switch.  We lock both
2570                 * contexts and check that they are clones under the
2571                 * lock (including re-checking that neither has been
2572                 * uncloned in the meantime).  It doesn't matter which
2573                 * order we take the locks because no other cpu could
2574                 * be trying to lock both of these tasks.
2575                 */
2576                raw_spin_lock(&ctx->lock);
2577                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2578                if (context_equiv(ctx, next_ctx)) {
2579                        /*
2580                         * XXX do we need a memory barrier of sorts
2581                         * wrt to rcu_dereference() of perf_event_ctxp
2582                         */
2583                        task->perf_event_ctxp[ctxn] = next_ctx;
2584                        next->perf_event_ctxp[ctxn] = ctx;
2585                        ctx->task = next;
2586                        next_ctx->task = task;
2587
2588                        swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2589
2590                        do_switch = 0;
2591
2592                        perf_event_sync_stat(ctx, next_ctx);
2593                }
2594                raw_spin_unlock(&next_ctx->lock);
2595                raw_spin_unlock(&ctx->lock);
2596        }
2597unlock:
2598        rcu_read_unlock();
2599
2600        if (do_switch) {
2601                raw_spin_lock(&ctx->lock);
2602                ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2603                cpuctx->task_ctx = NULL;
2604                raw_spin_unlock(&ctx->lock);
2605        }
2606}
2607
2608void perf_sched_cb_dec(struct pmu *pmu)
2609{
2610        this_cpu_dec(perf_sched_cb_usages);
2611}
2612
2613void perf_sched_cb_inc(struct pmu *pmu)
2614{
2615        this_cpu_inc(perf_sched_cb_usages);
2616}
2617
2618/*
2619 * This function provides the context switch callback to the lower code
2620 * layer. It is invoked ONLY when the context switch callback is enabled.
2621 */
2622static void perf_pmu_sched_task(struct task_struct *prev,
2623                                struct task_struct *next,
2624                                bool sched_in)
2625{
2626        struct perf_cpu_context *cpuctx;
2627        struct pmu *pmu;
2628        unsigned long flags;
2629
2630        if (prev == next)
2631                return;
2632
2633        local_irq_save(flags);
2634
2635        rcu_read_lock();
2636
2637        list_for_each_entry_rcu(pmu, &pmus, entry) {
2638                if (pmu->sched_task) {
2639                        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2640
2641                        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2642
2643                        perf_pmu_disable(pmu);
2644
2645                        pmu->sched_task(cpuctx->task_ctx, sched_in);
2646
2647                        perf_pmu_enable(pmu);
2648
2649                        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2650                }
2651        }
2652
2653        rcu_read_unlock();
2654
2655        local_irq_restore(flags);
2656}
2657
2658static void perf_event_switch(struct task_struct *task,
2659                              struct task_struct *next_prev, bool sched_in);
2660
2661#define for_each_task_context_nr(ctxn)                                  \
2662        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2663
2664/*
2665 * Called from scheduler to remove the events of the current task,
2666 * with interrupts disabled.
2667 *
2668 * We stop each event and update the event value in event->count.
2669 *
2670 * This does not protect us against NMI, but disable()
2671 * sets the disabled bit in the control field of event _before_
2672 * accessing the event control register. If a NMI hits, then it will
2673 * not restart the event.
2674 */
2675void __perf_event_task_sched_out(struct task_struct *task,
2676                                 struct task_struct *next)
2677{
2678        int ctxn;
2679
2680        if (__this_cpu_read(perf_sched_cb_usages))
2681                perf_pmu_sched_task(task, next, false);
2682
2683        if (atomic_read(&nr_switch_events))
2684                perf_event_switch(task, next, false);
2685
2686        for_each_task_context_nr(ctxn)
2687                perf_event_context_sched_out(task, ctxn, next);
2688
2689        /*
2690         * if cgroup events exist on this CPU, then we need
2691         * to check if we have to switch out PMU state.
2692         * cgroup event are system-wide mode only
2693         */
2694        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2695                perf_cgroup_sched_out(task, next);
2696}
2697
2698static void task_ctx_sched_out(struct perf_event_context *ctx)
2699{
2700        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2701
2702        if (!cpuctx->task_ctx)
2703                return;
2704
2705        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2706                return;
2707
2708        ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2709        cpuctx->task_ctx = NULL;
2710}
2711
2712/*
2713 * Called with IRQs disabled
2714 */
2715static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2716                              enum event_type_t event_type)
2717{
2718        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2719}
2720
2721static void
2722ctx_pinned_sched_in(struct perf_event_context *ctx,
2723                    struct perf_cpu_context *cpuctx)
2724{
2725        struct perf_event *event;
2726
2727        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2728                if (event->state <= PERF_EVENT_STATE_OFF)
2729                        continue;
2730                if (!event_filter_match(event))
2731                        continue;
2732
2733                /* may need to reset tstamp_enabled */
2734                if (is_cgroup_event(event))
2735                        perf_cgroup_mark_enabled(event, ctx);
2736
2737                if (group_can_go_on(event, cpuctx, 1))
2738                        group_sched_in(event, cpuctx, ctx);
2739
2740                /*
2741                 * If this pinned group hasn't been scheduled,
2742                 * put it in error state.
2743                 */
2744                if (event->state == PERF_EVENT_STATE_INACTIVE) {
2745                        update_group_times(event);
2746                        event->state = PERF_EVENT_STATE_ERROR;
2747                }
2748        }
2749}
2750
2751static void
2752ctx_flexible_sched_in(struct perf_event_context *ctx,
2753                      struct perf_cpu_context *cpuctx)
2754{
2755        struct perf_event *event;
2756        int can_add_hw = 1;
2757
2758        list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2759                /* Ignore events in OFF or ERROR state */
2760                if (event->state <= PERF_EVENT_STATE_OFF)
2761                        continue;
2762                /*
2763                 * Listen to the 'cpu' scheduling filter constraint
2764                 * of events:
2765                 */
2766                if (!event_filter_match(event))
2767                        continue;
2768
2769                /* may need to reset tstamp_enabled */
2770                if (is_cgroup_event(event))
2771                        perf_cgroup_mark_enabled(event, ctx);
2772
2773                if (group_can_go_on(event, cpuctx, can_add_hw)) {
2774                        if (group_sched_in(event, cpuctx, ctx))
2775                                can_add_hw = 0;
2776                }
2777        }
2778}
2779
2780static void
2781ctx_sched_in(struct perf_event_context *ctx,
2782             struct perf_cpu_context *cpuctx,
2783             enum event_type_t event_type,
2784             struct task_struct *task)
2785{
2786        u64 now;
2787        int is_active = ctx->is_active;
2788
2789        ctx->is_active |= event_type;
2790        if (likely(!ctx->nr_events))
2791                return;
2792
2793        now = perf_clock();
2794        ctx->timestamp = now;
2795        perf_cgroup_set_timestamp(task, ctx);
2796        /*
2797         * First go through the list and put on any pinned groups
2798         * in order to give them the best chance of going on.
2799         */
2800        if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2801                ctx_pinned_sched_in(ctx, cpuctx);
2802
2803        /* Then walk through the lower prio flexible groups */
2804        if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2805                ctx_flexible_sched_in(ctx, cpuctx);
2806}
2807
2808static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2809                             enum event_type_t event_type,
2810                             struct task_struct *task)
2811{
2812        struct perf_event_context *ctx = &cpuctx->ctx;
2813
2814        ctx_sched_in(ctx, cpuctx, event_type, task);
2815}
2816
2817static void perf_event_context_sched_in(struct perf_event_context *ctx,
2818                                        struct task_struct *task)
2819{
2820        struct perf_cpu_context *cpuctx;
2821
2822        cpuctx = __get_cpu_context(ctx);
2823        if (cpuctx->task_ctx == ctx)
2824                return;
2825
2826        perf_ctx_lock(cpuctx, ctx);
2827        perf_pmu_disable(ctx->pmu);
2828        /*
2829         * We want to keep the following priority order:
2830         * cpu pinned (that don't need to move), task pinned,
2831         * cpu flexible, task flexible.
2832         */
2833        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2834
2835        if (ctx->nr_events)
2836                cpuctx->task_ctx = ctx;
2837
2838        perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2839
2840        perf_pmu_enable(ctx->pmu);
2841        perf_ctx_unlock(cpuctx, ctx);
2842}
2843
2844/*
2845 * Called from scheduler to add the events of the current task
2846 * with interrupts disabled.
2847 *
2848 * We restore the event value and then enable it.
2849 *
2850 * This does not protect us against NMI, but enable()
2851 * sets the enabled bit in the control field of event _before_
2852 * accessing the event control register. If a NMI hits, then it will
2853 * keep the event running.
2854 */
2855void __perf_event_task_sched_in(struct task_struct *prev,
2856                                struct task_struct *task)
2857{
2858        struct perf_event_context *ctx;
2859        int ctxn;
2860
2861        for_each_task_context_nr(ctxn) {
2862                ctx = task->perf_event_ctxp[ctxn];
2863                if (likely(!ctx))
2864                        continue;
2865
2866                perf_event_context_sched_in(ctx, task);
2867        }
2868        /*
2869         * if cgroup events exist on this CPU, then we need
2870         * to check if we have to switch in PMU state.
2871         * cgroup event are system-wide mode only
2872         */
2873        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2874                perf_cgroup_sched_in(prev, task);
2875
2876        if (atomic_read(&nr_switch_events))
2877                perf_event_switch(task, prev, true);
2878
2879        if (__this_cpu_read(perf_sched_cb_usages))
2880                perf_pmu_sched_task(prev, task, true);
2881}
2882
2883static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2884{
2885        u64 frequency = event->attr.sample_freq;
2886        u64 sec = NSEC_PER_SEC;
2887        u64 divisor, dividend;
2888
2889        int count_fls, nsec_fls, frequency_fls, sec_fls;
2890
2891        count_fls = fls64(count);
2892        nsec_fls = fls64(nsec);
2893        frequency_fls = fls64(frequency);
2894        sec_fls = 30;
2895
2896        /*
2897         * We got @count in @nsec, with a target of sample_freq HZ
2898         * the target period becomes:
2899         *
2900         *             @count * 10^9
2901         * period = -------------------
2902         *          @nsec * sample_freq
2903         *
2904         */
2905
2906        /*
2907         * Reduce accuracy by one bit such that @a and @b converge
2908         * to a similar magnitude.
2909         */
2910#define REDUCE_FLS(a, b)                \
2911do {                                    \
2912        if (a##_fls > b##_fls) {        \
2913                a >>= 1;                \
2914                a##_fls--;              \
2915        } else {                        \
2916                b >>= 1;                \
2917                b##_fls--;              \
2918        }                               \
2919} while (0)
2920
2921        /*
2922         * Reduce accuracy until either term fits in a u64, then proceed with
2923         * the other, so that finally we can do a u64/u64 division.
2924         */
2925        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2926                REDUCE_FLS(nsec, frequency);
2927                REDUCE_FLS(sec, count);
2928        }
2929
2930        if (count_fls + sec_fls > 64) {
2931                divisor = nsec * frequency;
2932
2933                while (count_fls + sec_fls > 64) {
2934                        REDUCE_FLS(count, sec);
2935                        divisor >>= 1;
2936                }
2937
2938                dividend = count * sec;
2939        } else {
2940                dividend = count * sec;
2941
2942                while (nsec_fls + frequency_fls > 64) {
2943                        REDUCE_FLS(nsec, frequency);
2944                        dividend >>= 1;
2945                }
2946
2947                divisor = nsec * frequency;
2948        }
2949
2950        if (!divisor)
2951                return dividend;
2952
2953        return div64_u64(dividend, divisor);
2954}
2955
2956static DEFINE_PER_CPU(int, perf_throttled_count);
2957static DEFINE_PER_CPU(u64, perf_throttled_seq);
2958
2959static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2960{
2961        struct hw_perf_event *hwc = &event->hw;
2962        s64 period, sample_period;
2963        s64 delta;
2964
2965        period = perf_calculate_period(event, nsec, count);
2966
2967        delta = (s64)(period - hwc->sample_period);
2968        delta = (delta + 7) / 8; /* low pass filter */
2969
2970        sample_period = hwc->sample_period + delta;
2971
2972        if (!sample_period)
2973                sample_period = 1;
2974
2975        hwc->sample_period = sample_period;
2976
2977        if (local64_read(&hwc->period_left) > 8*sample_period) {
2978                if (disable)
2979                        event->pmu->stop(event, PERF_EF_UPDATE);
2980
2981                local64_set(&hwc->period_left, 0);
2982
2983                if (disable)
2984                        event->pmu->start(event, PERF_EF_RELOAD);
2985        }
2986}
2987
2988/*
2989 * combine freq adjustment with unthrottling to avoid two passes over the
2990 * events. At the same time, make sure, having freq events does not change
2991 * the rate of unthrottling as that would introduce bias.
2992 */
2993static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2994                                           int needs_unthr)
2995{
2996        struct perf_event *event;
2997        struct hw_perf_event *hwc;
2998        u64 now, period = TICK_NSEC;
2999        s64 delta;
3000
3001        /*
3002         * only need to iterate over all events iff:
3003         * - context have events in frequency mode (needs freq adjust)
3004         * - there are events to unthrottle on this cpu
3005         */
3006        if (!(ctx->nr_freq || needs_unthr))
3007                return;
3008
3009        raw_spin_lock(&ctx->lock);
3010        perf_pmu_disable(ctx->pmu);
3011
3012        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3013                if (event->state != PERF_EVENT_STATE_ACTIVE)
3014                        continue;
3015
3016                if (!event_filter_match(event))
3017                        continue;
3018
3019                perf_pmu_disable(event->pmu);
3020
3021                hwc = &event->hw;
3022
3023                if (hwc->interrupts == MAX_INTERRUPTS) {
3024                        hwc->interrupts = 0;
3025                        perf_log_throttle(event, 1);
3026                        event->pmu->start(event, 0);
3027                }
3028
3029                if (!event->attr.freq || !event->attr.sample_freq)
3030                        goto next;
3031
3032                /*
3033                 * stop the event and update event->count
3034                 */
3035                event->pmu->stop(event, PERF_EF_UPDATE);
3036
3037                now = local64_read(&event->count);
3038                delta = now - hwc->freq_count_stamp;
3039                hwc->freq_count_stamp = now;
3040
3041                /*
3042                 * restart the event
3043                 * reload only if value has changed
3044                 * we have stopped the event so tell that
3045                 * to perf_adjust_period() to avoid stopping it
3046                 * twice.
3047                 */
3048                if (delta > 0)
3049                        perf_adjust_period(event, period, delta, false);
3050
3051                event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3052        next:
3053                perf_pmu_enable(event->pmu);
3054        }
3055
3056        perf_pmu_enable(ctx->pmu);
3057        raw_spin_unlock(&ctx->lock);
3058}
3059
3060/*
3061 * Round-robin a context's events:
3062 */
3063static void rotate_ctx(struct perf_event_context *ctx)
3064{
3065        /*
3066         * Rotate the first entry last of non-pinned groups. Rotation might be
3067         * disabled by the inheritance code.
3068         */
3069        if (!ctx->rotate_disable)
3070                list_rotate_left(&ctx->flexible_groups);
3071}
3072
3073static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3074{
3075        struct perf_event_context *ctx = NULL;
3076        int rotate = 0;
3077
3078        if (cpuctx->ctx.nr_events) {
3079                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3080                        rotate = 1;
3081        }
3082
3083        ctx = cpuctx->task_ctx;
3084        if (ctx && ctx->nr_events) {
3085                if (ctx->nr_events != ctx->nr_active)
3086                        rotate = 1;
3087        }
3088
3089        if (!rotate)
3090                goto done;
3091
3092        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3093        perf_pmu_disable(cpuctx->ctx.pmu);
3094
3095        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3096        if (ctx)
3097                ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3098
3099        rotate_ctx(&cpuctx->ctx);
3100        if (ctx)
3101                rotate_ctx(ctx);
3102
3103        perf_event_sched_in(cpuctx, ctx, current);
3104
3105        perf_pmu_enable(cpuctx->ctx.pmu);
3106        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3107done:
3108
3109        return rotate;
3110}
3111
3112#ifdef CONFIG_NO_HZ_FULL
3113bool perf_event_can_stop_tick(void)
3114{
3115        if (atomic_read(&nr_freq_events) ||
3116            __this_cpu_read(perf_throttled_count))
3117                return false;
3118        else
3119                return true;
3120}
3121#endif
3122
3123void perf_event_task_tick(void)
3124{
3125        struct list_head *head = this_cpu_ptr(&active_ctx_list);
3126        struct perf_event_context *ctx, *tmp;
3127        int throttled;
3128
3129        WARN_ON(!irqs_disabled());
3130
3131        __this_cpu_inc(perf_throttled_seq);
3132        throttled = __this_cpu_xchg(perf_throttled_count, 0);
3133
3134        list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3135                perf_adjust_freq_unthr_context(ctx, throttled);
3136}
3137
3138static int event_enable_on_exec(struct perf_event *event,
3139                                struct perf_event_context *ctx)
3140{
3141        if (!event->attr.enable_on_exec)
3142                return 0;
3143
3144        event->attr.enable_on_exec = 0;
3145        if (event->state >= PERF_EVENT_STATE_INACTIVE)
3146                return 0;
3147
3148        __perf_event_mark_enabled(event);
3149
3150        return 1;
3151}
3152
3153/*
3154 * Enable all of a task's events that have been marked enable-on-exec.
3155 * This expects task == current.
3156 */
3157static void perf_event_enable_on_exec(int ctxn)
3158{
3159        struct perf_event_context *ctx, *clone_ctx = NULL;
3160        struct perf_event *event;
3161        unsigned long flags;
3162        int enabled = 0;
3163        int ret;
3164
3165        local_irq_save(flags);
3166        ctx = current->perf_event_ctxp[ctxn];
3167        if (!ctx || !ctx->nr_events)
3168                goto out;
3169
3170        /*
3171         * We must ctxsw out cgroup events to avoid conflict
3172         * when invoking perf_task_event_sched_in() later on
3173         * in this function. Otherwise we end up trying to
3174         * ctxswin cgroup events which are already scheduled
3175         * in.
3176         */
3177        perf_cgroup_sched_out(current, NULL);
3178
3179        raw_spin_lock(&ctx->lock);
3180        task_ctx_sched_out(ctx);
3181
3182        list_for_each_entry(event, &ctx->event_list, event_entry) {
3183                ret = event_enable_on_exec(event, ctx);
3184                if (ret)
3185                        enabled = 1;
3186        }
3187
3188        /*
3189         * Unclone this context if we enabled any event.
3190         */
3191        if (enabled)
3192                clone_ctx = unclone_ctx(ctx);
3193
3194        raw_spin_unlock(&ctx->lock);
3195
3196        /*
3197         * Also calls ctxswin for cgroup events, if any:
3198         */
3199        perf_event_context_sched_in(ctx, ctx->task);
3200out:
3201        local_irq_restore(flags);
3202
3203        if (clone_ctx)
3204                put_ctx(clone_ctx);
3205}
3206
3207void perf_event_exec(void)
3208{
3209        int ctxn;
3210
3211        rcu_read_lock();
3212        for_each_task_context_nr(ctxn)
3213                perf_event_enable_on_exec(ctxn);
3214        rcu_read_unlock();
3215}
3216
3217struct perf_read_data {
3218        struct perf_event *event;
3219        bool group;
3220        int ret;
3221};
3222
3223/*
3224 * Cross CPU call to read the hardware event
3225 */
3226static void __perf_event_read(void *info)
3227{
3228        struct perf_read_data *data = info;
3229        struct perf_event *sub, *event = data->event;
3230        struct perf_event_context *ctx = event->ctx;
3231        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3232        struct pmu *pmu = event->pmu;
3233
3234        /*
3235         * If this is a task context, we need to check whether it is
3236         * the current task context of this cpu.  If not it has been
3237         * scheduled out before the smp call arrived.  In that case
3238         * event->count would have been updated to a recent sample
3239         * when the event was scheduled out.
3240         */
3241        if (ctx->task && cpuctx->task_ctx != ctx)
3242                return;
3243
3244        raw_spin_lock(&ctx->lock);
3245        if (ctx->is_active) {
3246                update_context_time(ctx);
3247                update_cgrp_time_from_event(event);
3248        }
3249
3250        update_event_times(event);
3251        if (event->state != PERF_EVENT_STATE_ACTIVE)
3252                goto unlock;
3253
3254        if (!data->group) {
3255                pmu->read(event);
3256                data->ret = 0;
3257                goto unlock;
3258        }
3259
3260        pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3261
3262        pmu->read(event);
3263
3264        list_for_each_entry(sub, &event->sibling_list, group_entry) {
3265                update_event_times(sub);
3266                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3267                        /*
3268                         * Use sibling's PMU rather than @event's since
3269                         * sibling could be on different (eg: software) PMU.
3270                         */
3271                        sub->pmu->read(sub);
3272                }
3273        }
3274
3275        data->ret = pmu->commit_txn(pmu);
3276
3277unlock:
3278        raw_spin_unlock(&ctx->lock);
3279}
3280
3281static inline u64 perf_event_count(struct perf_event *event)
3282{
3283        if (event->pmu->count)
3284                return event->pmu->count(event);
3285
3286        return __perf_event_count(event);
3287}
3288
3289/*
3290 * NMI-safe method to read a local event, that is an event that
3291 * is:
3292 *   - either for the current task, or for this CPU
3293 *   - does not have inherit set, for inherited task events
3294 *     will not be local and we cannot read them atomically
3295 *   - must not have a pmu::count method
3296 */
3297u64 perf_event_read_local(struct perf_event *event)
3298{
3299        unsigned long flags;
3300        u64 val;
3301
3302        /*
3303         * Disabling interrupts avoids all counter scheduling (context
3304         * switches, timer based rotation and IPIs).
3305         */
3306        local_irq_save(flags);
3307
3308        /* If this is a per-task event, it must be for current */
3309        WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3310                     event->hw.target != current);
3311
3312        /* If this is a per-CPU event, it must be for this CPU */
3313        WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3314                     event->cpu != smp_processor_id());
3315
3316        /*
3317         * It must not be an event with inherit set, we cannot read
3318         * all child counters from atomic context.
3319         */
3320        WARN_ON_ONCE(event->attr.inherit);
3321
3322        /*
3323         * It must not have a pmu::count method, those are not
3324         * NMI safe.
3325         */
3326        WARN_ON_ONCE(event->pmu->count);
3327
3328        /*
3329         * If the event is currently on this CPU, its either a per-task event,
3330         * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3331         * oncpu == -1).
3332         */
3333        if (event->oncpu == smp_processor_id())
3334                event->pmu->read(event);
3335
3336        val = local64_read(&event->count);
3337        local_irq_restore(flags);
3338
3339        return val;
3340}
3341
3342static int perf_event_read(struct perf_event *event, bool group)
3343{
3344        int ret = 0;
3345
3346        /*
3347         * If event is enabled and currently active on a CPU, update the
3348         * value in the event structure:
3349         */
3350        if (event->state == PERF_EVENT_STATE_ACTIVE) {
3351                struct perf_read_data data = {
3352                        .event = event,
3353                        .group = group,
3354                        .ret = 0,
3355                };
3356                smp_call_function_single(event->oncpu,
3357                                         __perf_event_read, &data, 1);
3358                ret = data.ret;
3359        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3360                struct perf_event_context *ctx = event->ctx;
3361                unsigned long flags;
3362
3363                raw_spin_lock_irqsave(&ctx->lock, flags);
3364                /*
3365                 * may read while context is not active
3366                 * (e.g., thread is blocked), in that case
3367                 * we cannot update context time
3368                 */
3369                if (ctx->is_active) {
3370                        update_context_time(ctx);
3371                        update_cgrp_time_from_event(event);
3372                }
3373                if (group)
3374                        update_group_times(event);
3375                else
3376                        update_event_times(event);
3377                raw_spin_unlock_irqrestore(&ctx->lock, flags);
3378        }
3379
3380        return ret;
3381}
3382
3383/*
3384 * Initialize the perf_event context in a task_struct:
3385 */
3386static void __perf_event_init_context(struct perf_event_context *ctx)
3387{
3388        raw_spin_lock_init(&ctx->lock);
3389        mutex_init(&ctx->mutex);
3390        INIT_LIST_HEAD(&ctx->active_ctx_list);
3391        INIT_LIST_HEAD(&ctx->pinned_groups);
3392        INIT_LIST_HEAD(&ctx->flexible_groups);
3393        INIT_LIST_HEAD(&ctx->event_list);
3394        atomic_set(&ctx->refcount, 1);
3395        INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3396}
3397
3398static struct perf_event_context *
3399alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3400{
3401        struct perf_event_context *ctx;
3402
3403        ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3404        if (!ctx)
3405                return NULL;
3406
3407        __perf_event_init_context(ctx);
3408        if (task) {
3409                ctx->task = task;
3410                get_task_struct(task);
3411        }
3412        ctx->pmu = pmu;
3413
3414        return ctx;
3415}
3416
3417static struct task_struct *
3418find_lively_task_by_vpid(pid_t vpid)
3419{
3420        struct task_struct *task;
3421        int err;
3422
3423        rcu_read_lock();
3424        if (!vpid)
3425                task = current;
3426        else
3427                task = find_task_by_vpid(vpid);
3428        if (task)
3429                get_task_struct(task);
3430        rcu_read_unlock();
3431
3432        if (!task)
3433                return ERR_PTR(-ESRCH);
3434
3435        /* Reuse ptrace permission checks for now. */
3436        err = -EACCES;
3437        if (!ptrace_may_access(task, PTRACE_MODE_READ))
3438                goto errout;
3439
3440        return task;
3441errout:
3442        put_task_struct(task);
3443        return ERR_PTR(err);
3444
3445}
3446
3447/*
3448 * Returns a matching context with refcount and pincount.
3449 */
3450static struct perf_event_context *
3451find_get_context(struct pmu *pmu, struct task_struct *task,
3452                struct perf_event *event)
3453{
3454        struct perf_event_context *ctx, *clone_ctx = NULL;
3455        struct perf_cpu_context *cpuctx;
3456        void *task_ctx_data = NULL;
3457        unsigned long flags;
3458        int ctxn, err;
3459        int cpu = event->cpu;
3460
3461        if (!task) {
3462                /* Must be root to operate on a CPU event: */
3463                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3464                        return ERR_PTR(-EACCES);
3465
3466                /*
3467                 * We could be clever and allow to attach a event to an
3468                 * offline CPU and activate it when the CPU comes up, but
3469                 * that's for later.
3470                 */
3471                if (!cpu_online(cpu))
3472                        return ERR_PTR(-ENODEV);
3473
3474                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3475                ctx = &cpuctx->ctx;
3476                get_ctx(ctx);
3477                ++ctx->pin_count;
3478
3479                return ctx;
3480        }
3481
3482        err = -EINVAL;
3483        ctxn = pmu->task_ctx_nr;
3484        if (ctxn < 0)
3485                goto errout;
3486
3487        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3488                task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3489                if (!task_ctx_data) {
3490                        err = -ENOMEM;
3491                        goto errout;
3492                }
3493        }
3494
3495retry:
3496        ctx = perf_lock_task_context(task, ctxn, &flags);
3497        if (ctx) {
3498                clone_ctx = unclone_ctx(ctx);
3499                ++ctx->pin_count;
3500
3501                if (task_ctx_data && !ctx->task_ctx_data) {
3502                        ctx->task_ctx_data = task_ctx_data;
3503                        task_ctx_data = NULL;
3504                }
3505                raw_spin_unlock_irqrestore(&ctx->lock, flags);
3506
3507                if (clone_ctx)
3508                        put_ctx(clone_ctx);
3509        } else {
3510                ctx = alloc_perf_context(pmu, task);
3511                err = -ENOMEM;
3512                if (!ctx)
3513                        goto errout;
3514
3515                if (task_ctx_data) {
3516                        ctx->task_ctx_data = task_ctx_data;
3517                        task_ctx_data = NULL;
3518                }
3519
3520                err = 0;
3521                mutex_lock(&task->perf_event_mutex);
3522                /*
3523                 * If it has already passed perf_event_exit_task().
3524                 * we must see PF_EXITING, it takes this mutex too.
3525                 */
3526                if (task->flags & PF_EXITING)
3527                        err = -ESRCH;
3528                else if (task->perf_event_ctxp[ctxn])
3529                        err = -EAGAIN;
3530                else {
3531                        get_ctx(ctx);
3532                        ++ctx->pin_count;
3533                        rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3534                }
3535                mutex_unlock(&task->perf_event_mutex);
3536
3537                if (unlikely(err)) {
3538                        put_ctx(ctx);
3539
3540                        if (err == -EAGAIN)
3541                                goto retry;
3542                        goto errout;
3543                }
3544        }
3545
3546        kfree(task_ctx_data);
3547        return ctx;
3548
3549errout:
3550        kfree(task_ctx_data);
3551        return ERR_PTR(err);
3552}
3553
3554static void perf_event_free_filter(struct perf_event *event);
3555static void perf_event_free_bpf_prog(struct perf_event *event);
3556
3557static void free_event_rcu(struct rcu_head *head)
3558{
3559        struct perf_event *event;
3560
3561        event = container_of(head, struct perf_event, rcu_head);
3562        if (event->ns)
3563                put_pid_ns(event->ns);
3564        perf_event_free_filter(event);
3565        kfree(event);
3566}
3567
3568static void ring_buffer_attach(struct perf_event *event,
3569                               struct ring_buffer *rb);
3570
3571static void unaccount_event_cpu(struct perf_event *event, int cpu)
3572{
3573        if (event->parent)
3574                return;
3575
3576        if (is_cgroup_event(event))
3577                atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3578}
3579
3580static void unaccount_event(struct perf_event *event)
3581{
3582        if (event->parent)
3583                return;
3584
3585        if (event->attach_state & PERF_ATTACH_TASK)
3586                static_key_slow_dec_deferred(&perf_sched_events);
3587        if (event->attr.mmap || event->attr.mmap_data)
3588                atomic_dec(&nr_mmap_events);
3589        if (event->attr.comm)
3590                atomic_dec(&nr_comm_events);
3591        if (event->attr.task)
3592                atomic_dec(&nr_task_events);
3593        if (event->attr.freq)
3594                atomic_dec(&nr_freq_events);
3595        if (event->attr.context_switch) {
3596                static_key_slow_dec_deferred(&perf_sched_events);
3597                atomic_dec(&nr_switch_events);
3598        }
3599        if (is_cgroup_event(event))
3600                static_key_slow_dec_deferred(&perf_sched_events);
3601        if (has_branch_stack(event))
3602                static_key_slow_dec_deferred(&perf_sched_events);
3603
3604        unaccount_event_cpu(event, event->cpu);
3605}
3606
3607/*
3608 * The following implement mutual exclusion of events on "exclusive" pmus
3609 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3610 * at a time, so we disallow creating events that might conflict, namely:
3611 *
3612 *  1) cpu-wide events in the presence of per-task events,
3613 *  2) per-task events in the presence of cpu-wide events,
3614 *  3) two matching events on the same context.
3615 *
3616 * The former two cases are handled in the allocation path (perf_event_alloc(),
3617 * __free_event()), the latter -- before the first perf_install_in_context().
3618 */
3619static int exclusive_event_init(struct perf_event *event)
3620{
3621        struct pmu *pmu = event->pmu;
3622
3623        if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3624                return 0;
3625
3626        /*
3627         * Prevent co-existence of per-task and cpu-wide events on the
3628         * same exclusive pmu.
3629         *
3630         * Negative pmu::exclusive_cnt means there are cpu-wide
3631         * events on this "exclusive" pmu, positive means there are
3632         * per-task events.
3633         *
3634         * Since this is called in perf_event_alloc() path, event::ctx
3635         * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3636         * to mean "per-task event", because unlike other attach states it
3637         * never gets cleared.
3638         */
3639        if (event->attach_state & PERF_ATTACH_TASK) {
3640                if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3641                        return -EBUSY;
3642        } else {
3643                if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3644                        return -EBUSY;
3645        }
3646
3647        return 0;
3648}
3649
3650static void exclusive_event_destroy(struct perf_event *event)
3651{
3652        struct pmu *pmu = event->pmu;
3653
3654        if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3655                return;
3656
3657        /* see comment in exclusive_event_init() */
3658        if (event->attach_state & PERF_ATTACH_TASK)
3659                atomic_dec(&pmu->exclusive_cnt);
3660        else
3661                atomic_inc(&pmu->exclusive_cnt);
3662}
3663
3664static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3665{
3666        if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3667            (e1->cpu == e2->cpu ||
3668             e1->cpu == -1 ||
3669             e2->cpu == -1))
3670                return true;
3671        return false;
3672}
3673
3674/* Called under the same ctx::mutex as perf_install_in_context() */
3675static bool exclusive_event_installable(struct perf_event *event,
3676                                        struct perf_event_context *ctx)
3677{
3678        struct perf_event *iter_event;
3679        struct pmu *pmu = event->pmu;
3680
3681        if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3682                return true;
3683
3684        list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3685                if (exclusive_event_match(iter_event, event))
3686                        return false;
3687        }
3688
3689        return true;
3690}
3691
3692static void __free_event(struct perf_event *event)
3693{
3694        if (!event->parent) {
3695                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3696                        put_callchain_buffers();
3697        }
3698
3699        perf_event_free_bpf_prog(event);
3700
3701        if (event->destroy)
3702                event->destroy(event);
3703
3704        if (event->ctx)
3705                put_ctx(event->ctx);
3706
3707        if (event->pmu) {
3708                exclusive_event_destroy(event);
3709                module_put(event->pmu->module);
3710        }
3711
3712        call_rcu(&event->rcu_head, free_event_rcu);
3713}
3714
3715static void _free_event(struct perf_event *event)
3716{
3717        irq_work_sync(&event->pending);
3718
3719        unaccount_event(event);
3720
3721        if (event->rb) {
3722                /*
3723                 * Can happen when we close an event with re-directed output.
3724                 *
3725                 * Since we have a 0 refcount, perf_mmap_close() will skip
3726                 * over us; possibly making our ring_buffer_put() the last.
3727                 */
3728                mutex_lock(&event->mmap_mutex);
3729                ring_buffer_attach(event, NULL);
3730                mutex_unlock(&event->mmap_mutex);
3731        }
3732
3733        if (is_cgroup_event(event))
3734                perf_detach_cgroup(event);
3735
3736        __free_event(event);
3737}
3738
3739/*
3740 * Used to free events which have a known refcount of 1, such as in error paths
3741 * where the event isn't exposed yet and inherited events.
3742 */
3743static void free_event(struct perf_event *event)
3744{
3745        if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3746                                "unexpected event refcount: %ld; ptr=%p\n",
3747                                atomic_long_read(&event->refcount), event)) {
3748                /* leak to avoid use-after-free */
3749                return;
3750        }
3751
3752        _free_event(event);
3753}
3754
3755/*
3756 * Remove user event from the owner task.
3757 */
3758static void perf_remove_from_owner(struct perf_event *event)
3759{
3760        struct task_struct *owner;
3761
3762        rcu_read_lock();
3763        owner = ACCESS_ONCE(event->owner);
3764        /*
3765         * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3766         * !owner it means the list deletion is complete and we can indeed
3767         * free this event, otherwise we need to serialize on
3768         * owner->perf_event_mutex.
3769         */
3770        smp_read_barrier_depends();
3771        if (owner) {
3772                /*
3773                 * Since delayed_put_task_struct() also drops the last
3774                 * task reference we can safely take a new reference
3775                 * while holding the rcu_read_lock().
3776                 */
3777                get_task_struct(owner);
3778        }
3779        rcu_read_unlock();
3780
3781        if (owner) {
3782                /*
3783                 * If we're here through perf_event_exit_task() we're already
3784                 * holding ctx->mutex which would be an inversion wrt. the
3785                 * normal lock order.
3786                 *
3787                 * However we can safely take this lock because its the child
3788                 * ctx->mutex.
3789                 */
3790                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3791
3792                /*
3793                 * We have to re-check the event->owner field, if it is cleared
3794                 * we raced with perf_event_exit_task(), acquiring the mutex
3795                 * ensured they're done, and we can proceed with freeing the
3796                 * event.
3797                 */
3798                if (event->owner)
3799                        list_del_init(&event->owner_entry);
3800                mutex_unlock(&owner->perf_event_mutex);
3801                put_task_struct(owner);
3802        }
3803}
3804
3805static void put_event(struct perf_event *event)
3806{
3807        struct perf_event_context *ctx;
3808
3809        if (!atomic_long_dec_and_test(&event->refcount))
3810                return;
3811
3812        if (!is_kernel_event(event))
3813                perf_remove_from_owner(event);
3814
3815        /*
3816         * There are two ways this annotation is useful:
3817         *
3818         *  1) there is a lock recursion from perf_event_exit_task
3819         *     see the comment there.
3820         *
3821         *  2) there is a lock-inversion with mmap_sem through
3822         *     perf_read_group(), which takes faults while
3823         *     holding ctx->mutex, however this is called after
3824         *     the last filedesc died, so there is no possibility
3825         *     to trigger the AB-BA case.
3826         */
3827        ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3828        WARN_ON_ONCE(ctx->parent_ctx);
3829        perf_remove_from_context(event, true);
3830        perf_event_ctx_unlock(event, ctx);
3831
3832        _free_event(event);
3833}
3834
3835int perf_event_release_kernel(struct perf_event *event)
3836{
3837        put_event(event);
3838        return 0;
3839}
3840EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3841
3842/*
3843 * Called when the last reference to the file is gone.
3844 */
3845static int perf_release(struct inode *inode, struct file *file)
3846{
3847        put_event(file->private_data);
3848        return 0;
3849}
3850
3851/*
3852 * Remove all orphanes events from the context.
3853 */
3854static void orphans_remove_work(struct work_struct *work)
3855{
3856        struct perf_event_context *ctx;
3857        struct perf_event *event, *tmp;
3858
3859        ctx = container_of(work, struct perf_event_context,
3860                           orphans_remove.work);
3861
3862        mutex_lock(&ctx->mutex);
3863        list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3864                struct perf_event *parent_event = event->parent;
3865
3866                if (!is_orphaned_child(event))
3867                        continue;
3868
3869                perf_remove_from_context(event, true);
3870
3871                mutex_lock(&parent_event->child_mutex);
3872                list_del_init(&event->child_list);
3873                mutex_unlock(&parent_event->child_mutex);
3874
3875                free_event(event);
3876                put_event(parent_event);
3877        }
3878
3879        raw_spin_lock_irq(&ctx->lock);
3880        ctx->orphans_remove_sched = false;
3881        raw_spin_unlock_irq(&ctx->lock);
3882        mutex_unlock(&ctx->mutex);
3883
3884        put_ctx(ctx);
3885}
3886
3887u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3888{
3889        struct perf_event *child;
3890        u64 total = 0;
3891
3892        *enabled = 0;
3893        *running = 0;
3894
3895        mutex_lock(&event->child_mutex);
3896
3897        (void)perf_event_read(event, false);
3898        total += perf_event_count(event);
3899
3900        *enabled += event->total_time_enabled +
3901                        atomic64_read(&event->child_total_time_enabled);
3902        *running += event->total_time_running +
3903                        atomic64_read(&event->child_total_time_running);
3904
3905        list_for_each_entry(child, &event->child_list, child_list) {
3906                (void)perf_event_read(child, false);
3907                total += perf_event_count(child);
3908                *enabled += child->total_time_enabled;
3909                *running += child->total_time_running;
3910        }
3911        mutex_unlock(&event->child_mutex);
3912
3913        return total;
3914}
3915EXPORT_SYMBOL_GPL(perf_event_read_value);
3916
3917static int __perf_read_group_add(struct perf_event *leader,
3918                                        u64 read_format, u64 *values)
3919{
3920        struct perf_event *sub;
3921        int n = 1; /* skip @nr */
3922        int ret;
3923
3924        ret = perf_event_read(leader, true);
3925        if (ret)
3926                return ret;
3927
3928        /*
3929         * Since we co-schedule groups, {enabled,running} times of siblings
3930         * will be identical to those of the leader, so we only publish one
3931         * set.
3932         */
3933        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3934                values[n++] += leader->total_time_enabled +
3935                        atomic64_read(&leader->child_total_time_enabled);
3936        }
3937
3938        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3939                values[n++] += leader->total_time_running +
3940                        atomic64_read(&leader->child_total_time_running);
3941        }
3942
3943        /*
3944         * Write {count,id} tuples for every sibling.
3945         */
3946        values[n++] += perf_event_count(leader);
3947        if (read_format & PERF_FORMAT_ID)
3948                values[n++] = primary_event_id(leader);
3949
3950        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3951                values[n++] += perf_event_count(sub);
3952                if (read_format & PERF_FORMAT_ID)
3953                        values[n++] = primary_event_id(sub);
3954        }
3955
3956        return 0;
3957}
3958
3959static int perf_read_group(struct perf_event *event,
3960                                   u64 read_format, char __user *buf)
3961{
3962        struct perf_event *leader = event->group_leader, *child;
3963        struct perf_event_context *ctx = leader->ctx;
3964        int ret;
3965        u64 *values;
3966
3967        lockdep_assert_held(&ctx->mutex);
3968
3969        values = kzalloc(event->read_size, GFP_KERNEL);
3970        if (!values)
3971                return -ENOMEM;
3972
3973        values[0] = 1 + leader->nr_siblings;
3974
3975        /*
3976         * By locking the child_mutex of the leader we effectively
3977         * lock the child list of all siblings.. XXX explain how.
3978         */
3979        mutex_lock(&leader->child_mutex);
3980
3981        ret = __perf_read_group_add(leader, read_format, values);
3982        if (ret)
3983                goto unlock;
3984
3985        list_for_each_entry(child, &leader->child_list, child_list) {
3986                ret = __perf_read_group_add(child, read_format, values);
3987                if (ret)
3988                        goto unlock;
3989        }
3990
3991        mutex_unlock(&leader->child_mutex);
3992
3993        ret = event->read_size;
3994        if (copy_to_user(buf, values, event->read_size))
3995                ret = -EFAULT;
3996        goto out;
3997
3998unlock:
3999        mutex_unlock(&leader->child_mutex);
4000out:
4001        kfree(values);
4002        return ret;
4003}
4004
4005static int perf_read_one(struct perf_event *event,
4006                                 u64 read_format, char __user *buf)
4007{
4008        u64 enabled, running;
4009        u64 values[4];
4010        int n = 0;
4011
4012        values[n++] = perf_event_read_value(event, &enabled, &running);
4013        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4014                values[n++] = enabled;
4015        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4016                values[n++] = running;
4017        if (read_format & PERF_FORMAT_ID)
4018                values[n++] = primary_event_id(event);
4019
4020        if (copy_to_user(buf, values, n * sizeof(u64)))
4021                return -EFAULT;
4022
4023        return n * sizeof(u64);
4024}
4025
4026static bool is_event_hup(struct perf_event *event)
4027{
4028        bool no_children;
4029
4030        if (event->state != PERF_EVENT_STATE_EXIT)
4031                return false;
4032
4033        mutex_lock(&event->child_mutex);
4034        no_children = list_empty(&event->child_list);
4035        mutex_unlock(&event->child_mutex);
4036        return no_children;
4037}
4038
4039/*
4040 * Read the performance event - simple non blocking version for now
4041 */
4042static ssize_t
4043__perf_read(struct perf_event *event, char __user *buf, size_t count)
4044{
4045        u64 read_format = event->attr.read_format;
4046        int ret;
4047
4048        /*
4049         * Return end-of-file for a read on a event that is in
4050         * error state (i.e. because it was pinned but it couldn't be
4051         * scheduled on to the CPU at some point).
4052         */
4053        if (event->state == PERF_EVENT_STATE_ERROR)
4054                return 0;
4055
4056        if (count < event->read_size)
4057                return -ENOSPC;
4058
4059        WARN_ON_ONCE(event->ctx->parent_ctx);
4060        if (read_format & PERF_FORMAT_GROUP)
4061                ret = perf_read_group(event, read_format, buf);
4062        else
4063                ret = perf_read_one(event, read_format, buf);
4064
4065        return ret;
4066}
4067
4068static ssize_t
4069perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4070{
4071        struct perf_event *event = file->private_data;
4072        struct perf_event_context *ctx;
4073        int ret;
4074
4075        ctx = perf_event_ctx_lock(event);
4076        ret = __perf_read(event, buf, count);
4077        perf_event_ctx_unlock(event, ctx);
4078
4079        return ret;
4080}
4081
4082static unsigned int perf_poll(struct file *file, poll_table *wait)
4083{
4084        struct perf_event *event = file->private_data;
4085        struct ring_buffer *rb;
4086        unsigned int events = POLLHUP;
4087
4088        poll_wait(file, &event->waitq, wait);
4089
4090        if (is_event_hup(event))
4091                return events;
4092
4093        /*
4094         * Pin the event->rb by taking event->mmap_mutex; otherwise
4095         * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4096         */
4097        mutex_lock(&event->mmap_mutex);
4098        rb = event->rb;
4099        if (rb)
4100                events = atomic_xchg(&rb->poll, 0);
4101        mutex_unlock(&event->mmap_mutex);
4102        return events;
4103}
4104
4105static void _perf_event_reset(struct perf_event *event)
4106{
4107        (void)perf_event_read(event, false);
4108        local64_set(&event->count, 0);
4109        perf_event_update_userpage(event);
4110}
4111
4112/*
4113 * Holding the top-level event's child_mutex means that any
4114 * descendant process that has inherited this event will block
4115 * in sync_child_event if it goes to exit, thus satisfying the
4116 * task existence requirements of perf_event_enable/disable.
4117 */
4118static void perf_event_for_each_child(struct perf_event *event,
4119                                        void (*func)(struct perf_event *))
4120{
4121        struct perf_event *child;
4122
4123        WARN_ON_ONCE(event->ctx->parent_ctx);
4124
4125        mutex_lock(&event->child_mutex);
4126        func(event);
4127        list_for_each_entry(child, &event->child_list, child_list)
4128                func(child);
4129        mutex_unlock(&event->child_mutex);
4130}
4131
4132static void perf_event_for_each(struct perf_event *event,
4133                                  void (*func)(struct perf_event *))
4134{
4135        struct perf_event_context *ctx = event->ctx;
4136        struct perf_event *sibling;
4137
4138        lockdep_assert_held(&ctx->mutex);
4139
4140        event = event->group_leader;
4141
4142        perf_event_for_each_child(event, func);
4143        list_for_each_entry(sibling, &event->sibling_list, group_entry)
4144                perf_event_for_each_child(sibling, func);
4145}
4146
4147struct period_event {
4148        struct perf_event *event;
4149        u64 value;
4150};
4151
4152static int __perf_event_period(void *info)
4153{
4154        struct period_event *pe = info;
4155        struct perf_event *event = pe->event;
4156        struct perf_event_context *ctx = event->ctx;
4157        u64 value = pe->value;
4158        bool active;
4159
4160        raw_spin_lock(&ctx->lock);
4161        if (event->attr.freq) {
4162                event->attr.sample_freq = value;
4163        } else {
4164                event->attr.sample_period = value;
4165                event->hw.sample_period = value;
4166        }
4167
4168        active = (event->state == PERF_EVENT_STATE_ACTIVE);
4169        if (active) {
4170                perf_pmu_disable(ctx->pmu);
4171                event->pmu->stop(event, PERF_EF_UPDATE);
4172        }
4173
4174        local64_set(&event->hw.period_left, 0);
4175
4176        if (active) {
4177                event->pmu->start(event, PERF_EF_RELOAD);
4178                perf_pmu_enable(ctx->pmu);
4179        }
4180        raw_spin_unlock(&ctx->lock);
4181
4182        return 0;
4183}
4184
4185static int perf_event_period(struct perf_event *event, u64 __user *arg)
4186{
4187        struct period_event pe = { .event = event, };
4188        struct perf_event_context *ctx = event->ctx;
4189        struct task_struct *task;
4190        u64 value;
4191
4192        if (!is_sampling_event(event))
4193                return -EINVAL;
4194
4195        if (copy_from_user(&value, arg, sizeof(value)))
4196                return -EFAULT;
4197
4198        if (!value)
4199                return -EINVAL;
4200
4201        if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4202                return -EINVAL;
4203
4204        task = ctx->task;
4205        pe.value = value;
4206
4207        if (!task) {
4208                cpu_function_call(event->cpu, __perf_event_period, &pe);
4209                return 0;
4210        }
4211
4212retry:
4213        if (!task_function_call(task, __perf_event_period, &pe))
4214                return 0;
4215
4216        raw_spin_lock_irq(&ctx->lock);
4217        if (ctx->is_active) {
4218                raw_spin_unlock_irq(&ctx->lock);
4219                task = ctx->task;
4220                goto retry;
4221        }
4222
4223        if (event->attr.freq) {
4224                event->attr.sample_freq = value;
4225        } else {
4226                event->attr.sample_period = value;
4227                event->hw.sample_period = value;
4228        }
4229
4230        local64_set(&event->hw.period_left, 0);
4231        raw_spin_unlock_irq(&ctx->lock);
4232
4233        return 0;
4234}
4235
4236static const struct file_operations perf_fops;
4237
4238static inline int perf_fget_light(int fd, struct fd *p)
4239{
4240        struct fd f = fdget(fd);
4241        if (!f.file)
4242                return -EBADF;
4243
4244        if (f.file->f_op != &perf_fops) {
4245                fdput(f);
4246                return -EBADF;
4247        }
4248        *p = f;
4249        return 0;
4250}
4251
4252static int perf_event_set_output(struct perf_event *event,
4253                                 struct perf_event *output_event);
4254static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4255static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4256
4257static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4258{
4259        void (*func)(struct perf_event *);
4260        u32 flags = arg;
4261
4262        switch (cmd) {
4263        case PERF_EVENT_IOC_ENABLE:
4264                func = _perf_event_enable;
4265                break;
4266        case PERF_EVENT_IOC_DISABLE:
4267                func = _perf_event_disable;
4268                break;
4269        case PERF_EVENT_IOC_RESET:
4270                func = _perf_event_reset;
4271                break;
4272
4273        case PERF_EVENT_IOC_REFRESH:
4274                return _perf_event_refresh(event, arg);
4275
4276        case PERF_EVENT_IOC_PERIOD:
4277                return perf_event_period(event, (u64 __user *)arg);
4278
4279        case PERF_EVENT_IOC_ID:
4280        {
4281                u64 id = primary_event_id(event);
4282
4283                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4284                        return -EFAULT;
4285                return 0;
4286        }
4287
4288        case PERF_EVENT_IOC_SET_OUTPUT:
4289        {
4290                int ret;
4291                if (arg != -1) {
4292                        struct perf_event *output_event;
4293                        struct fd output;
4294                        ret = perf_fget_light(arg, &output);
4295                        if (ret)
4296                                return ret;
4297                        output_event = output.file->private_data;
4298                        ret = perf_event_set_output(event, output_event);
4299                        fdput(output);
4300                } else {
4301                        ret = perf_event_set_output(event, NULL);
4302                }
4303                return ret;
4304        }
4305
4306        case PERF_EVENT_IOC_SET_FILTER:
4307                return perf_event_set_filter(event, (void __user *)arg);
4308
4309        case PERF_EVENT_IOC_SET_BPF:
4310                return perf_event_set_bpf_prog(event, arg);
4311
4312        default:
4313                return -ENOTTY;
4314        }
4315
4316        if (flags & PERF_IOC_FLAG_GROUP)
4317                perf_event_for_each(event, func);
4318        else
4319                perf_event_for_each_child(event, func);
4320
4321        return 0;
4322}
4323
4324static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4325{
4326        struct perf_event *event = file->private_data;
4327        struct perf_event_context *ctx;
4328        long ret;
4329
4330        ctx = perf_event_ctx_lock(event);
4331        ret = _perf_ioctl(event, cmd, arg);
4332        perf_event_ctx_unlock(event, ctx);
4333
4334        return ret;
4335}
4336
4337#ifdef CONFIG_COMPAT
4338static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4339                                unsigned long arg)
4340{
4341        switch (_IOC_NR(cmd)) {
4342        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4343        case _IOC_NR(PERF_EVENT_IOC_ID):
4344                /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
4345                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4346                        cmd &= ~IOCSIZE_MASK;
4347                        cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4348                }
4349                break;
4350        }
4351        return perf_ioctl(file, cmd, arg);
4352}
4353#else
4354# define perf_compat_ioctl NULL
4355#endif
4356
4357int perf_event_task_enable(void)
4358{
4359        struct perf_event_context *ctx;
4360        struct perf_event *event;
4361
4362        mutex_lock(&current->perf_event_mutex);
4363        list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4364                ctx = perf_event_ctx_lock(event);
4365                perf_event_for_each_child(event, _perf_event_enable);
4366                perf_event_ctx_unlock(event, ctx);
4367        }
4368        mutex_unlock(&current->perf_event_mutex);
4369
4370        return 0;
4371}
4372
4373int perf_event_task_disable(void)
4374{
4375        struct perf_event_context *ctx;
4376        struct perf_event *event;
4377
4378        mutex_lock(&current->perf_event_mutex);
4379        list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4380                ctx = perf_event_ctx_lock(event);
4381                perf_event_for_each_child(event, _perf_event_disable);
4382                perf_event_ctx_unlock(event, ctx);
4383        }
4384        mutex_unlock(&current->perf_event_mutex);
4385
4386        return 0;
4387}
4388
4389static int perf_event_index(struct perf_event *event)
4390{
4391        if (event->hw.state & PERF_HES_STOPPED)
4392                return 0;
4393
4394        if (event->state != PERF_EVENT_STATE_ACTIVE)
4395                return 0;
4396
4397        return event->pmu->event_idx(event);
4398}
4399
4400static void calc_timer_values(struct perf_event *event,
4401                                u64 *now,
4402                                u64 *enabled,
4403                                u64 *running)
4404{
4405        u64 ctx_time;
4406
4407        *now = perf_clock();
4408        ctx_time = event->shadow_ctx_time + *now;
4409        *enabled = ctx_time - event->tstamp_enabled;
4410        *running = ctx_time - event->tstamp_running;
4411}
4412
4413static void perf_event_init_userpage(struct perf_event *event)
4414{
4415        struct perf_event_mmap_page *userpg;
4416        struct ring_buffer *rb;
4417
4418        rcu_read_lock();
4419        rb = rcu_dereference(event->rb);
4420        if (!rb)
4421                goto unlock;
4422
4423        userpg = rb->user_page;
4424
4425        /* Allow new userspace to detect that bit 0 is deprecated */
4426        userpg->cap_bit0_is_deprecated = 1;
4427        userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4428        userpg->data_offset = PAGE_SIZE;
4429        userpg->data_size = perf_data_size(rb);
4430
4431unlock:
4432        rcu_read_unlock();
4433}
4434
4435void __weak arch_perf_update_userpage(
4436        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4437{
4438}
4439
4440/*
4441 * Callers need to ensure there can be no nesting of this function, otherwise
4442 * the seqlock logic goes bad. We can not serialize this because the arch
4443 * code calls this from NMI context.
4444 */
4445void perf_event_update_userpage(struct perf_event *event)
4446{
4447        struct perf_event_mmap_page *userpg;
4448        struct ring_buffer *rb;
4449        u64 enabled, running, now;
4450
4451        rcu_read_lock();
4452        rb = rcu_dereference(event->rb);
4453        if (!rb)
4454                goto unlock;
4455
4456        /*
4457         * compute total_time_enabled, total_time_running
4458         * based on snapshot values taken when the event
4459         * was last scheduled in.
4460         *
4461         * we cannot simply called update_context_time()
4462         * because of locking issue as we can be called in
4463         * NMI context
4464         */
4465        calc_timer_values(event, &now, &enabled, &running);
4466
4467        userpg = rb->user_page;
4468        /*
4469         * Disable preemption so as to not let the corresponding user-space
4470         * spin too long if we get preempted.
4471         */
4472        preempt_disable();
4473        ++userpg->lock;
4474        barrier();
4475        userpg->index = perf_event_index(event);
4476        userpg->offset = perf_event_count(event);
4477        if (userpg->index)
4478                userpg->offset -= local64_read(&event->hw.prev_count);
4479
4480        userpg->time_enabled = enabled +
4481                        atomic64_read(&event->child_total_time_enabled);
4482
4483        userpg->time_running = running +
4484                        atomic64_read(&event->child_total_time_running);
4485
4486        arch_perf_update_userpage(event, userpg, now);
4487
4488        barrier();
4489        ++userpg->lock;
4490        preempt_enable();
4491unlock:
4492        rcu_read_unlock();
4493}
4494
4495static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4496{
4497        struct perf_event *event = vma->vm_file->private_data;
4498        struct ring_buffer *rb;
4499        int ret = VM_FAULT_SIGBUS;
4500
4501        if (vmf->flags & FAULT_FLAG_MKWRITE) {
4502                if (vmf->pgoff == 0)
4503                        ret = 0;
4504                return ret;
4505        }
4506
4507        rcu_read_lock();
4508        rb = rcu_dereference(event->rb);
4509        if (!rb)
4510                goto unlock;
4511
4512        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4513                goto unlock;
4514
4515        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4516        if (!vmf->page)
4517                goto unlock;
4518
4519        get_page(vmf->page);
4520        vmf->page->mapping = vma->vm_file->f_mapping;
4521        vmf->page->index   = vmf->pgoff;
4522
4523        ret = 0;
4524unlock:
4525        rcu_read_unlock();
4526
4527        return ret;
4528}
4529
4530static void ring_buffer_attach(struct perf_event *event,
4531                               struct ring_buffer *rb)
4532{
4533        struct ring_buffer *old_rb = NULL;
4534        unsigned long flags;
4535
4536        if (event->rb) {
4537                /*
4538                 * Should be impossible, we set this when removing
4539                 * event->rb_entry and wait/clear when adding event->rb_entry.
4540                 */
4541                WARN_ON_ONCE(event->rcu_pending);
4542
4543                old_rb = event->rb;
4544                spin_lock_irqsave(&old_rb->event_lock, flags);
4545                list_del_rcu(&event->rb_entry);
4546                spin_unlock_irqrestore(&old_rb->event_lock, flags);
4547
4548                event->rcu_batches = get_state_synchronize_rcu();
4549                event->rcu_pending = 1;
4550        }
4551
4552        if (rb) {
4553                if (event->rcu_pending) {
4554                        cond_synchronize_rcu(event->rcu_batches);
4555                        event->rcu_pending = 0;
4556                }
4557
4558                spin_lock_irqsave(&rb->event_lock, flags);
4559                list_add_rcu(&event->rb_entry, &rb->event_list);
4560                spin_unlock_irqrestore(&rb->event_lock, flags);
4561        }
4562
4563        rcu_assign_pointer(event->rb, rb);
4564
4565        if (old_rb) {
4566                ring_buffer_put(old_rb);
4567                /*
4568                 * Since we detached before setting the new rb, so that we
4569                 * could attach the new rb, we could have missed a wakeup.
4570                 * Provide it now.
4571                 */
4572                wake_up_all(&event->waitq);
4573        }
4574}
4575
4576static void ring_buffer_wakeup(struct perf_event *event)
4577{
4578        struct ring_buffer *rb;
4579
4580        rcu_read_lock();
4581        rb = rcu_dereference(event->rb);
4582        if (rb) {
4583                list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4584                        wake_up_all(&event->waitq);
4585        }
4586        rcu_read_unlock();
4587}
4588
4589struct ring_buffer *ring_buffer_get(struct perf_event *event)
4590{
4591        struct ring_buffer *rb;
4592
4593        rcu_read_lock();
4594        rb = rcu_dereference(event->rb);
4595        if (rb) {
4596                if (!atomic_inc_not_zero(&rb->refcount))
4597                        rb = NULL;
4598        }
4599        rcu_read_unlock();
4600
4601        return rb;
4602}
4603
4604void ring_buffer_put(struct ring_buffer *rb)
4605{
4606        if (!atomic_dec_and_test(&rb->refcount))
4607                return;
4608
4609        WARN_ON_ONCE(!list_empty(&rb->event_list));
4610
4611        call_rcu(&rb->rcu_head, rb_free_rcu);
4612}
4613
4614static void perf_mmap_open(struct vm_area_struct *vma)
4615{
4616        struct perf_event *event = vma->vm_file->private_data;
4617
4618        atomic_inc(&event->mmap_count);
4619        atomic_inc(&event->rb->mmap_count);
4620
4621        if (vma->vm_pgoff)
4622                atomic_inc(&event->rb->aux_mmap_count);
4623
4624        if (event->pmu->event_mapped)
4625                event->pmu->event_mapped(event);
4626}
4627
4628/*
4629 * A buffer can be mmap()ed multiple times; either directly through the same
4630 * event, or through other events by use of perf_event_set_output().
4631 *
4632 * In order to undo the VM accounting done by perf_mmap() we need to destroy
4633 * the buffer here, where we still have a VM context. This means we need
4634 * to detach all events redirecting to us.
4635 */
4636static void perf_mmap_close(struct vm_area_struct *vma)
4637{
4638        struct perf_event *event = vma->vm_file->private_data;
4639
4640        struct ring_buffer *rb = ring_buffer_get(event);
4641        struct user_struct *mmap_user = rb->mmap_user;
4642        int mmap_locked = rb->mmap_locked;
4643        unsigned long size = perf_data_size(rb);
4644
4645        if (event->pmu->event_unmapped)
4646                event->pmu->event_unmapped(event);
4647
4648        /*
4649         * rb->aux_mmap_count will always drop before rb->mmap_count and
4650         * event->mmap_count, so it is ok to use event->mmap_mutex to
4651         * serialize with perf_mmap here.
4652         */
4653        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4654            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4655                atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4656                vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4657
4658                rb_free_aux(rb);
4659                mutex_unlock(&event->mmap_mutex);
4660        }
4661
4662        atomic_dec(&rb->mmap_count);
4663
4664        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4665                goto out_put;
4666
4667        ring_buffer_attach(event, NULL);
4668        mutex_unlock(&event->mmap_mutex);
4669
4670        /* If there's still other mmap()s of this buffer, we're done. */
4671        if (atomic_read(&rb->mmap_count))
4672                goto out_put;
4673
4674        /*
4675         * No other mmap()s, detach from all other events that might redirect
4676         * into the now unreachable buffer. Somewhat complicated by the
4677         * fact that rb::event_lock otherwise nests inside mmap_mutex.
4678         */
4679again:
4680        rcu_read_lock();
4681        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4682                if (!atomic_long_inc_not_zero(&event->refcount)) {
4683                        /*
4684                         * This event is en-route to free_event() which will
4685                         * detach it and remove it from the list.
4686                         */
4687                        continue;
4688                }
4689                rcu_read_unlock();
4690
4691                mutex_lock(&event->mmap_mutex);
4692                /*
4693                 * Check we didn't race with perf_event_set_output() which can
4694                 * swizzle the rb from under us while we were waiting to
4695                 * acquire mmap_mutex.
4696                 *
4697                 * If we find a different rb; ignore this event, a next
4698                 * iteration will no longer find it on the list. We have to
4699                 * still restart the iteration to make sure we're not now
4700                 * iterating the wrong list.
4701                 */
4702                if (event->rb == rb)
4703                        ring_buffer_attach(event, NULL);
4704
4705                mutex_unlock(&event->mmap_mutex);
4706                put_event(event);
4707
4708                /*
4709                 * Restart the iteration; either we're on the wrong list or
4710                 * destroyed its integrity by doing a deletion.
4711                 */
4712                goto again;
4713        }
4714        rcu_read_unlock();
4715
4716        /*
4717         * It could be there's still a few 0-ref events on the list; they'll
4718         * get cleaned up by free_event() -- they'll also still have their
4719         * ref on the rb and will free it whenever they are done with it.
4720         *
4721         * Aside from that, this buffer is 'fully' detached and unmapped,
4722         * undo the VM accounting.
4723         */
4724
4725        atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4726        vma->vm_mm->pinned_vm -= mmap_locked;
4727        free_uid(mmap_user);
4728
4729out_put:
4730        ring_buffer_put(rb); /* could be last */
4731}
4732
4733static const struct vm_operations_struct perf_mmap_vmops = {
4734        .open           = perf_mmap_open,
4735        .close          = perf_mmap_close, /* non mergable */
4736        .fault          = perf_mmap_fault,
4737        .page_mkwrite   = perf_mmap_fault,
4738};
4739
4740static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4741{
4742        struct perf_event *event = file->private_data;
4743        unsigned long user_locked, user_lock_limit;
4744        struct user_struct *user = current_user();
4745        unsigned long locked, lock_limit;
4746        struct ring_buffer *rb = NULL;
4747        unsigned long vma_size;
4748        unsigned long nr_pages;
4749        long user_extra = 0, extra = 0;
4750        int ret = 0, flags = 0;
4751
4752        /*
4753         * Don't allow mmap() of inherited per-task counters. This would
4754         * create a performance issue due to all children writing to the
4755         * same rb.
4756         */
4757        if (event->cpu == -1 && event->attr.inherit)
4758                return -EINVAL;
4759
4760        if (!(vma->vm_flags & VM_SHARED))
4761                return -EINVAL;
4762
4763        vma_size = vma->vm_end - vma->vm_start;
4764
4765        if (vma->vm_pgoff == 0) {
4766                nr_pages = (vma_size / PAGE_SIZE) - 1;
4767        } else {
4768                /*
4769                 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4770                 * mapped, all subsequent mappings should have the same size
4771                 * and offset. Must be above the normal perf buffer.
4772                 */
4773                u64 aux_offset, aux_size;
4774
4775                if (!event->rb)
4776                        return -EINVAL;
4777
4778                nr_pages = vma_size / PAGE_SIZE;
4779
4780                mutex_lock(&event->mmap_mutex);
4781                ret = -EINVAL;
4782
4783                rb = event->rb;
4784                if (!rb)
4785                        goto aux_unlock;
4786
4787                aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4788                aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4789
4790                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4791                        goto aux_unlock;
4792
4793                if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4794                        goto aux_unlock;
4795
4796                /* already mapped with a different offset */
4797                if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4798                        goto aux_unlock;
4799
4800                if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4801                        goto aux_unlock;
4802
4803                /* already mapped with a different size */
4804                if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4805                        goto aux_unlock;
4806
4807                if (!is_power_of_2(nr_pages))
4808                        goto aux_unlock;
4809
4810                if (!atomic_inc_not_zero(&rb->mmap_count))
4811                        goto aux_unlock;
4812
4813                if (rb_has_aux(rb)) {
4814                        atomic_inc(&rb->aux_mmap_count);
4815                        ret = 0;
4816                        goto unlock;
4817                }
4818
4819                atomic_set(&rb->aux_mmap_count, 1);
4820                user_extra = nr_pages;
4821
4822                goto accounting;
4823        }
4824
4825        /*
4826         * If we have rb pages ensure they're a power-of-two number, so we
4827         * can do bitmasks instead of modulo.
4828         */
4829        if (nr_pages != 0 && !is_power_of_2(nr_pages))
4830                return -EINVAL;
4831
4832        if (vma_size != PAGE_SIZE * (1 + nr_pages))
4833                return -EINVAL;
4834
4835        WARN_ON_ONCE(event->ctx->parent_ctx);
4836again:
4837        mutex_lock(&event->mmap_mutex);
4838        if (event->rb) {
4839                if (event->rb->nr_pages != nr_pages) {
4840                        ret = -EINVAL;
4841                        goto unlock;
4842                }
4843
4844                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4845                        /*
4846                         * Raced against perf_mmap_close() through
4847                         * perf_event_set_output(). Try again, hope for better
4848                         * luck.
4849                         */
4850                        mutex_unlock(&event->mmap_mutex);
4851                        goto again;
4852                }
4853
4854                goto unlock;
4855        }
4856
4857        user_extra = nr_pages + 1;
4858
4859accounting:
4860        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4861
4862        /*
4863         * Increase the limit linearly with more CPUs:
4864         */
4865        user_lock_limit *= num_online_cpus();
4866
4867        user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4868
4869        if (user_locked > user_lock_limit)
4870                extra = user_locked - user_lock_limit;
4871
4872        lock_limit = rlimit(RLIMIT_MEMLOCK);
4873        lock_limit >>= PAGE_SHIFT;
4874        locked = vma->vm_mm->pinned_vm + extra;
4875
4876        if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4877                !capable(CAP_IPC_LOCK)) {
4878                ret = -EPERM;
4879                goto unlock;
4880        }
4881
4882        WARN_ON(!rb && event->rb);
4883
4884        if (vma->vm_flags & VM_WRITE)
4885                flags |= RING_BUFFER_WRITABLE;
4886
4887        if (!rb) {
4888                rb = rb_alloc(nr_pages,
4889                              event->attr.watermark ? event->attr.wakeup_watermark : 0,
4890                              event->cpu, flags);
4891
4892                if (!rb) {
4893                        ret = -ENOMEM;
4894                        goto unlock;
4895                }
4896
4897                atomic_set(&rb->mmap_count, 1);
4898                rb->mmap_user = get_current_user();
4899                rb->mmap_locked = extra;
4900
4901                ring_buffer_attach(event, rb);
4902
4903                perf_event_init_userpage(event);
4904                perf_event_update_userpage(event);
4905        } else {
4906                ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4907                                   event->attr.aux_watermark, flags);
4908                if (!ret)
4909                        rb->aux_mmap_locked = extra;
4910        }
4911
4912unlock:
4913        if (!ret) {
4914                atomic_long_add(user_extra, &user->locked_vm);
4915                vma->vm_mm->pinned_vm += extra;
4916
4917                atomic_inc(&event->mmap_count);
4918        } else if (rb) {
4919                atomic_dec(&rb->mmap_count);
4920        }
4921aux_unlock:
4922        mutex_unlock(&event->mmap_mutex);
4923
4924        /*
4925         * Since pinned accounting is per vm we cannot allow fork() to copy our
4926         * vma.
4927         */
4928        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4929        vma->vm_ops = &perf_mmap_vmops;
4930
4931        if (event->pmu->event_mapped)
4932                event->pmu->event_mapped(event);
4933
4934        return ret;
4935}
4936
4937static int perf_fasync(int fd, struct file *filp, int on)
4938{
4939        struct inode *inode = file_inode(filp);
4940        struct perf_event *event = filp->private_data;
4941        int retval;
4942
4943        mutex_lock(&inode->i_mutex);
4944        retval = fasync_helper(fd, filp, on, &event->fasync);
4945        mutex_unlock(&inode->i_mutex);
4946
4947        if (retval < 0)
4948                return retval;
4949
4950        return 0;
4951}
4952
4953static const struct file_operations perf_fops = {
4954        .llseek                 = no_llseek,
4955        .release                = perf_release,
4956        .read                   = perf_read,
4957        .poll                   = perf_poll,
4958        .unlocked_ioctl         = perf_ioctl,
4959        .compat_ioctl           = perf_compat_ioctl,
4960        .mmap                   = perf_mmap,
4961        .fasync                 = perf_fasync,
4962};
4963
4964/*
4965 * Perf event wakeup
4966 *
4967 * If there's data, ensure we set the poll() state and publish everything
4968 * to user-space before waking everybody up.
4969 */
4970
4971static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4972{
4973        /* only the parent has fasync state */
4974        if (event->parent)
4975                event = event->parent;
4976        return &event->fasync;
4977}
4978
4979void perf_event_wakeup(struct perf_event *event)
4980{
4981        ring_buffer_wakeup(event);
4982
4983        if (event->pending_kill) {
4984                kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
4985                event->pending_kill = 0;
4986        }
4987}
4988
4989static void perf_pending_event(struct irq_work *entry)
4990{
4991        struct perf_event *event = container_of(entry,
4992                        struct perf_event, pending);
4993        int rctx;
4994
4995        rctx = perf_swevent_get_recursion_context();
4996        /*
4997         * If we 'fail' here, that's OK, it means recursion is already disabled
4998         * and we won't recurse 'further'.
4999         */
5000
5001        if (event->pending_disable) {
5002                event->pending_disable = 0;
5003                __perf_event_disable(event);
5004        }
5005
5006        if (event->pending_wakeup) {
5007                event->pending_wakeup = 0;
5008                perf_event_wakeup(event);
5009        }
5010
5011        if (rctx >= 0)
5012                perf_swevent_put_recursion_context(rctx);
5013}
5014
5015/*
5016 * We assume there is only KVM supporting the callbacks.
5017 * Later on, we might change it to a list if there is
5018 * another virtualization implementation supporting the callbacks.
5019 */
5020struct perf_guest_info_callbacks *perf_guest_cbs;
5021
5022int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5023{
5024        perf_guest_cbs = cbs;
5025        return 0;
5026}
5027EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5028
5029int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5030{
5031        perf_guest_cbs = NULL;
5032        return 0;
5033}
5034EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5035
5036static void
5037perf_output_sample_regs(struct perf_output_handle *handle,
5038                        struct pt_regs *regs, u64 mask)
5039{
5040        int bit;
5041
5042        for_each_set_bit(bit, (const unsigned long *) &mask,
5043                         sizeof(mask) * BITS_PER_BYTE) {
5044                u64 val;
5045
5046                val = perf_reg_value(regs, bit);
5047                perf_output_put(handle, val);
5048        }
5049}
5050
5051static void perf_sample_regs_user(struct perf_regs *regs_user,
5052                                  struct pt_regs *regs,
5053                                  struct pt_regs *regs_user_copy)
5054{
5055        if (user_mode(regs)) {
5056                regs_user->abi = perf_reg_abi(current);
5057                regs_user->regs = regs;
5058        } else if (current->mm) {
5059                perf_get_regs_user(regs_user, regs, regs_user_copy);
5060        } else {
5061                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5062                regs_user->regs = NULL;
5063        }
5064}
5065
5066static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5067                                  struct pt_regs *regs)
5068{
5069        regs_intr->regs = regs;
5070        regs_intr->abi  = perf_reg_abi(current);
5071}
5072
5073
5074/*
5075 * Get remaining task size from user stack pointer.
5076 *
5077 * It'd be better to take stack vma map and limit this more
5078 * precisly, but there's no way to get it safely under interrupt,
5079 * so using TASK_SIZE as limit.
5080 */
5081static u64 perf_ustack_task_size(struct pt_regs *regs)
5082{
5083        unsigned long addr = perf_user_stack_pointer(regs);
5084
5085        if (!addr || addr >= TASK_SIZE)
5086                return 0;
5087
5088        return TASK_SIZE - addr;
5089}
5090
5091static u16
5092perf_sample_ustack_size(u16 stack_size, u16 header_size,
5093                        struct pt_regs *regs)
5094{
5095        u64 task_size;
5096
5097        /* No regs, no stack pointer, no dump. */
5098        if (!regs)
5099                return 0;
5100
5101        /*
5102         * Check if we fit in with the requested stack size into the:
5103         * - TASK_SIZE
5104         *   If we don't, we limit the size to the TASK_SIZE.
5105         *
5106         * - remaining sample size
5107         *   If we don't, we customize the stack size to
5108         *   fit in to the remaining sample size.
5109         */
5110
5111        task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5112        stack_size = min(stack_size, (u16) task_size);
5113
5114        /* Current header size plus static size and dynamic size. */
5115        header_size += 2 * sizeof(u64);
5116
5117        /* Do we fit in with the current stack dump size? */
5118        if ((u16) (header_size + stack_size) < header_size) {
5119                /*
5120                 * If we overflow the maximum size for the sample,
5121                 * we customize the stack dump size to fit in.
5122                 */
5123                stack_size = USHRT_MAX - header_size - sizeof(u64);
5124                stack_size = round_up(stack_size, sizeof(u64));
5125        }
5126
5127        return stack_size;
5128}
5129
5130static void
5131perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5132                          struct pt_regs *regs)
5133{
5134        /* Case of a kernel thread, nothing to dump */
5135        if (!regs) {
5136                u64 size = 0;
5137                perf_output_put(handle, size);
5138        } else {
5139                unsigned long sp;
5140                unsigned int rem;
5141                u64 dyn_size;
5142
5143                /*
5144                 * We dump:
5145                 * static size
5146                 *   - the size requested by user or the best one we can fit
5147                 *     in to the sample max size
5148                 * data
5149                 *   - user stack dump data
5150                 * dynamic size
5151                 *   - the actual dumped size
5152                 */
5153
5154                /* Static size. */
5155                perf_output_put(handle, dump_size);
5156
5157                /* Data. */
5158                sp = perf_user_stack_pointer(regs);
5159                rem = __output_copy_user(handle, (void *) sp, dump_size);
5160                dyn_size = dump_size - rem;
5161
5162                perf_output_skip(handle, rem);
5163
5164                /* Dynamic size. */
5165                perf_output_put(handle, dyn_size);
5166        }
5167}
5168
5169static void __perf_event_header__init_id(struct perf_event_header *header,
5170                                         struct perf_sample_data *data,
5171                                         struct perf_event *event)
5172{
5173        u64 sample_type = event->attr.sample_type;
5174
5175        data->type = sample_type;
5176        header->size += event->id_header_size;
5177
5178        if (sample_type & PERF_SAMPLE_TID) {
5179                /* namespace issues */
5180                data->tid_entry.pid = perf_event_pid(event, current);
5181                data->tid_entry.tid = perf_event_tid(event, current);
5182        }
5183
5184        if (sample_type & PERF_SAMPLE_TIME)
5185                data->time = perf_event_clock(event);
5186
5187        if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5188                data->id = primary_event_id(event);
5189
5190        if (sample_type & PERF_SAMPLE_STREAM_ID)
5191                data->stream_id = event->id;
5192
5193        if (sample_type & PERF_SAMPLE_CPU) {
5194                data->cpu_entry.cpu      = raw_smp_processor_id();
5195                data->cpu_entry.reserved = 0;
5196        }
5197}
5198
5199void perf_event_header__init_id(struct perf_event_header *header,
5200                                struct perf_sample_data *data,
5201                                struct perf_event *event)
5202{
5203        if (event->attr.sample_id_all)
5204                __perf_event_header__init_id(header, data, event);
5205}
5206
5207static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5208                                           struct perf_sample_data *data)
5209{
5210        u64 sample_type = data->type;
5211
5212        if (sample_type & PERF_SAMPLE_TID)
5213                perf_output_put(handle, data->tid_entry);
5214
5215        if (sample_type & PERF_SAMPLE_TIME)
5216                perf_output_put(handle, data->time);
5217
5218        if (sample_type & PERF_SAMPLE_ID)
5219                perf_output_put(handle, data->id);
5220
5221        if (sample_type & PERF_SAMPLE_STREAM_ID)
5222                perf_output_put(handle, data->stream_id);
5223
5224        if (sample_type & PERF_SAMPLE_CPU)
5225                perf_output_put(handle, data->cpu_entry);
5226
5227        if (sample_type & PERF_SAMPLE_IDENTIFIER)
5228                perf_output_put(handle, data->id);
5229}
5230
5231void perf_event__output_id_sample(struct perf_event *event,
5232                                  struct perf_output_handle *handle,
5233                                  struct perf_sample_data *sample)
5234{
5235        if (event->attr.sample_id_all)
5236                __perf_event__output_id_sample(handle, sample);
5237}
5238
5239static void perf_output_read_one(struct perf_output_handle *handle,
5240                                 struct perf_event *event,
5241                                 u64 enabled, u64 running)
5242{
5243        u64 read_format = event->attr.read_format;
5244        u64 values[4];
5245        int n = 0;
5246
5247        values[n++] = perf_event_count(event);
5248        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5249                values[n++] = enabled +
5250                        atomic64_read(&event->child_total_time_enabled);
5251        }
5252        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5253                values[n++] = running +
5254                        atomic64_read(&event->child_total_time_running);
5255        }
5256        if (read_format & PERF_FORMAT_ID)
5257                values[n++] = primary_event_id(event);
5258
5259        __output_copy(handle, values, n * sizeof(u64));
5260}
5261
5262/*
5263 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5264 */
5265static void perf_output_read_group(struct perf_output_handle *handle,
5266                            struct perf_event *event,
5267                            u64 enabled, u64 running)
5268{
5269        struct perf_event *leader = event->group_leader, *sub;
5270        u64 read_format = event->attr.read_format;
5271        u64 values[5];
5272        int n = 0;
5273
5274        values[n++] = 1 + leader->nr_siblings;
5275
5276        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5277                values[n++] = enabled;
5278
5279        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5280                values[n++] = running;
5281
5282        if (leader != event)
5283                leader->pmu->read(leader);
5284
5285        values[n++] = perf_event_count(leader);
5286        if (read_format & PERF_FORMAT_ID)
5287                values[n++] = primary_event_id(leader);
5288
5289        __output_copy(handle, values, n * sizeof(u64));
5290
5291        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5292                n = 0;
5293
5294                if ((sub != event) &&
5295                    (sub->state == PERF_EVENT_STATE_ACTIVE))
5296                        sub->pmu->read(sub);
5297
5298                values[n++] = perf_event_count(sub);
5299                if (read_format & PERF_FORMAT_ID)
5300                        values[n++] = primary_event_id(sub);
5301
5302                __output_copy(handle, values, n * sizeof(u64));
5303        }
5304}
5305
5306#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5307                                 PERF_FORMAT_TOTAL_TIME_RUNNING)
5308
5309static void perf_output_read(struct perf_output_handle *handle,
5310                             struct perf_event *event)
5311{
5312        u64 enabled = 0, running = 0, now;
5313        u64 read_format = event->attr.read_format;
5314
5315        /*
5316         * compute total_time_enabled, total_time_running
5317         * based on snapshot values taken when the event
5318         * was last scheduled in.
5319         *
5320         * we cannot simply called update_context_time()
5321         * because of locking issue as we are called in
5322         * NMI context
5323         */
5324        if (read_format & PERF_FORMAT_TOTAL_TIMES)
5325                calc_timer_values(event, &now, &enabled, &running);
5326
5327        if (event->attr.read_format & PERF_FORMAT_GROUP)
5328                perf_output_read_group(handle, event, enabled, running);
5329        else
5330                perf_output_read_one(handle, event, enabled, running);
5331}
5332
5333void perf_output_sample(struct perf_output_handle *handle,
5334                        struct perf_event_header *header,
5335                        struct perf_sample_data *data,
5336                        struct perf_event *event)
5337{
5338        u64 sample_type = data->type;
5339
5340        perf_output_put(handle, *header);
5341
5342        if (sample_type & PERF_SAMPLE_IDENTIFIER)
5343                perf_output_put(handle, data->id);
5344
5345        if (sample_type & PERF_SAMPLE_IP)
5346                perf_output_put(handle, data->ip);
5347
5348        if (sample_type & PERF_SAMPLE_TID)
5349                perf_output_put(handle, data->tid_entry);
5350
5351        if (sample_type & PERF_SAMPLE_TIME)
5352                perf_output_put(handle, data->time);
5353
5354        if (sample_type & PERF_SAMPLE_ADDR)
5355                perf_output_put(handle, data->addr);
5356
5357        if (sample_type & PERF_SAMPLE_ID)
5358                perf_output_put(handle, data->id);
5359
5360        if (sample_type & PERF_SAMPLE_STREAM_ID)
5361                perf_output_put(handle, data->stream_id);
5362
5363        if (sample_type & PERF_SAMPLE_CPU)
5364                perf_output_put(handle, data->cpu_entry);
5365
5366        if (sample_type & PERF_SAMPLE_PERIOD)
5367                perf_output_put(handle, data->period);
5368
5369        if (sample_type & PERF_SAMPLE_READ)
5370                perf_output_read(handle, event);
5371
5372        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5373                if (data->callchain) {
5374                        int size = 1;
5375
5376                        if (data->callchain)
5377                                size += data->callchain->nr;
5378
5379                        size *= sizeof(u64);
5380
5381                        __output_copy(handle, data->callchain, size);
5382                } else {
5383                        u64 nr = 0;
5384                        perf_output_put(handle, nr);
5385                }
5386        }
5387
5388        if (sample_type & PERF_SAMPLE_RAW) {
5389                if (data->raw) {
5390                        u32 raw_size = data->raw->size;
5391                        u32 real_size = round_up(raw_size + sizeof(u32),
5392                                                 sizeof(u64)) - sizeof(u32);
5393                        u64 zero = 0;
5394
5395                        perf_output_put(handle, real_size);
5396                        __output_copy(handle, data->raw->data, raw_size);
5397                        if (real_size - raw_size)
5398                                __output_copy(handle, &zero, real_size - raw_size);
5399                } else {
5400                        struct {
5401                                u32     size;
5402                                u32     data;
5403                        } raw = {
5404                                .size = sizeof(u32),
5405                                .data = 0,
5406                        };
5407                        perf_output_put(handle, raw);
5408                }
5409        }
5410
5411        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5412                if (data->br_stack) {
5413                        size_t size;
5414
5415                        size = data->br_stack->nr
5416                             * sizeof(struct perf_branch_entry);
5417
5418                        perf_output_put(handle, data->br_stack->nr);
5419                        perf_output_copy(handle, data->br_stack->entries, size);
5420                } else {
5421                        /*
5422                         * we always store at least the value of nr
5423                         */
5424                        u64 nr = 0;
5425                        perf_output_put(handle, nr);
5426                }
5427        }
5428
5429        if (sample_type & PERF_SAMPLE_REGS_USER) {
5430                u64 abi = data->regs_user.abi;
5431
5432                /*
5433                 * If there are no regs to dump, notice it through
5434                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5435                 */
5436                perf_output_put(handle, abi);
5437
5438                if (abi) {
5439                        u64 mask = event->attr.sample_regs_user;
5440                        perf_output_sample_regs(handle,
5441                                                data->regs_user.regs,
5442                                                mask);
5443                }
5444        }
5445
5446        if (sample_type & PERF_SAMPLE_STACK_USER) {
5447                perf_output_sample_ustack(handle,
5448                                          data->stack_user_size,
5449                                          data->regs_user.regs);
5450        }
5451
5452        if (sample_type & PERF_SAMPLE_WEIGHT)
5453                perf_output_put(handle, data->weight);
5454
5455        if (sample_type & PERF_SAMPLE_DATA_SRC)
5456                perf_output_put(handle, data->data_src.val);
5457
5458        if (sample_type & PERF_SAMPLE_TRANSACTION)
5459                perf_output_put(handle, data->txn);
5460
5461        if (sample_type & PERF_SAMPLE_REGS_INTR) {
5462                u64 abi = data->regs_intr.abi;
5463                /*
5464                 * If there are no regs to dump, notice it through
5465                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5466                 */
5467                perf_output_put(handle, abi);
5468
5469                if (abi) {
5470                        u64 mask = event->attr.sample_regs_intr;
5471
5472                        perf_output_sample_regs(handle,
5473                                                data->regs_intr.regs,
5474                                                mask);
5475                }
5476        }
5477
5478        if (!event->attr.watermark) {
5479                int wakeup_events = event->attr.wakeup_events;
5480
5481                if (wakeup_events) {
5482                        struct ring_buffer *rb = handle->rb;
5483                        int events = local_inc_return(&rb->events);
5484
5485                        if (events >= wakeup_events) {
5486                                local_sub(wakeup_events, &rb->events);
5487                                local_inc(&rb->wakeup);
5488                        }
5489                }
5490        }
5491}
5492
5493void perf_prepare_sample(struct perf_event_header *header,
5494                         struct perf_sample_data *data,
5495                         struct perf_event *event,
5496                         struct pt_regs *regs)
5497{
5498        u64 sample_type = event->attr.sample_type;
5499
5500        header->type = PERF_RECORD_SAMPLE;
5501        header->size = sizeof(*header) + event->header_size;
5502
5503        header->misc = 0;
5504        header->misc |= perf_misc_flags(regs);
5505
5506        __perf_event_header__init_id(header, data, event);
5507
5508        if (sample_type & PERF_SAMPLE_IP)
5509                data->ip = perf_instruction_pointer(regs);
5510
5511        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5512                int size = 1;
5513
5514                data->callchain = perf_callchain(event, regs);
5515
5516                if (data->callchain)
5517                        size += data->callchain->nr;
5518
5519                header->size += size * sizeof(u64);
5520        }
5521
5522        if (sample_type & PERF_SAMPLE_RAW) {
5523                int size = sizeof(u32);
5524
5525                if (data->raw)
5526                        size += data->raw->size;
5527                else
5528                        size += sizeof(u32);
5529
5530                header->size += round_up(size, sizeof(u64));
5531        }
5532
5533        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5534                int size = sizeof(u64); /* nr */
5535                if (data->br_stack) {
5536                        size += data->br_stack->nr
5537                              * sizeof(struct perf_branch_entry);
5538                }
5539                header->size += size;
5540        }
5541
5542        if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5543                perf_sample_regs_user(&data->regs_user, regs,
5544                                      &data->regs_user_copy);
5545
5546        if (sample_type & PERF_SAMPLE_REGS_USER) {
5547                /* regs dump ABI info */
5548                int size = sizeof(u64);
5549
5550                if (data->regs_user.regs) {
5551                        u64 mask = event->attr.sample_regs_user;
5552                        size += hweight64(mask) * sizeof(u64);
5553                }
5554
5555                header->size += size;
5556        }
5557
5558        if (sample_type & PERF_SAMPLE_STACK_USER) {
5559                /*
5560                 * Either we need PERF_SAMPLE_STACK_USER bit to be allways
5561                 * processed as the last one or have additional check added
5562                 * in case new sample type is added, because we could eat
5563                 * up the rest of the sample size.
5564                 */
5565                u16 stack_size = event->attr.sample_stack_user;
5566                u16 size = sizeof(u64);
5567
5568                stack_size = perf_sample_ustack_size(stack_size, header->size,
5569                                                     data->regs_user.regs);
5570
5571                /*
5572                 * If there is something to dump, add space for the dump
5573                 * itself and for the field that tells the dynamic size,
5574                 * which is how many have been actually dumped.
5575                 */
5576                if (stack_size)
5577                        size += sizeof(u64) + stack_size;
5578
5579                data->stack_user_size = stack_size;
5580                header->size += size;
5581        }
5582
5583        if (sample_type & PERF_SAMPLE_REGS_INTR) {
5584                /* regs dump ABI info */
5585                int size = sizeof(u64);
5586
5587                perf_sample_regs_intr(&data->regs_intr, regs);
5588
5589                if (data->regs_intr.regs) {
5590                        u64 mask = event->attr.sample_regs_intr;
5591
5592                        size += hweight64(mask) * sizeof(u64);
5593                }
5594
5595                header->size += size;
5596        }
5597}
5598
5599void perf_event_output(struct perf_event *event,
5600                        struct perf_sample_data *data,
5601                        struct pt_regs *regs)
5602{
5603        struct perf_output_handle handle;
5604        struct perf_event_header header;
5605
5606        /* protect the callchain buffers */
5607        rcu_read_lock();
5608
5609        perf_prepare_sample(&header, data, event, regs);
5610
5611        if (perf_output_begin(&handle, event, header.size))
5612                goto exit;
5613
5614        perf_output_sample(&handle, &header, data, event);
5615
5616        perf_output_end(&handle);
5617
5618exit:
5619        rcu_read_unlock();
5620}
5621
5622/*
5623 * read event_id
5624 */
5625
5626struct perf_read_event {
5627        struct perf_event_header        header;
5628
5629        u32                             pid;
5630        u32                             tid;
5631};
5632
5633static void
5634perf_event_read_event(struct perf_event *event,
5635                        struct task_struct *task)
5636{
5637        struct perf_output_handle handle;
5638        struct perf_sample_data sample;
5639        struct perf_read_event read_event = {
5640                .header = {
5641                        .type = PERF_RECORD_READ,
5642                        .misc = 0,
5643                        .size = sizeof(read_event) + event->read_size,
5644                },
5645                .pid = perf_event_pid(event, task),
5646                .tid = perf_event_tid(event, task),
5647        };
5648        int ret;
5649
5650        perf_event_header__init_id(&read_event.header, &sample, event);
5651        ret = perf_output_begin(&handle, event, read_event.header.size);
5652        if (ret)
5653                return;
5654
5655        perf_output_put(&handle, read_event);
5656        perf_output_read(&handle, event);
5657        perf_event__output_id_sample(event, &handle, &sample);
5658
5659        perf_output_end(&handle);
5660}
5661
5662typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5663
5664static void
5665perf_event_aux_ctx(struct perf_event_context *ctx,
5666                   perf_event_aux_output_cb output,
5667                   void *data)
5668{
5669        struct perf_event *event;
5670
5671        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5672                if (event->state < PERF_EVENT_STATE_INACTIVE)
5673                        continue;
5674                if (!event_filter_match(event))
5675                        continue;
5676                output(event, data);
5677        }
5678}
5679
5680static void
5681perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
5682                        struct perf_event_context *task_ctx)
5683{
5684        rcu_read_lock();
5685        preempt_disable();
5686        perf_event_aux_ctx(task_ctx, output, data);
5687        preempt_enable();
5688        rcu_read_unlock();
5689}
5690
5691static void
5692perf_event_aux(perf_event_aux_output_cb output, void *data,
5693               struct perf_event_context *task_ctx)
5694{
5695        struct perf_cpu_context *cpuctx;
5696        struct perf_event_context *ctx;
5697        struct pmu *pmu;
5698        int ctxn;
5699
5700        /*
5701         * If we have task_ctx != NULL we only notify
5702         * the task context itself. The task_ctx is set
5703         * only for EXIT events before releasing task
5704         * context.
5705         */
5706        if (task_ctx) {
5707                perf_event_aux_task_ctx(output, data, task_ctx);
5708                return;
5709        }
5710
5711        rcu_read_lock();
5712        list_for_each_entry_rcu(pmu, &pmus, entry) {
5713                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5714                if (cpuctx->unique_pmu != pmu)
5715                        goto next;
5716                perf_event_aux_ctx(&cpuctx->ctx, output, data);
5717                ctxn = pmu->task_ctx_nr;
5718                if (ctxn < 0)
5719                        goto next;
5720                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5721                if (ctx)
5722                        perf_event_aux_ctx(ctx, output, data);
5723next:
5724                put_cpu_ptr(pmu->pmu_cpu_context);
5725        }
5726        rcu_read_unlock();
5727}
5728
5729/*
5730 * task tracking -- fork/exit
5731 *
5732 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
5733 */
5734
5735struct perf_task_event {
5736        struct task_struct              *task;
5737        struct perf_event_context       *task_ctx;
5738
5739        struct {
5740                struct perf_event_header        header;
5741
5742                u32                             pid;
5743                u32                             ppid;
5744                u32                             tid;
5745                u32                             ptid;
5746                u64                             time;
5747        } event_id;
5748};
5749
5750static int perf_event_task_match(struct perf_event *event)
5751{
5752        return event->attr.comm  || event->attr.mmap ||
5753               event->attr.mmap2 || event->attr.mmap_data ||
5754               event->attr.task;
5755}
5756
5757static void perf_event_task_output(struct perf_event *event,
5758                                   void *data)
5759{
5760        struct perf_task_event *task_event = data;
5761        struct perf_output_handle handle;
5762        struct perf_sample_data sample;
5763        struct task_struct *task = task_event->task;
5764        int ret, size = task_event->event_id.header.size;
5765
5766        if (!perf_event_task_match(event))
5767                return;
5768
5769        perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5770
5771        ret = perf_output_begin(&handle, event,
5772                                task_event->event_id.header.size);
5773        if (ret)
5774                goto out;
5775
5776        task_event->event_id.pid = perf_event_pid(event, task);
5777        task_event->event_id.ppid = perf_event_pid(event, current);
5778
5779        task_event->event_id.tid = perf_event_tid(event, task);
5780        task_event->event_id.ptid = perf_event_tid(event, current);
5781
5782        task_event->event_id.time = perf_event_clock(event);
5783
5784        perf_output_put(&handle, task_event->event_id);
5785
5786        perf_event__output_id_sample(event, &handle, &sample);
5787
5788        perf_output_end(&handle);
5789out:
5790        task_event->event_id.header.size = size;
5791}
5792
5793static void perf_event_task(struct task_struct *task,
5794                              struct perf_event_context *task_ctx,
5795                              int new)
5796{
5797        struct perf_task_event task_event;
5798
5799        if (!atomic_read(&nr_comm_events) &&
5800            !atomic_read(&nr_mmap_events) &&
5801            !atomic_read(&nr_task_events))
5802                return;
5803
5804        task_event = (struct perf_task_event){
5805                .task     = task,
5806                .task_ctx = task_ctx,
5807                .event_id    = {
5808                        .header = {
5809                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5810                                .misc = 0,
5811                                .size = sizeof(task_event.event_id),
5812                        },
5813                        /* .pid  */
5814                        /* .ppid */
5815                        /* .tid  */
5816                        /* .ptid */
5817                        /* .time */
5818                },
5819        };
5820
5821        perf_event_aux(perf_event_task_output,
5822                       &task_event,
5823                       task_ctx);
5824}
5825
5826void perf_event_fork(struct task_struct *task)
5827{
5828        perf_event_task(task, NULL, 1);
5829}
5830
5831/*
5832 * comm tracking
5833 */
5834
5835struct perf_comm_event {
5836        struct task_struct      *task;
5837        char                    *comm;
5838        int                     comm_size;
5839
5840        struct {
5841                struct perf_event_header        header;
5842
5843                u32                             pid;
5844                u32                             tid;
5845        } event_id;
5846};
5847
5848static int perf_event_comm_match(struct perf_event *event)
5849{
5850        return event->attr.comm;
5851}
5852
5853static void perf_event_comm_output(struct perf_event *event,
5854                                   void *data)
5855{
5856        struct perf_comm_event *comm_event = data;
5857        struct perf_output_handle handle;
5858        struct perf_sample_data sample;
5859        int size = comm_event->event_id.header.size;
5860        int ret;
5861
5862        if (!perf_event_comm_match(event))
5863                return;
5864
5865        perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5866        ret = perf_output_begin(&handle, event,
5867                                comm_event->event_id.header.size);
5868
5869        if (ret)
5870                goto out;
5871
5872        comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5873        comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5874
5875        perf_output_put(&handle, comm_event->event_id);
5876        __output_copy(&handle, comm_event->comm,
5877                                   comm_event->comm_size);
5878
5879        perf_event__output_id_sample(event, &handle, &sample);
5880
5881        perf_output_end(&handle);
5882out:
5883        comm_event->event_id.header.size = size;
5884}
5885
5886static void perf_event_comm_event(struct perf_comm_event *comm_event)
5887{
5888        char comm[TASK_COMM_LEN];
5889        unsigned int size;
5890
5891        memset(comm, 0, sizeof(comm));
5892        strlcpy(comm, comm_event->task->comm, sizeof(comm));
5893        size = ALIGN(strlen(comm)+1, sizeof(u64));
5894
5895        comm_event->comm = comm;
5896        comm_event->comm_size = size;
5897
5898        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5899
5900        perf_event_aux(perf_event_comm_output,
5901                       comm_event,
5902                       NULL);
5903}
5904
5905void perf_event_comm(struct task_struct *task, bool exec)
5906{
5907        struct perf_comm_event comm_event;
5908
5909        if (!atomic_read(&nr_comm_events))
5910                return;
5911
5912        comm_event = (struct perf_comm_event){
5913                .task   = task,
5914                /* .comm      */
5915                /* .comm_size */
5916                .event_id  = {
5917                        .header = {
5918                                .type = PERF_RECORD_COMM,
5919                                .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5920                                /* .size */
5921                        },
5922                        /* .pid */
5923                        /* .tid */
5924                },
5925        };
5926
5927        perf_event_comm_event(&comm_event);
5928}
5929
5930/*
5931 * mmap tracking
5932 */
5933
5934struct perf_mmap_event {
5935        struct vm_area_struct   *vma;
5936
5937        const char              *file_name;
5938        int                     file_size;
5939        int                     maj, min;
5940        u64                     ino;
5941        u64                     ino_generation;
5942        u32                     prot, flags;
5943
5944        struct {
5945                struct perf_event_header        header;
5946
5947                u32                             pid;
5948                u32                             tid;
5949                u64                             start;
5950                u64                             len;
5951                u64                             pgoff;
5952        } event_id;
5953};
5954
5955static int perf_event_mmap_match(struct perf_event *event,
5956                                 void *data)
5957{
5958        struct perf_mmap_event *mmap_event = data;
5959        struct vm_area_struct *vma = mmap_event->vma;
5960        int executable = vma->vm_flags & VM_EXEC;
5961
5962        return (!executable && event->attr.mmap_data) ||
5963               (executable && (event->attr.mmap || event->attr.mmap2));
5964}
5965
5966static void perf_event_mmap_output(struct perf_event *event,
5967                                   void *data)
5968{
5969        struct perf_mmap_event *mmap_event = data;
5970        struct perf_output_handle handle;
5971        struct perf_sample_data sample;
5972        int size = mmap_event->event_id.header.size;
5973        int ret;
5974
5975        if (!perf_event_mmap_match(event, data))
5976                return;
5977
5978        if (event->attr.mmap2) {
5979                mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5980                mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5981                mmap_event->event_id.header.size += sizeof(mmap_event->min);
5982                mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5983                mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5984                mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5985                mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5986        }
5987
5988        perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5989        ret = perf_output_begin(&handle, event,
5990                                mmap_event->event_id.header.size);
5991        if (ret)
5992                goto out;
5993
5994        mmap_event->event_id.pid = perf_event_pid(event, current);
5995        mmap_event->event_id.tid = perf_event_tid(event, current);
5996
5997        perf_output_put(&handle, mmap_event->event_id);
5998
5999        if (event->attr.mmap2) {
6000                perf_output_put(&handle, mmap_event->maj);
6001                perf_output_put(&handle, mmap_event->min);
6002                perf_output_put(&handle, mmap_event->ino);
6003                perf_output_put(&handle, mmap_event->ino_generation);
6004                perf_output_put(&handle, mmap_event->prot);
6005                perf_output_put(&handle, mmap_event->flags);
6006        }
6007
6008        __output_copy(&handle, mmap_event->file_name,
6009                                   mmap_event->file_size);
6010
6011        perf_event__output_id_sample(event, &handle, &sample);
6012
6013        perf_output_end(&handle);
6014out:
6015        mmap_event->event_id.header.size = size;
6016}
6017
6018static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6019{
6020        struct vm_area_struct *vma = mmap_event->vma;
6021        struct file *file = vma->vm_file;
6022        int maj = 0, min = 0;
6023        u64 ino = 0, gen = 0;
6024        u32 prot = 0, flags = 0;
6025        unsigned int size;
6026        char tmp[16];
6027        char *buf = NULL;
6028        char *name;
6029
6030        if (file) {
6031                struct inode *inode;
6032                dev_t dev;
6033
6034                buf = kmalloc(PATH_MAX, GFP_KERNEL);
6035                if (!buf) {
6036                        name = "//enomem";
6037                        goto cpy_name;
6038                }
6039                /*
6040                 * d_path() works from the end of the rb backwards, so we
6041                 * need to add enough zero bytes after the string to handle
6042                 * the 64bit alignment we do later.
6043                 */
6044                name = file_path(file, buf, PATH_MAX - sizeof(u64));
6045                if (IS_ERR(name)) {
6046                        name = "//toolong";
6047                        goto cpy_name;
6048                }
6049                inode = file_inode(vma->vm_file);
6050                dev = inode->i_sb->s_dev;
6051                ino = inode->i_ino;
6052                gen = inode->i_generation;
6053                maj = MAJOR(dev);
6054                min = MINOR(dev);
6055
6056                if (vma->vm_flags & VM_READ)
6057                        prot |= PROT_READ;
6058                if (vma->vm_flags & VM_WRITE)
6059                        prot |= PROT_WRITE;
6060                if (vma->vm_flags & VM_EXEC)
6061                        prot |= PROT_EXEC;
6062
6063                if (vma->vm_flags & VM_MAYSHARE)
6064                        flags = MAP_SHARED;
6065                else
6066                        flags = MAP_PRIVATE;
6067
6068                if (vma->vm_flags & VM_DENYWRITE)
6069                        flags |= MAP_DENYWRITE;
6070                if (vma->vm_flags & VM_MAYEXEC)
6071                        flags |= MAP_EXECUTABLE;
6072                if (vma->vm_flags & VM_LOCKED)
6073                        flags |= MAP_LOCKED;
6074                if (vma->vm_flags & VM_HUGETLB)
6075                        flags |= MAP_HUGETLB;
6076
6077                goto got_name;
6078        } else {
6079                if (vma->vm_ops && vma->vm_ops->name) {
6080                        name = (char *) vma->vm_ops->name(vma);
6081                        if (name)
6082                                goto cpy_name;
6083                }
6084
6085                name = (char *)arch_vma_name(vma);
6086                if (name)
6087                        goto cpy_name;
6088
6089                if (vma->vm_start <= vma->vm_mm->start_brk &&
6090                                vma->vm_end >= vma->vm_mm->brk) {
6091                        name = "[heap]";
6092                        goto cpy_name;
6093                }
6094                if (vma->vm_start <= vma->vm_mm->start_stack &&
6095                                vma->vm_end >= vma->vm_mm->start_stack) {
6096                        name = "[stack]";
6097                        goto cpy_name;
6098                }
6099
6100                name = "//anon";
6101                goto cpy_name;
6102        }
6103
6104cpy_name:
6105        strlcpy(tmp, name, sizeof(tmp));
6106        name = tmp;
6107got_name:
6108        /*
6109         * Since our buffer works in 8 byte units we need to align our string
6110         * size to a multiple of 8. However, we must guarantee the tail end is
6111         * zero'd out to avoid leaking random bits to userspace.
6112         */
6113        size = strlen(name)+1;
6114        while (!IS_ALIGNED(size, sizeof(u64)))
6115                name[size++] = '\0';
6116
6117        mmap_event->file_name = name;
6118        mmap_event->file_size = size;
6119        mmap_event->maj = maj;
6120        mmap_event->min = min;
6121        mmap_event->ino = ino;
6122        mmap_event->ino_generation = gen;
6123        mmap_event->prot = prot;
6124        mmap_event->flags = flags;
6125
6126        if (!(vma->vm_flags & VM_EXEC))
6127                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6128
6129        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6130
6131        perf_event_aux(perf_event_mmap_output,
6132                       mmap_event,
6133                       NULL);
6134
6135        kfree(buf);
6136}
6137
6138void perf_event_mmap(struct vm_area_struct *vma)
6139{
6140        struct perf_mmap_event mmap_event;
6141
6142        if (!atomic_read(&nr_mmap_events))
6143                return;
6144
6145        mmap_event = (struct perf_mmap_event){
6146                .vma    = vma,
6147                /* .file_name */
6148                /* .file_size */
6149                .event_id  = {
6150                        .header = {
6151                                .type = PERF_RECORD_MMAP,
6152                                .misc = PERF_RECORD_MISC_USER,
6153                                /* .size */
6154                        },
6155                        /* .pid */
6156                        /* .tid */
6157                        .start  = vma->vm_start,
6158                        .len    = vma->vm_end - vma->vm_start,
6159                        .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
6160                },
6161                /* .maj (attr_mmap2 only) */
6162                /* .min (attr_mmap2 only) */
6163                /* .ino (attr_mmap2 only) */
6164                /* .ino_generation (attr_mmap2 only) */
6165                /* .prot (attr_mmap2 only) */
6166                /* .flags (attr_mmap2 only) */
6167        };
6168
6169        perf_event_mmap_event(&mmap_event);
6170}
6171
6172void perf_event_aux_event(struct perf_event *event, unsigned long head,
6173                          unsigned long size, u64 flags)
6174{
6175        struct perf_output_handle handle;
6176        struct perf_sample_data sample;
6177        struct perf_aux_event {
6178                struct perf_event_header        header;
6179                u64                             offset;
6180                u64                             size;
6181                u64                             flags;
6182        } rec = {
6183                .header = {
6184                        .type = PERF_RECORD_AUX,
6185                        .misc = 0,
6186                        .size = sizeof(rec),
6187                },
6188                .offset         = head,
6189                .size           = size,
6190                .flags          = flags,
6191        };
6192        int ret;
6193
6194        perf_event_header__init_id(&rec.header, &sample, event);
6195        ret = perf_output_begin(&handle, event, rec.header.size);
6196
6197        if (ret)
6198                return;
6199
6200        perf_output_put(&handle, rec);
6201        perf_event__output_id_sample(event, &handle, &sample);
6202
6203        perf_output_end(&handle);
6204}
6205
6206/*
6207 * Lost/dropped samples logging
6208 */
6209void perf_log_lost_samples(struct perf_event *event, u64 lost)
6210{
6211        struct perf_output_handle handle;
6212        struct perf_sample_data sample;
6213        int ret;
6214
6215        struct {
6216                struct perf_event_header        header;
6217                u64                             lost;
6218        } lost_samples_event = {
6219                .header = {
6220                        .type = PERF_RECORD_LOST_SAMPLES,
6221                        .misc = 0,
6222                        .size = sizeof(lost_samples_event),
6223                },
6224                .lost           = lost,
6225        };
6226
6227        perf_event_header__init_id(&lost_samples_event.header, &sample, event);
6228
6229        ret = perf_output_begin(&handle, event,
6230                                lost_samples_event.header.size);
6231        if (ret)
6232                return;
6233
6234        perf_output_put(&handle, lost_samples_event);
6235        perf_event__output_id_sample(event, &handle, &sample);
6236        perf_output_end(&handle);
6237}
6238
6239/*
6240 * context_switch tracking
6241 */
6242
6243struct perf_switch_event {
6244        struct task_struct      *task;
6245        struct task_struct      *next_prev;
6246
6247        struct {
6248                struct perf_event_header        header;
6249                u32                             next_prev_pid;
6250                u32                             next_prev_tid;
6251        } event_id;
6252};
6253
6254static int perf_event_switch_match(struct perf_event *event)
6255{
6256        return event->attr.context_switch;
6257}
6258
6259static void perf_event_switch_output(struct perf_event *event, void *data)
6260{
6261        struct perf_switch_event *se = data;
6262        struct perf_output_handle handle;
6263        struct perf_sample_data sample;
6264        int ret;
6265
6266        if (!perf_event_switch_match(event))
6267                return;
6268
6269        /* Only CPU-wide events are allowed to see next/prev pid/tid */
6270        if (event->ctx->task) {
6271                se->event_id.header.type = PERF_RECORD_SWITCH;
6272                se->event_id.header.size = sizeof(se->event_id.header);
6273        } else {
6274                se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6275                se->event_id.header.size = sizeof(se->event_id);
6276                se->event_id.next_prev_pid =
6277                                        perf_event_pid(event, se->next_prev);
6278                se->event_id.next_prev_tid =
6279                                        perf_event_tid(event, se->next_prev);
6280        }
6281
6282        perf_event_header__init_id(&se->event_id.header, &sample, event);
6283
6284        ret = perf_output_begin(&handle, event, se->event_id.header.size);
6285        if (ret)
6286                return;
6287
6288        if (event->ctx->task)
6289                perf_output_put(&handle, se->event_id.header);
6290        else
6291                perf_output_put(&handle, se->event_id);
6292
6293        perf_event__output_id_sample(event, &handle, &sample);
6294
6295        perf_output_end(&handle);
6296}
6297
6298static void perf_event_switch(struct task_struct *task,
6299                              struct task_struct *next_prev, bool sched_in)
6300{
6301        struct perf_switch_event switch_event;
6302
6303        /* N.B. caller checks nr_switch_events != 0 */
6304
6305        switch_event = (struct perf_switch_event){
6306                .task           = task,
6307                .next_prev      = next_prev,
6308                .event_id       = {
6309                        .header = {
6310                                /* .type */
6311                                .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
6312                                /* .size */
6313                        },
6314                        /* .next_prev_pid */
6315                        /* .next_prev_tid */
6316                },
6317        };
6318
6319        perf_event_aux(perf_event_switch_output,
6320                       &switch_event,
6321                       NULL);
6322}
6323
6324/*
6325 * IRQ throttle logging
6326 */
6327
6328static void perf_log_throttle(struct perf_event *event, int enable)
6329{
6330        struct perf_output_handle handle;
6331        struct perf_sample_data sample;
6332        int ret;
6333
6334        struct {
6335                struct perf_event_header        header;
6336                u64                             time;
6337                u64                             id;
6338                u64                             stream_id;
6339        } throttle_event = {
6340                .header = {
6341                        .type = PERF_RECORD_THROTTLE,
6342                        .misc = 0,
6343                        .size = sizeof(throttle_event),
6344                },
6345                .time           = perf_event_clock(event),
6346                .id             = primary_event_id(event),
6347                .stream_id      = event->id,
6348        };
6349
6350        if (enable)
6351                throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
6352
6353        perf_event_header__init_id(&throttle_event.header, &sample, event);
6354
6355        ret = perf_output_begin(&handle, event,
6356                                throttle_event.header.size);
6357        if (ret)
6358                return;
6359
6360        perf_output_put(&handle, throttle_event);
6361        perf_event__output_id_sample(event, &handle, &sample);
6362        perf_output_end(&handle);
6363}
6364
6365static void perf_log_itrace_start(struct perf_event *event)
6366{
6367        struct perf_output_handle handle;
6368        struct perf_sample_data sample;
6369        struct perf_aux_event {
6370                struct perf_event_header        header;
6371                u32                             pid;
6372                u32                             tid;
6373        } rec;
6374        int ret;
6375
6376        if (event->parent)
6377                event = event->parent;
6378
6379        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6380            event->hw.itrace_started)
6381                return;
6382
6383        rec.header.type = PERF_RECORD_ITRACE_START;
6384        rec.header.misc = 0;
6385        rec.header.size = sizeof(rec);
6386        rec.pid = perf_event_pid(event, current);
6387        rec.tid = perf_event_tid(event, current);
6388
6389        perf_event_header__init_id(&rec.header, &sample, event);
6390        ret = perf_output_begin(&handle, event, rec.header.size);
6391
6392        if (ret)
6393                return;
6394
6395        perf_output_put(&handle, rec);
6396        perf_event__output_id_sample(event, &handle, &sample);
6397
6398        perf_output_end(&handle);
6399}
6400
6401/*
6402 * Generic event overflow handling, sampling.
6403 */
6404
6405static int __perf_event_overflow(struct perf_event *event,
6406                                   int throttle, struct perf_sample_data *data,
6407                                   struct pt_regs *regs)
6408{
6409        int events = atomic_read(&event->event_limit);
6410        struct hw_perf_event *hwc = &event->hw;
6411        u64 seq;
6412        int ret = 0;
6413
6414        /*
6415         * Non-sampling counters might still use the PMI to fold short
6416         * hardware counters, ignore those.
6417         */
6418        if (unlikely(!is_sampling_event(event)))
6419                return 0;
6420
6421        seq = __this_cpu_read(perf_throttled_seq);
6422        if (seq != hwc->interrupts_seq) {
6423                hwc->interrupts_seq = seq;
6424                hwc->interrupts = 1;
6425        } else {
6426                hwc->interrupts++;
6427                if (unlikely(throttle
6428                             && hwc->interrupts >= max_samples_per_tick)) {
6429                        __this_cpu_inc(perf_throttled_count);
6430                        hwc->interrupts = MAX_INTERRUPTS;
6431                        perf_log_throttle(event, 0);
6432                        tick_nohz_full_kick();
6433                        ret = 1;
6434                }
6435        }
6436
6437        if (event->attr.freq) {
6438                u64 now = perf_clock();
6439                s64 delta = now - hwc->freq_time_stamp;
6440
6441                hwc->freq_time_stamp = now;
6442
6443                if (delta > 0 && delta < 2*TICK_NSEC)
6444                        perf_adjust_period(event, delta, hwc->last_period, true);
6445        }
6446
6447        /*
6448         * XXX event_limit might not quite work as expected on inherited
6449         * events
6450         */
6451
6452        event->pending_kill = POLL_IN;
6453        if (events && atomic_dec_and_test(&event->event_limit)) {
6454                ret = 1;
6455                event->pending_kill = POLL_HUP;
6456                event->pending_disable = 1;
6457                irq_work_queue(&event->pending);
6458        }
6459
6460        if (event->overflow_handler)
6461                event->overflow_handler(event, data, regs);
6462        else
6463                perf_event_output(event, data, regs);
6464
6465        if (*perf_event_fasync(event) && event->pending_kill) {
6466                event->pending_wakeup = 1;
6467                irq_work_queue(&event->pending);
6468        }
6469
6470        return ret;
6471}
6472
6473int perf_event_overflow(struct perf_event *event,
6474                          struct perf_sample_data *data,
6475                          struct pt_regs *regs)
6476{
6477        return __perf_event_overflow(event, 1, data, regs);
6478}
6479
6480/*
6481 * Generic software event infrastructure
6482 */
6483
6484struct swevent_htable {
6485        struct swevent_hlist            *swevent_hlist;
6486        struct mutex                    hlist_mutex;
6487        int                             hlist_refcount;
6488
6489        /* Recursion avoidance in each contexts */
6490        int                             recursion[PERF_NR_CONTEXTS];
6491};
6492
6493static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6494
6495/*
6496 * We directly increment event->count and keep a second value in
6497 * event->hw.period_left to count intervals. This period event
6498 * is kept in the range [-sample_period, 0] so that we can use the
6499 * sign as trigger.
6500 */
6501
6502u64 perf_swevent_set_period(struct perf_event *event)
6503{
6504        struct hw_perf_event *hwc = &event->hw;
6505        u64 period = hwc->last_period;
6506        u64 nr, offset;
6507        s64 old, val;
6508
6509        hwc->last_period = hwc->sample_period;
6510
6511again:
6512        old = val = local64_read(&hwc->period_left);
6513        if (val < 0)
6514                return 0;
6515
6516        nr = div64_u64(period + val, period);
6517        offset = nr * period;
6518        val -= offset;
6519        if (local64_cmpxchg(&hwc->period_left, old, val) != old)
6520                goto again;
6521
6522        return nr;
6523}
6524
6525static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
6526                                    struct perf_sample_data *data,
6527                                    struct pt_regs *regs)
6528{
6529        struct hw_perf_event *hwc = &event->hw;
6530        int throttle = 0;
6531
6532        if (!overflow)
6533                overflow = perf_swevent_set_period(event);
6534
6535        if (hwc->interrupts == MAX_INTERRUPTS)
6536                return;
6537
6538        for (; overflow; overflow--) {
6539                if (__perf_event_overflow(event, throttle,
6540                                            data, regs)) {
6541                        /*
6542                         * We inhibit the overflow from happening when
6543                         * hwc->interrupts == MAX_INTERRUPTS.
6544                         */
6545                        break;
6546                }
6547                throttle = 1;
6548        }
6549}
6550
6551static void perf_swevent_event(struct perf_event *event, u64 nr,
6552                               struct perf_sample_data *data,
6553                               struct pt_regs *regs)
6554{
6555        struct hw_perf_event *hwc = &event->hw;
6556
6557        local64_add(nr, &event->count);
6558
6559        if (!regs)
6560                return;
6561
6562        if (!is_sampling_event(event))
6563                return;
6564
6565        if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6566                data->period = nr;
6567                return perf_swevent_overflow(event, 1, data, regs);
6568        } else
6569                data->period = event->hw.last_period;
6570
6571        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
6572                return perf_swevent_overflow(event, 1, data, regs);
6573
6574        if (local64_add_negative(nr, &hwc->period_left))
6575                return;
6576
6577        perf_swevent_overflow(event, 0, data, regs);
6578}
6579
6580static int perf_exclude_event(struct perf_event *event,
6581                              struct pt_regs *regs)
6582{
6583        if (event->hw.state & PERF_HES_STOPPED)
6584                return 1;
6585
6586        if (regs) {
6587                if (event->attr.exclude_user && user_mode(regs))
6588                        return 1;
6589
6590                if (event->attr.exclude_kernel && !user_mode(regs))
6591                        return 1;
6592        }
6593
6594        return 0;
6595}
6596
6597static int perf_swevent_match(struct perf_event *event,
6598                                enum perf_type_id type,
6599                                u32 event_id,
6600                                struct perf_sample_data *data,
6601                                struct pt_regs *regs)
6602{
6603        if (event->attr.type != type)
6604                return 0;
6605
6606        if (event->attr.config != event_id)
6607                return 0;
6608
6609        if (perf_exclude_event(event, regs))
6610                return 0;
6611
6612        return 1;
6613}
6614
6615static inline u64 swevent_hash(u64 type, u32 event_id)
6616{
6617        u64 val = event_id | (type << 32);
6618
6619        return hash_64(val, SWEVENT_HLIST_BITS);
6620}
6621
6622static inline struct hlist_head *
6623__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
6624{
6625        u64 hash = swevent_hash(type, event_id);
6626
6627        return &hlist->heads[hash];
6628}
6629
6630/* For the read side: events when they trigger */
6631static inline struct hlist_head *
6632find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6633{
6634        struct swevent_hlist *hlist;
6635
6636        hlist = rcu_dereference(swhash->swevent_hlist);
6637        if (!hlist)
6638                return NULL;
6639
6640        return __find_swevent_head(hlist, type, event_id);
6641}
6642
6643/* For the event head insertion and removal in the hlist */
6644static inline struct hlist_head *
6645find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6646{
6647        struct swevent_hlist *hlist;
6648        u32 event_id = event->attr.config;
6649        u64 type = event->attr.type;
6650
6651        /*
6652         * Event scheduling is always serialized against hlist allocation
6653         * and release. Which makes the protected version suitable here.
6654         * The context lock guarantees that.
6655         */
6656        hlist = rcu_dereference_protected(swhash->swevent_hlist,
6657                                          lockdep_is_held(&event->ctx->lock));
6658        if (!hlist)
6659                return NULL;
6660
6661        return __find_swevent_head(hlist, type, event_id);
6662}
6663
6664static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6665                                    u64 nr,
6666                                    struct perf_sample_data *data,
6667                                    struct pt_regs *regs)
6668{
6669        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6670        struct perf_event *event;
6671        struct hlist_head *head;
6672
6673        rcu_read_lock();
6674        head = find_swevent_head_rcu(swhash, type, event_id);
6675        if (!head)
6676                goto end;
6677
6678        hlist_for_each_entry_rcu(event, head, hlist_entry) {
6679                if (perf_swevent_match(event, type, event_id, data, regs))
6680                        perf_swevent_event(event, nr, data, regs);
6681        }
6682end:
6683        rcu_read_unlock();
6684}
6685
6686DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6687
6688int perf_swevent_get_recursion_context(void)
6689{
6690        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6691
6692        return get_recursion_context(swhash->recursion);
6693}
6694EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6695
6696inline void perf_swevent_put_recursion_context(int rctx)
6697{
6698        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6699
6700        put_recursion_context(swhash->recursion, rctx);
6701}
6702
6703void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6704{
6705        struct perf_sample_data data;
6706
6707        if (WARN_ON_ONCE(!regs))
6708                return;
6709
6710        perf_sample_data_init(&data, addr, 0);
6711        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6712}
6713
6714void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6715{
6716        int rctx;
6717
6718        preempt_disable_notrace();
6719        rctx = perf_swevent_get_recursion_context();
6720        if (unlikely(rctx < 0))
6721                goto fail;
6722
6723        ___perf_sw_event(event_id, nr, regs, addr);
6724
6725        perf_swevent_put_recursion_context(rctx);
6726fail:
6727        preempt_enable_notrace();
6728}
6729
6730static void perf_swevent_read(struct perf_event *event)
6731{
6732}
6733
6734static int perf_swevent_add(struct perf_event *event, int flags)
6735{
6736        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6737        struct hw_perf_event *hwc = &event->hw;
6738        struct hlist_head *head;
6739
6740        if (is_sampling_event(event)) {
6741                hwc->last_period = hwc->sample_period;
6742                perf_swevent_set_period(event);
6743        }
6744
6745        hwc->state = !(flags & PERF_EF_START);
6746
6747        head = find_swevent_head(swhash, event);
6748        if (WARN_ON_ONCE(!head))
6749                return -EINVAL;
6750
6751        hlist_add_head_rcu(&event->hlist_entry, head);
6752        perf_event_update_userpage(event);
6753
6754        return 0;
6755}
6756
6757static void perf_swevent_del(struct perf_event *event, int flags)
6758{
6759        hlist_del_rcu(&event->hlist_entry);
6760}
6761
6762static void perf_swevent_start(struct perf_event *event, int flags)
6763{
6764        event->hw.state = 0;
6765}
6766
6767static void perf_swevent_stop(struct perf_event *event, int flags)
6768{
6769        event->hw.state = PERF_HES_STOPPED;
6770}
6771
6772/* Deref the hlist from the update side */
6773static inline struct swevent_hlist *
6774swevent_hlist_deref(struct swevent_htable *swhash)
6775{
6776        return rcu_dereference_protected(swhash->swevent_hlist,
6777                                         lockdep_is_held(&swhash->hlist_mutex));
6778}
6779
6780static void swevent_hlist_release(struct swevent_htable *swhash)
6781{
6782        struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6783
6784        if (!hlist)
6785                return;
6786
6787        RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6788        kfree_rcu(hlist, rcu_head);
6789}
6790
6791static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6792{
6793        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6794
6795        mutex_lock(&swhash->hlist_mutex);
6796
6797        if (!--swhash->hlist_refcount)
6798                swevent_hlist_release(swhash);
6799
6800        mutex_unlock(&swhash->hlist_mutex);
6801}
6802
6803static void swevent_hlist_put(struct perf_event *event)
6804{
6805        int cpu;
6806
6807        for_each_possible_cpu(cpu)
6808                swevent_hlist_put_cpu(event, cpu);
6809}
6810
6811static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6812{
6813        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6814        int err = 0;
6815
6816        mutex_lock(&swhash->hlist_mutex);
6817        if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6818                struct swevent_hlist *hlist;
6819
6820                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6821                if (!hlist) {
6822                        err = -ENOMEM;
6823                        goto exit;
6824                }
6825                rcu_assign_pointer(swhash->swevent_hlist, hlist);
6826        }
6827        swhash->hlist_refcount++;
6828exit:
6829        mutex_unlock(&swhash->hlist_mutex);
6830
6831        return err;
6832}
6833
6834static int swevent_hlist_get(struct perf_event *event)
6835{
6836        int err;
6837        int cpu, failed_cpu;
6838
6839        get_online_cpus();
6840        for_each_possible_cpu(cpu) {
6841                err = swevent_hlist_get_cpu(event, cpu);
6842                if (err) {
6843                        failed_cpu = cpu;
6844                        goto fail;
6845                }
6846        }
6847        put_online_cpus();
6848
6849        return 0;
6850fail:
6851        for_each_possible_cpu(cpu) {
6852                if (cpu == failed_cpu)
6853                        break;
6854                swevent_hlist_put_cpu(event, cpu);
6855        }
6856
6857        put_online_cpus();
6858        return err;
6859}
6860
6861struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6862
6863static void sw_perf_event_destroy(struct perf_event *event)
6864{
6865        u64 event_id = event->attr.config;
6866
6867        WARN_ON(event->parent);
6868
6869        static_key_slow_dec(&perf_swevent_enabled[event_id]);
6870        swevent_hlist_put(event);
6871}
6872
6873static int perf_swevent_init(struct perf_event *event)
6874{
6875        u64 event_id = event->attr.config;
6876
6877        if (event->attr.type != PERF_TYPE_SOFTWARE)
6878                return -ENOENT;
6879
6880        /*
6881         * no branch sampling for software events
6882         */
6883        if (has_branch_stack(event))
6884                return -EOPNOTSUPP;
6885
6886        switch (event_id) {
6887        case PERF_COUNT_SW_CPU_CLOCK:
6888        case PERF_COUNT_SW_TASK_CLOCK:
6889                return -ENOENT;
6890
6891        default:
6892                break;
6893        }
6894
6895        if (event_id >= PERF_COUNT_SW_MAX)
6896                return -ENOENT;
6897
6898        if (!event->parent) {
6899                int err;
6900
6901                err = swevent_hlist_get(event);
6902                if (err)
6903                        return err;
6904
6905                static_key_slow_inc(&perf_swevent_enabled[event_id]);
6906                event->destroy = sw_perf_event_destroy;
6907        }
6908
6909        return 0;
6910}
6911
6912static struct pmu perf_swevent = {
6913        .task_ctx_nr    = perf_sw_context,
6914
6915        .capabilities   = PERF_PMU_CAP_NO_NMI,
6916
6917        .event_init     = perf_swevent_init,
6918        .add            = perf_swevent_add,
6919        .del            = perf_swevent_del,
6920        .start          = perf_swevent_start,
6921        .stop           = perf_swevent_stop,
6922        .read           = perf_swevent_read,
6923};
6924
6925#ifdef CONFIG_EVENT_TRACING
6926
6927static int perf_tp_filter_match(struct perf_event *event,
6928                                struct perf_sample_data *data)
6929{
6930        void *record = data->raw->data;
6931
6932        /* only top level events have filters set */
6933        if (event->parent)
6934                event = event->parent;
6935
6936        if (likely(!event->filter) || filter_match_preds(event->filter, record))
6937                return 1;
6938        return 0;
6939}
6940
6941static int perf_tp_event_match(struct perf_event *event,
6942                                struct perf_sample_data *data,
6943                                struct pt_regs *regs)
6944{
6945        if (event->hw.state & PERF_HES_STOPPED)
6946                return 0;
6947        /*
6948         * All tracepoints are from kernel-space.
6949         */
6950        if (event->attr.exclude_kernel)
6951                return 0;
6952
6953        if (!perf_tp_filter_match(event, data))
6954                return 0;
6955
6956        return 1;
6957}
6958
6959void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6960                   struct pt_regs *regs, struct hlist_head *head, int rctx,
6961                   struct task_struct *task)
6962{
6963        struct perf_sample_data data;
6964        struct perf_event *event;
6965
6966        struct perf_raw_record raw = {
6967                .size = entry_size,
6968                .data = record,
6969        };
6970
6971        perf_sample_data_init(&data, addr, 0);
6972        data.raw = &raw;
6973
6974        hlist_for_each_entry_rcu(event, head, hlist_entry) {
6975                if (perf_tp_event_match(event, &data, regs))
6976                        perf_swevent_event(event, count, &data, regs);
6977        }
6978
6979        /*
6980         * If we got specified a target task, also iterate its context and
6981         * deliver this event there too.
6982         */
6983        if (task && task != current) {
6984                struct perf_event_context *ctx;
6985                struct trace_entry *entry = record;
6986
6987                rcu_read_lock();
6988                ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6989                if (!ctx)
6990                        goto unlock;
6991
6992                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6993                        if (event->attr.type != PERF_TYPE_TRACEPOINT)
6994                                continue;
6995                        if (event->attr.config != entry->type)
6996                                continue;
6997                        if (perf_tp_event_match(event, &data, regs))
6998                                perf_swevent_event(event, count, &data, regs);
6999                }
7000unlock:
7001                rcu_read_unlock();
7002        }
7003
7004        perf_swevent_put_recursion_context(rctx);
7005}
7006EXPORT_SYMBOL_GPL(perf_tp_event);
7007
7008static void tp_perf_event_destroy(struct perf_event *event)
7009{
7010        perf_trace_destroy(event);
7011}
7012
7013static int perf_tp_event_init(struct perf_event *event)
7014{
7015        int err;
7016
7017        if (event->attr.type != PERF_TYPE_TRACEPOINT)
7018                return -ENOENT;
7019
7020        /*
7021         * no branch sampling for tracepoint events
7022         */
7023        if (has_branch_stack(event))
7024                return -EOPNOTSUPP;
7025
7026        err = perf_trace_init(event);
7027        if (err)
7028                return err;
7029
7030        event->destroy = tp_perf_event_destroy;
7031
7032        return 0;
7033}
7034
7035static struct pmu perf_tracepoint = {
7036        .task_ctx_nr    = perf_sw_context,
7037
7038        .event_init     = perf_tp_event_init,
7039        .add            = perf_trace_add,
7040        .del            = perf_trace_del,
7041        .start          = perf_swevent_start,
7042        .stop           = perf_swevent_stop,
7043        .read           = perf_swevent_read,
7044};
7045
7046static inline void perf_tp_register(void)
7047{
7048        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
7049}
7050
7051static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7052{
7053        char *filter_str;
7054        int ret;
7055
7056        if (event->attr.type != PERF_TYPE_TRACEPOINT)
7057                return -EINVAL;
7058
7059        filter_str = strndup_user(arg, PAGE_SIZE);
7060        if (IS_ERR(filter_str))
7061                return PTR_ERR(filter_str);
7062
7063        ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
7064
7065        kfree(filter_str);
7066        return ret;
7067}
7068
7069static void perf_event_free_filter(struct perf_event *event)
7070{
7071        ftrace_profile_free_filter(event);
7072}
7073
7074static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7075{
7076        struct bpf_prog *prog;
7077
7078        if (event->attr.type != PERF_TYPE_TRACEPOINT)
7079                return -EINVAL;
7080
7081        if (event->tp_event->prog)
7082                return -EEXIST;
7083
7084        if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
7085                /* bpf programs can only be attached to u/kprobes */
7086                return -EINVAL;
7087
7088        prog = bpf_prog_get(prog_fd);
7089        if (IS_ERR(prog))
7090                return PTR_ERR(prog);
7091
7092        if (prog->type != BPF_PROG_TYPE_KPROBE) {
7093                /* valid fd, but invalid bpf program type */
7094                bpf_prog_put(prog);
7095                return -EINVAL;
7096        }
7097
7098        event->tp_event->prog = prog;
7099
7100        return 0;
7101}
7102
7103static void perf_event_free_bpf_prog(struct perf_event *event)
7104{
7105        struct bpf_prog *prog;
7106
7107        if (!event->tp_event)
7108                return;
7109
7110        prog = event->tp_event->prog;
7111        if (prog) {
7112                event->tp_event->prog = NULL;
7113                bpf_prog_put(prog);
7114        }
7115}
7116
7117#else
7118
7119static inline void perf_tp_register(void)
7120{
7121}
7122
7123static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7124{
7125        return -ENOENT;
7126}
7127
7128static void perf_event_free_filter(struct perf_event *event)
7129{
7130}
7131
7132static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7133{
7134        return -ENOENT;
7135}
7136
7137static void perf_event_free_bpf_prog(struct perf_event *event)
7138{
7139}
7140#endif /* CONFIG_EVENT_TRACING */
7141
7142#ifdef CONFIG_HAVE_HW_BREAKPOINT
7143void perf_bp_event(struct perf_event *bp, void *data)
7144{
7145        struct perf_sample_data sample;
7146        struct pt_regs *regs = data;
7147
7148        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
7149
7150        if (!bp->hw.state && !perf_exclude_event(bp, regs))
7151                perf_swevent_event(bp, 1, &sample, regs);
7152}
7153#endif
7154
7155/*
7156 * hrtimer based swevent callback
7157 */
7158
7159static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
7160{
7161        enum hrtimer_restart ret = HRTIMER_RESTART;
7162        struct perf_sample_data data;
7163        struct pt_regs *regs;
7164        struct perf_event *event;
7165        u64 period;
7166
7167        event = container_of(hrtimer, struct perf_event, hw.hrtimer);
7168
7169        if (event->state != PERF_EVENT_STATE_ACTIVE)
7170                return HRTIMER_NORESTART;
7171
7172        event->pmu->read(event);
7173
7174        perf_sample_data_init(&data, 0, event->hw.last_period);
7175        regs = get_irq_regs();
7176
7177        if (regs && !perf_exclude_event(event, regs)) {
7178                if (!(event->attr.exclude_idle && is_idle_task(current)))
7179                        if (__perf_event_overflow(event, 1, &data, regs))
7180                                ret = HRTIMER_NORESTART;
7181        }
7182
7183        period = max_t(u64, 10000, event->hw.sample_period);
7184        hrtimer_forward_now(hrtimer, ns_to_ktime(period));
7185
7186        return ret;
7187}
7188
7189static void perf_swevent_start_hrtimer(struct perf_event *event)
7190{
7191        struct hw_perf_event *hwc = &event->hw;
7192        s64 period;
7193
7194        if (!is_sampling_event(event))
7195                return;
7196
7197        period = local64_read(&hwc->period_left);
7198        if (period) {
7199                if (period < 0)
7200                        period = 10000;
7201
7202                local64_set(&hwc->period_left, 0);
7203        } else {
7204                period = max_t(u64, 10000, hwc->sample_period);
7205        }
7206        hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
7207                      HRTIMER_MODE_REL_PINNED);
7208}
7209
7210static void perf_swevent_cancel_hrtimer(struct perf_event *event)
7211{
7212        struct hw_perf_event *hwc = &event->hw;
7213
7214        if (is_sampling_event(event)) {
7215                ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
7216                local64_set(&hwc->period_left, ktime_to_ns(remaining));
7217
7218                hrtimer_cancel(&hwc->hrtimer);
7219        }
7220}
7221
7222static void perf_swevent_init_hrtimer(struct perf_event *event)
7223{
7224        struct hw_perf_event *hwc = &event->hw;
7225
7226        if (!is_sampling_event(event))
7227                return;
7228
7229        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
7230        hwc->hrtimer.function = perf_swevent_hrtimer;
7231
7232        /*
7233         * Since hrtimers have a fixed rate, we can do a static freq->period
7234         * mapping and avoid the whole period adjust feedback stuff.
7235         */
7236        if (event->attr.freq) {
7237                long freq = event->attr.sample_freq;
7238
7239                event->attr.sample_period = NSEC_PER_SEC / freq;
7240                hwc->sample_period = event->attr.sample_period;
7241                local64_set(&hwc->period_left, hwc->sample_period);
7242                hwc->last_period = hwc->sample_period;
7243                event->attr.freq = 0;
7244        }
7245}
7246
7247/*
7248 * Software event: cpu wall time clock
7249 */
7250
7251static void cpu_clock_event_update(struct perf_event *event)
7252{
7253        s64 prev;
7254        u64 now;
7255
7256        now = local_clock();
7257        prev = local64_xchg(&event->hw.prev_count, now);
7258        local64_add(now - prev, &event->count);
7259}
7260
7261static void cpu_clock_event_start(struct perf_event *event, int flags)
7262{
7263        local64_set(&event->hw.prev_count, local_clock());
7264        perf_swevent_start_hrtimer(event);
7265}
7266
7267static void cpu_clock_event_stop(struct perf_event *event, int flags)
7268{
7269        perf_swevent_cancel_hrtimer(event);
7270        cpu_clock_event_update(event);
7271}
7272
7273static int cpu_clock_event_add(struct perf_event *event, int flags)
7274{
7275        if (flags & PERF_EF_START)
7276                cpu_clock_event_start(event, flags);
7277        perf_event_update_userpage(event);
7278
7279        return 0;
7280}
7281
7282static void cpu_clock_event_del(struct perf_event *event, int flags)
7283{
7284        cpu_clock_event_stop(event, flags);
7285}
7286
7287static void cpu_clock_event_read(struct perf_event *event)
7288{
7289        cpu_clock_event_update(event);
7290}
7291
7292static int cpu_clock_event_init(struct perf_event *event)
7293{
7294        if (event->attr.type != PERF_TYPE_SOFTWARE)
7295                return -ENOENT;
7296
7297        if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
7298                return -ENOENT;
7299
7300        /*
7301         * no branch sampling for software events
7302         */
7303        if (has_branch_stack(event))
7304                return -EOPNOTSUPP;
7305
7306        perf_swevent_init_hrtimer(event);
7307
7308        return 0;
7309}
7310
7311static struct pmu perf_cpu_clock = {
7312        .task_ctx_nr    = perf_sw_context,
7313
7314        .capabilities   = PERF_PMU_CAP_NO_NMI,
7315
7316        .event_init     = cpu_clock_event_init,
7317        .add            = cpu_clock_event_add,
7318        .del            = cpu_clock_event_del,
7319        .start          = cpu_clock_event_start,
7320        .stop           = cpu_clock_event_stop,
7321        .read           = cpu_clock_event_read,
7322};
7323
7324/*
7325 * Software event: task time clock
7326 */
7327
7328static void task_clock_event_update(struct perf_event *event, u64 now)
7329{
7330        u64 prev;
7331        s64 delta;
7332
7333        prev = local64_xchg(&event->hw.prev_count, now);
7334        delta = now - prev;
7335        local64_add(delta, &event->count);
7336}
7337
7338static void task_clock_event_start(struct perf_event *event, int flags)
7339{
7340        local64_set(&event->hw.prev_count, event->ctx->time);
7341        perf_swevent_start_hrtimer(event);
7342}
7343
7344static void task_clock_event_stop(struct perf_event *event, int flags)
7345{
7346        perf_swevent_cancel_hrtimer(event);
7347        task_clock_event_update(event, event->ctx->time);
7348}
7349
7350static int task_clock_event_add(struct perf_event *event, int flags)
7351{
7352        if (flags & PERF_EF_START)
7353                task_clock_event_start(event, flags);
7354        perf_event_update_userpage(event);
7355
7356        return 0;
7357}
7358
7359static void task_clock_event_del(struct perf_event *event, int flags)
7360{
7361        task_clock_event_stop(event, PERF_EF_UPDATE);
7362}
7363
7364static void task_clock_event_read(struct perf_event *event)
7365{
7366        u64 now = perf_clock();
7367        u64 delta = now - event->ctx->timestamp;
7368        u64 time = event->ctx->time + delta;
7369
7370        task_clock_event_update(event, time);
7371}
7372
7373static int task_clock_event_init(struct perf_event *event)
7374{
7375        if (event->attr.type != PERF_TYPE_SOFTWARE)
7376                return -ENOENT;
7377
7378        if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
7379                return -ENOENT;
7380
7381        /*
7382         * no branch sampling for software events
7383         */
7384        if (has_branch_stack(event))
7385                return -EOPNOTSUPP;
7386
7387        perf_swevent_init_hrtimer(event);
7388
7389        return 0;
7390}
7391
7392static struct pmu perf_task_clock = {
7393        .task_ctx_nr    = perf_sw_context,
7394
7395        .capabilities   = PERF_PMU_CAP_NO_NMI,
7396
7397        .event_init     = task_clock_event_init,
7398        .add            = task_clock_event_add,
7399        .del            = task_clock_event_del,
7400        .start          = task_clock_event_start,
7401        .stop           = task_clock_event_stop,
7402        .read           = task_clock_event_read,
7403};
7404
7405static void perf_pmu_nop_void(struct pmu *pmu)
7406{
7407}
7408
7409static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
7410{
7411}
7412
7413static int perf_pmu_nop_int(struct pmu *pmu)
7414{
7415        return 0;
7416}
7417
7418static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
7419
7420static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
7421{
7422        __this_cpu_write(nop_txn_flags, flags);
7423
7424        if (flags & ~PERF_PMU_TXN_ADD)
7425                return;
7426
7427        perf_pmu_disable(pmu);
7428}
7429
7430static int perf_pmu_commit_txn(struct pmu *pmu)
7431{
7432        unsigned int flags = __this_cpu_read(nop_txn_flags);
7433
7434        __this_cpu_write(nop_txn_flags, 0);
7435
7436        if (flags & ~PERF_PMU_TXN_ADD)
7437                return 0;
7438
7439        perf_pmu_enable(pmu);
7440        return 0;
7441}
7442
7443static void perf_pmu_cancel_txn(struct pmu *pmu)
7444{
7445        unsigned int flags =  __this_cpu_read(nop_txn_flags);
7446
7447        __this_cpu_write(nop_txn_flags, 0);
7448
7449        if (flags & ~PERF_PMU_TXN_ADD)
7450                return;
7451
7452        perf_pmu_enable(pmu);
7453}
7454
7455static int perf_event_idx_default(struct perf_event *event)
7456{
7457        return 0;
7458}
7459
7460/*
7461 * Ensures all contexts with the same task_ctx_nr have the same
7462 * pmu_cpu_context too.
7463 */
7464static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
7465{
7466        struct pmu *pmu;
7467
7468        if (ctxn < 0)
7469                return NULL;
7470
7471        list_for_each_entry(pmu, &pmus, entry) {
7472                if (pmu->task_ctx_nr == ctxn)
7473                        return pmu->pmu_cpu_context;
7474        }
7475
7476        return NULL;
7477}
7478
7479static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
7480{
7481        int cpu;
7482
7483        for_each_possible_cpu(cpu) {
7484                struct perf_cpu_context *cpuctx;
7485
7486                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7487
7488                if (cpuctx->unique_pmu == old_pmu)
7489                        cpuctx->unique_pmu = pmu;
7490        }
7491}
7492
7493static void free_pmu_context(struct pmu *pmu)
7494{
7495        struct pmu *i;
7496
7497        mutex_lock(&pmus_lock);
7498        /*
7499         * Like a real lame refcount.
7500         */
7501        list_for_each_entry(i, &pmus, entry) {
7502                if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7503                        update_pmu_context(i, pmu);
7504                        goto out;
7505                }
7506        }
7507
7508        free_percpu(pmu->pmu_cpu_context);
7509out:
7510        mutex_unlock(&pmus_lock);
7511}
7512static struct idr pmu_idr;
7513
7514static ssize_t
7515type_show(struct device *dev, struct device_attribute *attr, char *page)
7516{
7517        struct pmu *pmu = dev_get_drvdata(dev);
7518
7519        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7520}
7521static DEVICE_ATTR_RO(type);
7522
7523static ssize_t
7524perf_event_mux_interval_ms_show(struct device *dev,
7525                                struct device_attribute *attr,
7526                                char *page)
7527{
7528        struct pmu *pmu = dev_get_drvdata(dev);
7529
7530        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7531}
7532
7533static DEFINE_MUTEX(mux_interval_mutex);
7534
7535static ssize_t
7536perf_event_mux_interval_ms_store(struct device *dev,
7537                                 struct device_attribute *attr,
7538                                 const char *buf, size_t count)
7539{
7540        struct pmu *pmu = dev_get_drvdata(dev);
7541        int timer, cpu, ret;
7542
7543        ret = kstrtoint(buf, 0, &timer);
7544        if (ret)
7545                return ret;
7546
7547        if (timer < 1)
7548                return -EINVAL;
7549
7550        /* same value, noting to do */
7551        if (timer == pmu->hrtimer_interval_ms)
7552                return count;
7553
7554        mutex_lock(&mux_interval_mutex);
7555        pmu->hrtimer_interval_ms = timer;
7556
7557        /* update all cpuctx for this PMU */
7558        get_online_cpus();
7559        for_each_online_cpu(cpu) {
7560                struct perf_cpu_context *cpuctx;
7561                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7562                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7563
7564                cpu_function_call(cpu,
7565                        (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
7566        }
7567        put_online_cpus();
7568        mutex_unlock(&mux_interval_mutex);
7569
7570        return count;
7571}
7572static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
7573
7574static struct attribute *pmu_dev_attrs[] = {
7575        &dev_attr_type.attr,
7576        &dev_attr_perf_event_mux_interval_ms.attr,
7577        NULL,
7578};
7579ATTRIBUTE_GROUPS(pmu_dev);
7580
7581static int pmu_bus_running;
7582static struct bus_type pmu_bus = {
7583        .name           = "event_source",
7584        .dev_groups     = pmu_dev_groups,
7585};
7586
7587static void pmu_dev_release(struct device *dev)
7588{
7589        kfree(dev);
7590}
7591
7592static int pmu_dev_alloc(struct pmu *pmu)
7593{
7594        int ret = -ENOMEM;
7595
7596        pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7597        if (!pmu->dev)
7598                goto out;
7599
7600        pmu->dev->groups = pmu->attr_groups;
7601        device_initialize(pmu->dev);
7602        ret = dev_set_name(pmu->dev, "%s", pmu->name);
7603        if (ret)
7604                goto free_dev;
7605
7606        dev_set_drvdata(pmu->dev, pmu);
7607        pmu->dev->bus = &pmu_bus;
7608        pmu->dev->release = pmu_dev_release;
7609        ret = device_add(pmu->dev);
7610        if (ret)
7611                goto free_dev;
7612
7613out:
7614        return ret;
7615
7616free_dev:
7617        put_device(pmu->dev);
7618        goto out;
7619}
7620
7621static struct lock_class_key cpuctx_mutex;
7622static struct lock_class_key cpuctx_lock;
7623
7624int perf_pmu_register(struct pmu *pmu, const char *name, int type)
7625{
7626        int cpu, ret;
7627
7628        mutex_lock(&pmus_lock);
7629        ret = -ENOMEM;
7630        pmu->pmu_disable_count = alloc_percpu(int);
7631        if (!pmu->pmu_disable_count)
7632                goto unlock;
7633
7634        pmu->type = -1;
7635        if (!name)
7636                goto skip_type;
7637        pmu->name = name;
7638
7639        if (type < 0) {
7640                type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7641                if (type < 0) {
7642                        ret = type;
7643                        goto free_pdc;
7644                }
7645        }
7646        pmu->type = type;
7647
7648        if (pmu_bus_running) {
7649                ret = pmu_dev_alloc(pmu);
7650                if (ret)
7651                        goto free_idr;
7652        }
7653
7654skip_type:
7655        pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7656        if (pmu->pmu_cpu_context)
7657                goto got_cpu_context;
7658
7659        ret = -ENOMEM;
7660        pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7661        if (!pmu->pmu_cpu_context)
7662                goto free_dev;
7663
7664        for_each_possible_cpu(cpu) {
7665                struct perf_cpu_context *cpuctx;
7666
7667                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7668                __perf_event_init_context(&cpuctx->ctx);
7669                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
7670                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
7671                cpuctx->ctx.pmu = pmu;
7672
7673                __perf_mux_hrtimer_init(cpuctx, cpu);
7674
7675                cpuctx->unique_pmu = pmu;
7676        }
7677
7678got_cpu_context:
7679        if (!pmu->start_txn) {
7680                if (pmu->pmu_enable) {
7681                        /*
7682                         * If we have pmu_enable/pmu_disable calls, install
7683                         * transaction stubs that use that to try and batch
7684                         * hardware accesses.
7685                         */
7686                        pmu->start_txn  = perf_pmu_start_txn;
7687                        pmu->commit_txn = perf_pmu_commit_txn;
7688                        pmu->cancel_txn = perf_pmu_cancel_txn;
7689                } else {
7690                        pmu->start_txn  = perf_pmu_nop_txn;
7691                        pmu->commit_txn = perf_pmu_nop_int;
7692                        pmu->cancel_txn = perf_pmu_nop_void;
7693                }
7694        }
7695
7696        if (!pmu->pmu_enable) {
7697                pmu->pmu_enable  = perf_pmu_nop_void;
7698                pmu->pmu_disable = perf_pmu_nop_void;
7699        }
7700
7701        if (!pmu->event_idx)
7702                pmu->event_idx = perf_event_idx_default;
7703
7704        list_add_rcu(&pmu->entry, &pmus);
7705        atomic_set(&pmu->exclusive_cnt, 0);
7706        ret = 0;
7707unlock:
7708        mutex_unlock(&pmus_lock);
7709
7710        return ret;
7711
7712free_dev:
7713        device_del(pmu->dev);
7714        put_device(pmu->dev);
7715
7716free_idr:
7717        if (pmu->type >= PERF_TYPE_MAX)
7718                idr_remove(&pmu_idr, pmu->type);
7719
7720free_pdc:
7721        free_percpu(pmu->pmu_disable_count);
7722        goto unlock;
7723}
7724EXPORT_SYMBOL_GPL(perf_pmu_register);
7725
7726void perf_pmu_unregister(struct pmu *pmu)
7727{
7728        mutex_lock(&pmus_lock);
7729        list_del_rcu(&pmu->entry);
7730        mutex_unlock(&pmus_lock);
7731
7732        /*
7733         * We dereference the pmu list under both SRCU and regular RCU, so
7734         * synchronize against both of those.
7735         */
7736        synchronize_srcu(&pmus_srcu);
7737        synchronize_rcu();
7738
7739        free_percpu(pmu->pmu_disable_count);
7740        if (pmu->type >= PERF_TYPE_MAX)
7741                idr_remove(&pmu_idr, pmu->type);
7742        device_del(pmu->dev);
7743        put_device(pmu->dev);
7744        free_pmu_context(pmu);
7745}
7746EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7747
7748static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7749{
7750        struct perf_event_context *ctx = NULL;
7751        int ret;
7752
7753        if (!try_module_get(pmu->module))
7754                return -ENODEV;
7755
7756        if (event->group_leader != event) {
7757                /*
7758                 * This ctx->mutex can nest when we're called through
7759                 * inheritance. See the perf_event_ctx_lock_nested() comment.
7760                 */
7761                ctx = perf_event_ctx_lock_nested(event->group_leader,
7762                                                 SINGLE_DEPTH_NESTING);
7763                BUG_ON(!ctx);
7764        }
7765
7766        event->pmu = pmu;
7767        ret = pmu->event_init(event);
7768
7769        if (ctx)
7770                perf_event_ctx_unlock(event->group_leader, ctx);
7771
7772        if (ret)
7773                module_put(pmu->module);
7774
7775        return ret;
7776}
7777
7778static struct pmu *perf_init_event(struct perf_event *event)
7779{
7780        struct pmu *pmu = NULL;
7781        int idx;
7782        int ret;
7783
7784        idx = srcu_read_lock(&pmus_srcu);
7785
7786        rcu_read_lock();
7787        pmu = idr_find(&pmu_idr, event->attr.type);
7788        rcu_read_unlock();
7789        if (pmu) {
7790                ret = perf_try_init_event(pmu, event);
7791                if (ret)
7792                        pmu = ERR_PTR(ret);
7793                goto unlock;
7794        }
7795
7796        list_for_each_entry_rcu(pmu, &pmus, entry) {
7797                ret = perf_try_init_event(pmu, event);
7798                if (!ret)
7799                        goto unlock;
7800
7801                if (ret != -ENOENT) {
7802                        pmu = ERR_PTR(ret);
7803                        goto unlock;
7804                }
7805        }
7806        pmu = ERR_PTR(-ENOENT);
7807unlock:
7808        srcu_read_unlock(&pmus_srcu, idx);
7809
7810        return pmu;
7811}
7812
7813static void account_event_cpu(struct perf_event *event, int cpu)
7814{
7815        if (event->parent)
7816                return;
7817
7818        if (is_cgroup_event(event))
7819                atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7820}
7821
7822static void account_event(struct perf_event *event)
7823{
7824        if (event->parent)
7825                return;
7826
7827        if (event->attach_state & PERF_ATTACH_TASK)
7828                static_key_slow_inc(&perf_sched_events.key);
7829        if (event->attr.mmap || event->attr.mmap_data)
7830                atomic_inc(&nr_mmap_events);
7831        if (event->attr.comm)
7832                atomic_inc(&nr_comm_events);
7833        if (event->attr.task)
7834                atomic_inc(&nr_task_events);
7835        if (event->attr.freq) {
7836                if (atomic_inc_return(&nr_freq_events) == 1)
7837                        tick_nohz_full_kick_all();
7838        }
7839        if (event->attr.context_switch) {
7840                atomic_inc(&nr_switch_events);
7841                static_key_slow_inc(&perf_sched_events.key);
7842        }
7843        if (has_branch_stack(event))
7844                static_key_slow_inc(&perf_sched_events.key);
7845        if (is_cgroup_event(event))
7846                static_key_slow_inc(&perf_sched_events.key);
7847
7848        account_event_cpu(event, event->cpu);
7849}
7850
7851/*
7852 * Allocate and initialize a event structure
7853 */
7854static struct perf_event *
7855perf_event_alloc(struct perf_event_attr *attr, int cpu,
7856                 struct task_struct *task,
7857                 struct perf_event *group_leader,
7858                 struct perf_event *parent_event,
7859                 perf_overflow_handler_t overflow_handler,
7860                 void *context, int cgroup_fd)
7861{
7862        struct pmu *pmu;
7863        struct perf_event *event;
7864        struct hw_perf_event *hwc;
7865        long err = -EINVAL;
7866
7867        if ((unsigned)cpu >= nr_cpu_ids) {
7868                if (!task || cpu != -1)
7869                        return ERR_PTR(-EINVAL);
7870        }
7871
7872        event = kzalloc(sizeof(*event), GFP_KERNEL);
7873        if (!event)
7874                return ERR_PTR(-ENOMEM);
7875
7876        /*
7877         * Single events are their own group leaders, with an
7878         * empty sibling list:
7879         */
7880        if (!group_leader)
7881                group_leader = event;
7882
7883        mutex_init(&event->child_mutex);
7884        INIT_LIST_HEAD(&event->child_list);
7885
7886        INIT_LIST_HEAD(&event->group_entry);
7887        INIT_LIST_HEAD(&event->event_entry);
7888        INIT_LIST_HEAD(&event->sibling_list);
7889        INIT_LIST_HEAD(&event->rb_entry);
7890        INIT_LIST_HEAD(&event->active_entry);
7891        INIT_HLIST_NODE(&event->hlist_entry);
7892
7893
7894        init_waitqueue_head(&event->waitq);
7895        init_irq_work(&event->pending, perf_pending_event);
7896
7897        mutex_init(&event->mmap_mutex);
7898
7899        atomic_long_set(&event->refcount, 1);
7900        event->cpu              = cpu;
7901        event->attr             = *attr;
7902        event->group_leader     = group_leader;
7903        event->pmu              = NULL;
7904        event->oncpu            = -1;
7905
7906        event->parent           = parent_event;
7907
7908        event->ns               = get_pid_ns(task_active_pid_ns(current));
7909        event->id               = atomic64_inc_return(&perf_event_id);
7910
7911        event->state            = PERF_EVENT_STATE_INACTIVE;
7912
7913        if (task) {
7914                event->attach_state = PERF_ATTACH_TASK;
7915                /*
7916                 * XXX pmu::event_init needs to know what task to account to
7917                 * and we cannot use the ctx information because we need the
7918                 * pmu before we get a ctx.
7919                 */
7920                event->hw.target = task;
7921        }
7922
7923        event->clock = &local_clock;
7924        if (parent_event)
7925                event->clock = parent_event->clock;
7926
7927        if (!overflow_handler && parent_event) {
7928                overflow_handler = parent_event->overflow_handler;
7929                context = parent_event->overflow_handler_context;
7930        }
7931
7932        event->overflow_handler = overflow_handler;
7933        event->overflow_handler_context = context;
7934
7935        perf_event__state_init(event);
7936
7937        pmu = NULL;
7938
7939        hwc = &event->hw;
7940        hwc->sample_period = attr->sample_period;
7941        if (attr->freq && attr->sample_freq)
7942                hwc->sample_period = 1;
7943        hwc->last_period = hwc->sample_period;
7944
7945        local64_set(&hwc->period_left, hwc->sample_period);
7946
7947        /*
7948         * we currently do not support PERF_FORMAT_GROUP on inherited events
7949         */
7950        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7951                goto err_ns;
7952
7953        if (!has_branch_stack(event))
7954                event->attr.branch_sample_type = 0;
7955
7956        if (cgroup_fd != -1) {
7957                err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7958                if (err)
7959                        goto err_ns;
7960        }
7961
7962        pmu = perf_init_event(event);
7963        if (!pmu)
7964                goto err_ns;
7965        else if (IS_ERR(pmu)) {
7966                err = PTR_ERR(pmu);
7967                goto err_ns;
7968        }
7969
7970        err = exclusive_event_init(event);
7971        if (err)
7972                goto err_pmu;
7973
7974        if (!event->parent) {
7975                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7976                        err = get_callchain_buffers();
7977                        if (err)
7978                                goto err_per_task;
7979                }
7980        }
7981
7982        return event;
7983
7984err_per_task:
7985        exclusive_event_destroy(event);
7986
7987err_pmu:
7988        if (event->destroy)
7989                event->destroy(event);
7990        module_put(pmu->module);
7991err_ns:
7992        if (is_cgroup_event(event))
7993                perf_detach_cgroup(event);
7994        if (event->ns)
7995                put_pid_ns(event->ns);
7996        kfree(event);
7997
7998        return ERR_PTR(err);
7999}
8000
8001static int perf_copy_attr(struct perf_event_attr __user *uattr,
8002                          struct perf_event_attr *attr)
8003{
8004        u32 size;
8005        int ret;
8006
8007        if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
8008                return -EFAULT;
8009
8010        /*
8011         * zero the full structure, so that a short copy will be nice.
8012         */
8013        memset(attr, 0, sizeof(*attr));
8014
8015        ret = get_user(size, &uattr->size);
8016        if (ret)
8017                return ret;
8018
8019        if (size > PAGE_SIZE)   /* silly large */
8020                goto err_size;
8021
8022        if (!size)              /* abi compat */
8023                size = PERF_ATTR_SIZE_VER0;
8024
8025        if (size < PERF_ATTR_SIZE_VER0)
8026                goto err_size;
8027
8028        /*
8029         * If we're handed a bigger struct than we know of,
8030         * ensure all the unknown bits are 0 - i.e. new
8031         * user-space does not rely on any kernel feature
8032         * extensions we dont know about yet.
8033         */
8034        if (size > sizeof(*attr)) {
8035                unsigned char __user *addr;
8036                unsigned char __user *end;
8037                unsigned char val;
8038
8039                addr = (void __user *)uattr + sizeof(*attr);
8040                end  = (void __user *)uattr + size;
8041
8042                for (; addr < end; addr++) {
8043                        ret = get_user(val, addr);
8044                        if (ret)
8045                                return ret;
8046                        if (val)
8047                                goto err_size;
8048                }
8049                size = sizeof(*attr);
8050        }
8051
8052        ret = copy_from_user(attr, uattr, size);
8053        if (ret)
8054                return -EFAULT;
8055
8056        if (attr->__reserved_1)
8057                return -EINVAL;
8058
8059        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
8060                return -EINVAL;
8061
8062        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
8063                return -EINVAL;
8064
8065        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
8066                u64 mask = attr->branch_sample_type;
8067
8068                /* only using defined bits */
8069                if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
8070                        return -EINVAL;
8071
8072                /* at least one branch bit must be set */
8073                if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
8074                        return -EINVAL;
8075
8076                /* propagate priv level, when not set for branch */
8077                if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
8078
8079                        /* exclude_kernel checked on syscall entry */
8080                        if (!attr->exclude_kernel)
8081                                mask |= PERF_SAMPLE_BRANCH_KERNEL;
8082
8083                        if (!attr->exclude_user)
8084                                mask |= PERF_SAMPLE_BRANCH_USER;
8085
8086                        if (!attr->exclude_hv)
8087                                mask |= PERF_SAMPLE_BRANCH_HV;
8088                        /*
8089                         * adjust user setting (for HW filter setup)
8090                         */
8091                        attr->branch_sample_type = mask;
8092                }
8093                /* privileged levels capture (kernel, hv): check permissions */
8094                if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
8095                    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
8096                        return -EACCES;
8097        }
8098
8099        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
8100                ret = perf_reg_validate(attr->sample_regs_user);
8101                if (ret)
8102                        return ret;
8103        }
8104
8105        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
8106                if (!arch_perf_have_user_stack_dump())
8107                        return -ENOSYS;
8108
8109                /*
8110                 * We have __u32 type for the size, but so far
8111                 * we can only use __u16 as maximum due to the
8112                 * __u16 sample size limit.
8113                 */
8114                if (attr->sample_stack_user >= USHRT_MAX)
8115                        ret = -EINVAL;
8116                else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
8117                        ret = -EINVAL;
8118        }
8119
8120        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
8121                ret = perf_reg_validate(attr->sample_regs_intr);
8122out:
8123        return ret;
8124
8125err_size:
8126        put_user(sizeof(*attr), &uattr->size);
8127        ret = -E2BIG;
8128        goto out;
8129}
8130
8131static int
8132perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
8133{
8134        struct ring_buffer *rb = NULL;
8135        int ret = -EINVAL;
8136
8137        if (!output_event)
8138                goto set;
8139
8140        /* don't allow circular references */
8141        if (event == output_event)
8142                goto out;
8143
8144        /*
8145         * Don't allow cross-cpu buffers
8146         */
8147        if (output_event->cpu != event->cpu)
8148                goto out;
8149
8150        /*
8151         * If its not a per-cpu rb, it must be the same task.
8152         */
8153        if (output_event->cpu == -1 && output_event->ctx != event->ctx)
8154                goto out;
8155
8156        /*
8157         * Mixing clocks in the same buffer is trouble you don't need.
8158         */
8159        if (output_event->clock != event->clock)
8160                goto out;
8161
8162        /*
8163         * If both events generate aux data, they must be on the same PMU
8164         */
8165        if (has_aux(event) && has_aux(output_event) &&
8166            event->pmu != output_event->pmu)
8167                goto out;
8168
8169set:
8170        mutex_lock(&event->mmap_mutex);
8171        /* Can't redirect output if we've got an active mmap() */
8172        if (atomic_read(&event->mmap_count))
8173                goto unlock;
8174
8175        if (output_event) {
8176                /* get the rb we want to redirect to */
8177                rb = ring_buffer_get(output_event);
8178                if (!rb)
8179                        goto unlock;
8180        }
8181
8182        ring_buffer_attach(event, rb);
8183
8184        ret = 0;
8185unlock:
8186        mutex_unlock(&event->mmap_mutex);
8187
8188out:
8189        return ret;
8190}
8191
8192static void mutex_lock_double(struct mutex *a, struct mutex *b)
8193{
8194        if (b < a)
8195                swap(a, b);
8196
8197        mutex_lock(a);
8198        mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
8199}
8200
8201static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
8202{
8203        bool nmi_safe = false;
8204
8205        switch (clk_id) {
8206        case CLOCK_MONOTONIC:
8207                event->clock = &ktime_get_mono_fast_ns;
8208                nmi_safe = true;
8209                break;
8210
8211        case CLOCK_MONOTONIC_RAW:
8212                event->clock = &ktime_get_raw_fast_ns;
8213                nmi_safe = true;
8214                break;
8215
8216        case CLOCK_REALTIME:
8217                event->clock = &ktime_get_real_ns;
8218                break;
8219
8220        case CLOCK_BOOTTIME:
8221                event->clock = &ktime_get_boot_ns;
8222                break;
8223
8224        case CLOCK_TAI:
8225                event->clock = &ktime_get_tai_ns;
8226                break;
8227
8228        default:
8229                return -EINVAL;
8230        }
8231
8232        if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
8233                return -EINVAL;
8234
8235        return 0;
8236}
8237
8238/**
8239 * sys_perf_event_open - open a performance event, associate it to a task/cpu
8240 *
8241 * @attr_uptr:  event_id type attributes for monitoring/sampling
8242 * @pid:                target pid
8243 * @cpu:                target cpu
8244 * @group_fd:           group leader event fd
8245 */
8246SYSCALL_DEFINE5(perf_event_open,
8247                struct perf_event_attr __user *, attr_uptr,
8248                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
8249{
8250        struct perf_event *group_leader = NULL, *output_event = NULL;
8251        struct perf_event *event, *sibling;
8252        struct perf_event_attr attr;
8253        struct perf_event_context *ctx, *uninitialized_var(gctx);
8254        struct file *event_file = NULL;
8255        struct fd group = {NULL, 0};
8256        struct task_struct *task = NULL;
8257        struct pmu *pmu;
8258        int event_fd;
8259        int move_group = 0;
8260        int err;
8261        int f_flags = O_RDWR;
8262        int cgroup_fd = -1;
8263
8264        /* for future expandability... */
8265        if (flags & ~PERF_FLAG_ALL)
8266                return -EINVAL;
8267
8268        err = perf_copy_attr(attr_uptr, &attr);
8269        if (err)
8270                return err;
8271
8272        if (!attr.exclude_kernel) {
8273                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
8274                        return -EACCES;
8275        }
8276
8277        if (attr.freq) {
8278                if (attr.sample_freq > sysctl_perf_event_sample_rate)
8279                        return -EINVAL;
8280        } else {
8281                if (attr.sample_period & (1ULL << 63))
8282                        return -EINVAL;
8283        }
8284
8285        /*
8286         * In cgroup mode, the pid argument is used to pass the fd
8287         * opened to the cgroup directory in cgroupfs. The cpu argument
8288         * designates the cpu on which to monitor threads from that
8289         * cgroup.
8290         */
8291        if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
8292                return -EINVAL;
8293
8294        if (flags & PERF_FLAG_FD_CLOEXEC)
8295                f_flags |= O_CLOEXEC;
8296
8297        event_fd = get_unused_fd_flags(f_flags);
8298        if (event_fd < 0)
8299                return event_fd;
8300
8301        if (group_fd != -1) {
8302                err = perf_fget_light(group_fd, &group);
8303                if (err)
8304                        goto err_fd;
8305                group_leader = group.file->private_data;
8306                if (flags & PERF_FLAG_FD_OUTPUT)
8307                        output_event = group_leader;
8308                if (flags & PERF_FLAG_FD_NO_GROUP)
8309                        group_leader = NULL;
8310        }
8311
8312        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
8313                task = find_lively_task_by_vpid(pid);
8314                if (IS_ERR(task)) {
8315                        err = PTR_ERR(task);
8316                        goto err_group_fd;
8317                }
8318        }
8319
8320        if (task && group_leader &&
8321            group_leader->attr.inherit != attr.inherit) {
8322                err = -EINVAL;
8323                goto err_task;
8324        }
8325
8326        get_online_cpus();
8327
8328        if (flags & PERF_FLAG_PID_CGROUP)
8329                cgroup_fd = pid;
8330
8331        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
8332                                 NULL, NULL, cgroup_fd);
8333        if (IS_ERR(event)) {
8334                err = PTR_ERR(event);
8335                goto err_cpus;
8336        }
8337
8338        if (is_sampling_event(event)) {
8339                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
8340                        err = -ENOTSUPP;
8341                        goto err_alloc;
8342                }
8343        }
8344
8345        account_event(event);
8346
8347        /*
8348         * Special case software events and allow them to be part of
8349         * any hardware group.
8350         */
8351        pmu = event->pmu;
8352
8353        if (attr.use_clockid) {
8354                err = perf_event_set_clock(event, attr.clockid);
8355                if (err)
8356                        goto err_alloc;
8357        }
8358
8359        if (group_leader &&
8360            (is_software_event(event) != is_software_event(group_leader))) {
8361                if (is_software_event(event)) {
8362                        /*
8363                         * If event and group_leader are not both a software
8364                         * event, and event is, then group leader is not.
8365                         *
8366                         * Allow the addition of software events to !software
8367                         * groups, this is safe because software events never
8368                         * fail to schedule.
8369                         */
8370                        pmu = group_leader->pmu;
8371                } else if (is_software_event(group_leader) &&
8372                           (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
8373                        /*
8374                         * In case the group is a pure software group, and we
8375                         * try to add a hardware event, move the whole group to
8376                         * the hardware context.
8377                         */
8378                        move_group = 1;
8379                }
8380        }
8381
8382        /*
8383         * Get the target context (task or percpu):
8384         */
8385        ctx = find_get_context(pmu, task, event);
8386        if (IS_ERR(ctx)) {
8387                err = PTR_ERR(ctx);
8388                goto err_alloc;
8389        }
8390
8391        if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
8392                err = -EBUSY;
8393                goto err_context;
8394        }
8395
8396        if (task) {
8397                put_task_struct(task);
8398                task = NULL;
8399        }
8400
8401        /*
8402         * Look up the group leader (we will attach this event to it):
8403         */
8404        if (group_leader) {
8405                err = -EINVAL;
8406
8407                /*
8408                 * Do not allow a recursive hierarchy (this new sibling
8409                 * becoming part of another group-sibling):
8410                 */
8411                if (group_leader->group_leader != group_leader)
8412                        goto err_context;
8413
8414                /* All events in a group should have the same clock */
8415                if (group_leader->clock != event->clock)
8416                        goto err_context;
8417
8418                /*
8419                 * Do not allow to attach to a group in a different
8420                 * task or CPU context:
8421                 */
8422                if (move_group) {
8423                        /*
8424                         * Make sure we're both on the same task, or both
8425                         * per-cpu events.
8426                         */
8427                        if (group_leader->ctx->task != ctx->task)
8428                                goto err_context;
8429
8430                        /*
8431                         * Make sure we're both events for the same CPU;
8432                         * grouping events for different CPUs is broken; since
8433                         * you can never concurrently schedule them anyhow.
8434                         */
8435                        if (group_leader->cpu != event->cpu)
8436                                goto err_context;
8437                } else {
8438                        if (group_leader->ctx != ctx)
8439                                goto err_context;
8440                }
8441
8442                /*
8443                 * Only a group leader can be exclusive or pinned
8444                 */
8445                if (attr.exclusive || attr.pinned)
8446                        goto err_context;
8447        }
8448
8449        if (output_event) {
8450                err = perf_event_set_output(event, output_event);
8451                if (err)
8452                        goto err_context;
8453        }
8454
8455        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
8456                                        f_flags);
8457        if (IS_ERR(event_file)) {
8458                err = PTR_ERR(event_file);
8459                goto err_context;
8460        }
8461
8462        if (move_group) {
8463                gctx = group_leader->ctx;
8464                mutex_lock_double(&gctx->mutex, &ctx->mutex);
8465        } else {
8466                mutex_lock(&ctx->mutex);
8467        }
8468
8469        if (!perf_event_validate_size(event)) {
8470                err = -E2BIG;
8471                goto err_locked;
8472        }
8473
8474        /*
8475         * Must be under the same ctx::mutex as perf_install_in_context(),
8476         * because we need to serialize with concurrent event creation.
8477         */
8478        if (!exclusive_event_installable(event, ctx)) {
8479                /* exclusive and group stuff are assumed mutually exclusive */
8480                WARN_ON_ONCE(move_group);
8481
8482                err = -EBUSY;
8483                goto err_locked;
8484        }
8485
8486        WARN_ON_ONCE(ctx->parent_ctx);
8487
8488        if (move_group) {
8489                /*
8490                 * See perf_event_ctx_lock() for comments on the details
8491                 * of swizzling perf_event::ctx.
8492                 */
8493                perf_remove_from_context(group_leader, false);
8494
8495                list_for_each_entry(sibling, &group_leader->sibling_list,
8496                                    group_entry) {
8497                        perf_remove_from_context(sibling, false);
8498                        put_ctx(gctx);
8499                }
8500
8501                /*
8502                 * Wait for everybody to stop referencing the events through
8503                 * the old lists, before installing it on new lists.
8504                 */
8505                synchronize_rcu();
8506
8507                /*
8508                 * Install the group siblings before the group leader.
8509                 *
8510                 * Because a group leader will try and install the entire group
8511                 * (through the sibling list, which is still in-tact), we can
8512                 * end up with siblings installed in the wrong context.
8513                 *
8514                 * By installing siblings first we NO-OP because they're not
8515                 * reachable through the group lists.
8516                 */
8517                list_for_each_entry(sibling, &group_leader->sibling_list,
8518                                    group_entry) {
8519                        perf_event__state_init(sibling);
8520                        perf_install_in_context(ctx, sibling, sibling->cpu);
8521                        get_ctx(ctx);
8522                }
8523
8524                /*
8525                 * Removing from the context ends up with disabled
8526                 * event. What we want here is event in the initial
8527                 * startup state, ready to be add into new context.
8528                 */
8529                perf_event__state_init(group_leader);
8530                perf_install_in_context(ctx, group_leader, group_leader->cpu);
8531                get_ctx(ctx);
8532
8533                /*
8534                 * Now that all events are installed in @ctx, nothing
8535                 * references @gctx anymore, so drop the last reference we have
8536                 * on it.
8537                 */
8538                put_ctx(gctx);
8539        }
8540
8541        /*
8542         * Precalculate sample_data sizes; do while holding ctx::mutex such
8543         * that we're serialized against further additions and before
8544         * perf_install_in_context() which is the point the event is active and
8545         * can use these values.
8546         */
8547        perf_event__header_size(event);
8548        perf_event__id_header_size(event);
8549
8550        perf_install_in_context(ctx, event, event->cpu);
8551        perf_unpin_context(ctx);
8552
8553        if (move_group)
8554                mutex_unlock(&gctx->mutex);
8555        mutex_unlock(&ctx->mutex);
8556
8557        put_online_cpus();
8558
8559        event->owner = current;
8560
8561        mutex_lock(&current->perf_event_mutex);
8562        list_add_tail(&event->owner_entry, &current->perf_event_list);
8563        mutex_unlock(&current->perf_event_mutex);
8564
8565        /*
8566         * Drop the reference on the group_event after placing the
8567         * new event on the sibling_list. This ensures destruction
8568         * of the group leader will find the pointer to itself in
8569         * perf_group_detach().
8570         */
8571        fdput(group);
8572        fd_install(event_fd, event_file);
8573        return event_fd;
8574
8575err_locked:
8576        if (move_group)
8577                mutex_unlock(&gctx->mutex);
8578        mutex_unlock(&ctx->mutex);
8579/* err_file: */
8580        fput(event_file);
8581err_context:
8582        perf_unpin_context(ctx);
8583        put_ctx(ctx);
8584err_alloc:
8585        free_event(event);
8586err_cpus:
8587        put_online_cpus();
8588err_task:
8589        if (task)
8590                put_task_struct(task);
8591err_group_fd:
8592        fdput(group);
8593err_fd:
8594        put_unused_fd(event_fd);
8595        return err;
8596}
8597
8598/**
8599 * perf_event_create_kernel_counter
8600 *
8601 * @attr: attributes of the counter to create
8602 * @cpu: cpu in which the counter is bound
8603 * @task: task to profile (NULL for percpu)
8604 */
8605struct perf_event *
8606perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
8607                                 struct task_struct *task,
8608                                 perf_overflow_handler_t overflow_handler,
8609                                 void *context)
8610{
8611        struct perf_event_context *ctx;
8612        struct perf_event *event;
8613        int err;
8614
8615        /*
8616         * Get the target context (task or percpu):
8617         */
8618
8619        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
8620                                 overflow_handler, context, -1);
8621        if (IS_ERR(event)) {
8622                err = PTR_ERR(event);
8623                goto err;
8624        }
8625
8626        /* Mark owner so we could distinguish it from user events. */
8627        event->owner = EVENT_OWNER_KERNEL;
8628
8629        account_event(event);
8630
8631        ctx = find_get_context(event->pmu, task, event);
8632        if (IS_ERR(ctx)) {
8633                err = PTR_ERR(ctx);
8634                goto err_free;
8635        }
8636
8637        WARN_ON_ONCE(ctx->parent_ctx);
8638        mutex_lock(&ctx->mutex);
8639        if (!exclusive_event_installable(event, ctx)) {
8640                mutex_unlock(&ctx->mutex);
8641                perf_unpin_context(ctx);
8642                put_ctx(ctx);
8643                err = -EBUSY;
8644                goto err_free;
8645        }
8646
8647        perf_install_in_context(ctx, event, cpu);
8648        perf_unpin_context(ctx);
8649        mutex_unlock(&ctx->mutex);
8650
8651        return event;
8652
8653err_free:
8654        free_event(event);
8655err:
8656        return ERR_PTR(err);
8657}
8658EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
8659
8660void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8661{
8662        struct perf_event_context *src_ctx;
8663        struct perf_event_context *dst_ctx;
8664        struct perf_event *event, *tmp;
8665        LIST_HEAD(events);
8666
8667        src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
8668        dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
8669
8670        /*
8671         * See perf_event_ctx_lock() for comments on the details
8672         * of swizzling perf_event::ctx.
8673         */
8674        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
8675        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8676                                 event_entry) {
8677                perf_remove_from_context(event, false);
8678                unaccount_event_cpu(event, src_cpu);
8679                put_ctx(src_ctx);
8680                list_add(&event->migrate_entry, &events);
8681        }
8682
8683        /*
8684         * Wait for the events to quiesce before re-instating them.
8685         */
8686        synchronize_rcu();
8687
8688        /*
8689         * Re-instate events in 2 passes.
8690         *
8691         * Skip over group leaders and only install siblings on this first
8692         * pass, siblings will not get enabled without a leader, however a
8693         * leader will enable its siblings, even if those are still on the old
8694         * context.
8695         */
8696        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8697                if (event->group_leader == event)
8698                        continue;
8699
8700                list_del(&event->migrate_entry);
8701                if (event->state >= PERF_EVENT_STATE_OFF)
8702                        event->state = PERF_EVENT_STATE_INACTIVE;
8703                account_event_cpu(event, dst_cpu);
8704                perf_install_in_context(dst_ctx, event, dst_cpu);
8705                get_ctx(dst_ctx);
8706        }
8707
8708        /*
8709         * Once all the siblings are setup properly, install the group leaders
8710         * to make it go.
8711         */
8712        list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8713                list_del(&event->migrate_entry);
8714                if (event->state >= PERF_EVENT_STATE_OFF)
8715                        event->state = PERF_EVENT_STATE_INACTIVE;
8716                account_event_cpu(event, dst_cpu);
8717                perf_install_in_context(dst_ctx, event, dst_cpu);
8718                get_ctx(dst_ctx);
8719        }
8720        mutex_unlock(&dst_ctx->mutex);
8721        mutex_unlock(&src_ctx->mutex);
8722}
8723EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
8724
8725static void sync_child_event(struct perf_event *child_event,
8726                               struct task_struct *child)
8727{
8728        struct perf_event *parent_event = child_event->parent;
8729        u64 child_val;
8730
8731        if (child_event->attr.inherit_stat)
8732                perf_event_read_event(child_event, child);
8733
8734        child_val = perf_event_count(child_event);
8735
8736        /*
8737         * Add back the child's count to the parent's count:
8738         */
8739        atomic64_add(child_val, &parent_event->child_count);
8740        atomic64_add(child_event->total_time_enabled,
8741                     &parent_event->child_total_time_enabled);
8742        atomic64_add(child_event->total_time_running,
8743                     &parent_event->child_total_time_running);
8744
8745        /*
8746         * Remove this event from the parent's list
8747         */
8748        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8749        mutex_lock(&parent_event->child_mutex);
8750        list_del_init(&child_event->child_list);
8751        mutex_unlock(&parent_event->child_mutex);
8752
8753        /*
8754         * Make sure user/parent get notified, that we just
8755         * lost one event.
8756         */
8757        perf_event_wakeup(parent_event);
8758
8759        /*
8760         * Release the parent event, if this was the last
8761         * reference to it.
8762         */
8763        put_event(parent_event);
8764}
8765
8766static void
8767__perf_event_exit_task(struct perf_event *child_event,
8768                         struct perf_event_context *child_ctx,
8769                         struct task_struct *child)
8770{
8771        /*
8772         * Do not destroy the 'original' grouping; because of the context
8773         * switch optimization the original events could've ended up in a
8774         * random child task.
8775         *
8776         * If we were to destroy the original group, all group related
8777         * operations would cease to function properly after this random
8778         * child dies.
8779         *
8780         * Do destroy all inherited groups, we don't care about those
8781         * and being thorough is better.
8782         */
8783        perf_remove_from_context(child_event, !!child_event->parent);
8784
8785        /*
8786         * It can happen that the parent exits first, and has events
8787         * that are still around due to the child reference. These
8788         * events need to be zapped.
8789         */
8790        if (child_event->parent) {
8791                sync_child_event(child_event, child);
8792                free_event(child_event);
8793        } else {
8794                child_event->state = PERF_EVENT_STATE_EXIT;
8795                perf_event_wakeup(child_event);
8796        }
8797}
8798
8799static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
8800{
8801        struct perf_event *child_event, *next;
8802        struct perf_event_context *child_ctx, *clone_ctx = NULL;
8803        unsigned long flags;
8804
8805        if (likely(!child->perf_event_ctxp[ctxn]))
8806                return;
8807
8808        local_irq_save(flags);
8809        /*
8810         * We can't reschedule here because interrupts are disabled,
8811         * and either child is current or it is a task that can't be
8812         * scheduled, so we are now safe from rescheduling changing
8813         * our context.
8814         */
8815        child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
8816
8817        /*
8818         * Take the context lock here so that if find_get_context is
8819         * reading child->perf_event_ctxp, we wait until it has
8820         * incremented the context's refcount before we do put_ctx below.
8821         */
8822        raw_spin_lock(&child_ctx->lock);
8823        task_ctx_sched_out(child_ctx);
8824        child->perf_event_ctxp[ctxn] = NULL;
8825
8826        /*
8827         * If this context is a clone; unclone it so it can't get
8828         * swapped to another process while we're removing all
8829         * the events from it.
8830         */
8831        clone_ctx = unclone_ctx(child_ctx);
8832        update_context_time(child_ctx);
8833        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8834
8835        if (clone_ctx)
8836                put_ctx(clone_ctx);
8837
8838        /*
8839         * Report the task dead after unscheduling the events so that we
8840         * won't get any samples after PERF_RECORD_EXIT. We can however still
8841         * get a few PERF_RECORD_READ events.
8842         */
8843        perf_event_task(child, child_ctx, 0);
8844
8845        /*
8846         * We can recurse on the same lock type through:
8847         *
8848         *   __perf_event_exit_task()
8849         *     sync_child_event()
8850         *       put_event()
8851         *         mutex_lock(&ctx->mutex)
8852         *
8853         * But since its the parent context it won't be the same instance.
8854         */
8855        mutex_lock(&child_ctx->mutex);
8856
8857        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8858                __perf_event_exit_task(child_event, child_ctx, child);
8859
8860        mutex_unlock(&child_ctx->mutex);
8861
8862        put_ctx(child_ctx);
8863}
8864
8865/*
8866 * When a child task exits, feed back event values to parent events.
8867 */
8868void perf_event_exit_task(struct task_struct *child)
8869{
8870        struct perf_event *event, *tmp;
8871        int ctxn;
8872
8873        mutex_lock(&child->perf_event_mutex);
8874        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8875                                 owner_entry) {
8876                list_del_init(&event->owner_entry);
8877
8878                /*
8879                 * Ensure the list deletion is visible before we clear
8880                 * the owner, closes a race against perf_release() where
8881                 * we need to serialize on the owner->perf_event_mutex.
8882                 */
8883                smp_wmb();
8884                event->owner = NULL;
8885        }
8886        mutex_unlock(&child->perf_event_mutex);
8887
8888        for_each_task_context_nr(ctxn)
8889                perf_event_exit_task_context(child, ctxn);
8890
8891        /*
8892         * The perf_event_exit_task_context calls perf_event_task
8893         * with child's task_ctx, which generates EXIT events for
8894         * child contexts and sets child->perf_event_ctxp[] to NULL.
8895         * At this point we need to send EXIT events to cpu contexts.
8896         */
8897        perf_event_task(child, NULL, 0);
8898}
8899
8900static void perf_free_event(struct perf_event *event,
8901                            struct perf_event_context *ctx)
8902{
8903        struct perf_event *parent = event->parent;
8904
8905        if (WARN_ON_ONCE(!parent))
8906                return;
8907
8908        mutex_lock(&parent->child_mutex);
8909        list_del_init(&event->child_list);
8910        mutex_unlock(&parent->child_mutex);
8911
8912        put_event(parent);
8913
8914        raw_spin_lock_irq(&ctx->lock);
8915        perf_group_detach(event);
8916        list_del_event(event, ctx);
8917        raw_spin_unlock_irq(&ctx->lock);
8918        free_event(event);
8919}
8920
8921/*
8922 * Free an unexposed, unused context as created by inheritance by
8923 * perf_event_init_task below, used by fork() in case of fail.
8924 *
8925 * Not all locks are strictly required, but take them anyway to be nice and
8926 * help out with the lockdep assertions.
8927 */
8928void perf_event_free_task(struct task_struct *task)
8929{
8930        struct perf_event_context *ctx;
8931        struct perf_event *event, *tmp;
8932        int ctxn;
8933
8934        for_each_task_context_nr(ctxn) {
8935                ctx = task->perf_event_ctxp[ctxn];
8936                if (!ctx)
8937                        continue;
8938
8939                mutex_lock(&ctx->mutex);
8940again:
8941                list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8942                                group_entry)
8943                        perf_free_event(event, ctx);
8944
8945                list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8946                                group_entry)
8947                        perf_free_event(event, ctx);
8948
8949                if (!list_empty(&ctx->pinned_groups) ||
8950                                !list_empty(&ctx->flexible_groups))
8951                        goto again;
8952
8953                mutex_unlock(&ctx->mutex);
8954
8955                put_ctx(ctx);
8956        }
8957}
8958
8959void perf_event_delayed_put(struct task_struct *task)
8960{
8961        int ctxn;
8962
8963        for_each_task_context_nr(ctxn)
8964                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8965}
8966
8967struct perf_event *perf_event_get(unsigned int fd)
8968{
8969        int err;
8970        struct fd f;
8971        struct perf_event *event;
8972
8973        err = perf_fget_light(fd, &f);
8974        if (err)
8975                return ERR_PTR(err);
8976
8977        event = f.file->private_data;
8978        atomic_long_inc(&event->refcount);
8979        fdput(f);
8980
8981        return event;
8982}
8983
8984const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
8985{
8986        if (!event)
8987                return ERR_PTR(-EINVAL);
8988
8989        return &event->attr;
8990}
8991
8992/*
8993 * inherit a event from parent task to child task:
8994 */
8995static struct perf_event *
8996inherit_event(struct perf_event *parent_event,
8997              struct task_struct *parent,
8998              struct perf_event_context *parent_ctx,
8999              struct task_struct *child,
9000              struct perf_event *group_leader,
9001              struct perf_event_context *child_ctx)
9002{
9003        enum perf_event_active_state parent_state = parent_event->state;
9004        struct perf_event *child_event;
9005        unsigned long flags;
9006
9007        /*
9008         * Instead of creating recursive hierarchies of events,
9009         * we link inherited events back to the original parent,
9010         * which has a filp for sure, which we use as the reference
9011         * count:
9012         */
9013        if (parent_event->parent)
9014                parent_event = parent_event->parent;
9015
9016        child_event = perf_event_alloc(&parent_event->attr,
9017                                           parent_event->cpu,
9018                                           child,
9019                                           group_leader, parent_event,
9020                                           NULL, NULL, -1);
9021        if (IS_ERR(child_event))
9022                return child_event;
9023
9024        if (is_orphaned_event(parent_event) ||
9025            !atomic_long_inc_not_zero(&parent_event->refcount)) {
9026                free_event(child_event);
9027                return NULL;
9028        }
9029
9030        get_ctx(child_ctx);
9031
9032        /*
9033         * Make the child state follow the state of the parent event,
9034         * not its attr.disabled bit.  We hold the parent's mutex,
9035         * so we won't race with perf_event_{en, dis}able_family.
9036         */
9037        if (parent_state >= PERF_EVENT_STATE_INACTIVE)
9038                child_event->state = PERF_EVENT_STATE_INACTIVE;
9039        else
9040                child_event->state = PERF_EVENT_STATE_OFF;
9041
9042        if (parent_event->attr.freq) {
9043                u64 sample_period = parent_event->hw.sample_period;
9044                struct hw_perf_event *hwc = &child_event->hw;
9045
9046                hwc->sample_period = sample_period;
9047                hwc->last_period   = sample_period;
9048
9049                local64_set(&hwc->period_left, sample_period);
9050        }
9051
9052        child_event->ctx = child_ctx;
9053        child_event->overflow_handler = parent_event->overflow_handler;
9054        child_event->overflow_handler_context
9055                = parent_event->overflow_handler_context;
9056
9057        /*
9058         * Precalculate sample_data sizes
9059         */
9060        perf_event__header_size(child_event);
9061        perf_event__id_header_size(child_event);
9062
9063        /*
9064         * Link it up in the child's context:
9065         */
9066        raw_spin_lock_irqsave(&child_ctx->lock, flags);
9067        add_event_to_ctx(child_event, child_ctx);
9068        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
9069
9070        /*
9071         * Link this into the parent event's child list
9072         */
9073        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
9074        mutex_lock(&parent_event->child_mutex);
9075        list_add_tail(&child_event->child_list, &parent_event->child_list);
9076        mutex_unlock(&parent_event->child_mutex);
9077
9078        return child_event;
9079}
9080
9081static int inherit_group(struct perf_event *parent_event,
9082              struct task_struct *parent,
9083              struct perf_event_context *parent_ctx,
9084              struct task_struct *child,
9085              struct perf_event_context *child_ctx)
9086{
9087        struct perf_event *leader;
9088        struct perf_event *sub;
9089        struct perf_event *child_ctr;
9090
9091        leader = inherit_event(parent_event, parent, parent_ctx,
9092                                 child, NULL, child_ctx);
9093        if (IS_ERR(leader))
9094                return PTR_ERR(leader);
9095        list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
9096                child_ctr = inherit_event(sub, parent, parent_ctx,
9097                                            child, leader, child_ctx);
9098                if (IS_ERR(child_ctr))
9099                        return PTR_ERR(child_ctr);
9100        }
9101        return 0;
9102}
9103
9104static int
9105inherit_task_group(struct perf_event *event, struct task_struct *parent,
9106                   struct perf_event_context *parent_ctx,
9107                   struct task_struct *child, int ctxn,
9108                   int *inherited_all)
9109{
9110        int ret;
9111        struct perf_event_context *child_ctx;
9112
9113        if (!event->attr.inherit) {
9114                *inherited_all = 0;
9115                return 0;
9116        }
9117
9118        child_ctx = child->perf_event_ctxp[ctxn];
9119        if (!child_ctx) {
9120                /*
9121                 * This is executed from the parent task context, so
9122                 * inherit events that have been marked for cloning.
9123                 * First allocate and initialize a context for the
9124                 * child.
9125                 */
9126
9127                child_ctx = alloc_perf_context(parent_ctx->pmu, child);
9128                if (!child_ctx)
9129                        return -ENOMEM;
9130
9131                child->perf_event_ctxp[ctxn] = child_ctx;
9132        }
9133
9134        ret = inherit_group(event, parent, parent_ctx,
9135                            child, child_ctx);
9136
9137        if (ret)
9138                *inherited_all = 0;
9139
9140        return ret;
9141}
9142
9143/*
9144 * Initialize the perf_event context in task_struct
9145 */
9146static int perf_event_init_context(struct task_struct *child, int ctxn)
9147{
9148        struct perf_event_context *child_ctx, *parent_ctx;
9149        struct perf_event_context *cloned_ctx;
9150        struct perf_event *event;
9151        struct task_struct *parent = current;
9152        int inherited_all = 1;
9153        unsigned long flags;
9154        int ret = 0;
9155
9156        if (likely(!parent->perf_event_ctxp[ctxn]))
9157                return 0;
9158
9159        /*
9160         * If the parent's context is a clone, pin it so it won't get
9161         * swapped under us.
9162         */
9163        parent_ctx = perf_pin_task_context(parent, ctxn);
9164        if (!parent_ctx)
9165                return 0;
9166
9167        /*
9168         * No need to check if parent_ctx != NULL here; since we saw
9169         * it non-NULL earlier, the only reason for it to become NULL
9170         * is if we exit, and since we're currently in the middle of
9171         * a fork we can't be exiting at the same time.
9172         */
9173
9174        /*
9175         * Lock the parent list. No need to lock the child - not PID
9176         * hashed yet and not running, so nobody can access it.
9177         */
9178        mutex_lock(&parent_ctx->mutex);
9179
9180        /*
9181         * We dont have to disable NMIs - we are only looking at
9182         * the list, not manipulating it:
9183         */
9184        list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
9185                ret = inherit_task_group(event, parent, parent_ctx,
9186                                         child, ctxn, &inherited_all);
9187                if (ret)
9188                        break;
9189        }
9190
9191        /*
9192         * We can't hold ctx->lock when iterating the ->flexible_group list due
9193         * to allocations, but we need to prevent rotation because
9194         * rotate_ctx() will change the list from interrupt context.
9195         */
9196        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
9197        parent_ctx->rotate_disable = 1;
9198        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
9199
9200        list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
9201                ret = inherit_task_group(event, parent, parent_ctx,
9202                                         child, ctxn, &inherited_all);
9203                if (ret)
9204                        break;
9205        }
9206
9207        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
9208        parent_ctx->rotate_disable = 0;
9209
9210        child_ctx = child->perf_event_ctxp[ctxn];
9211
9212        if (child_ctx && inherited_all) {
9213                /*
9214                 * Mark the child context as a clone of the parent
9215                 * context, or of whatever the parent is a clone of.
9216                 *
9217                 * Note that if the parent is a clone, the holding of
9218                 * parent_ctx->lock avoids it from being uncloned.
9219                 */
9220                cloned_ctx = parent_ctx->parent_ctx;
9221                if (cloned_ctx) {
9222                        child_ctx->parent_ctx = cloned_ctx;
9223                        child_ctx->parent_gen = parent_ctx->parent_gen;
9224                } else {
9225                        child_ctx->parent_ctx = parent_ctx;
9226                        child_ctx->parent_gen = parent_ctx->generation;
9227                }
9228                get_ctx(child_ctx->parent_ctx);
9229        }
9230
9231        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
9232        mutex_unlock(&parent_ctx->mutex);
9233
9234        perf_unpin_context(parent_ctx);
9235        put_ctx(parent_ctx);
9236
9237        return ret;
9238}
9239
9240/*
9241 * Initialize the perf_event context in task_struct
9242 */
9243int perf_event_init_task(struct task_struct *child)
9244{
9245        int ctxn, ret;
9246
9247        memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
9248        mutex_init(&child->perf_event_mutex);
9249        INIT_LIST_HEAD(&child->perf_event_list);
9250
9251        for_each_task_context_nr(ctxn) {
9252                ret = perf_event_init_context(child, ctxn);
9253                if (ret) {
9254                        perf_event_free_task(child);
9255                        return ret;
9256                }
9257        }
9258
9259        return 0;
9260}
9261
9262static void __init perf_event_init_all_cpus(void)
9263{
9264        struct swevent_htable *swhash;
9265        int cpu;
9266
9267        for_each_possible_cpu(cpu) {
9268                swhash = &per_cpu(swevent_htable, cpu);
9269                mutex_init(&swhash->hlist_mutex);
9270                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
9271        }
9272}
9273
9274static void perf_event_init_cpu(int cpu)
9275{
9276        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9277
9278        mutex_lock(&swhash->hlist_mutex);
9279        if (swhash->hlist_refcount > 0) {
9280                struct swevent_hlist *hlist;
9281
9282                hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
9283                WARN_ON(!hlist);
9284                rcu_assign_pointer(swhash->swevent_hlist, hlist);
9285        }
9286        mutex_unlock(&swhash->hlist_mutex);
9287}
9288
9289#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
9290static void __perf_event_exit_context(void *__info)
9291{
9292        struct remove_event re = { .detach_group = true };
9293        struct perf_event_context *ctx = __info;
9294
9295        rcu_read_lock();
9296        list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
9297                __perf_remove_from_context(&re);
9298        rcu_read_unlock();
9299}
9300
9301static void perf_event_exit_cpu_context(int cpu)
9302{
9303        struct perf_event_context *ctx;
9304        struct pmu *pmu;
9305        int idx;
9306
9307        idx = srcu_read_lock(&pmus_srcu);
9308        list_for_each_entry_rcu(pmu, &pmus, entry) {
9309                ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
9310
9311                mutex_lock(&ctx->mutex);
9312                smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
9313                mutex_unlock(&ctx->mutex);
9314        }
9315        srcu_read_unlock(&pmus_srcu, idx);
9316}
9317
9318static void perf_event_exit_cpu(int cpu)
9319{
9320        perf_event_exit_cpu_context(cpu);
9321}
9322#else
9323static inline void perf_event_exit_cpu(int cpu) { }
9324#endif
9325
9326static int
9327perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
9328{
9329        int cpu;
9330
9331        for_each_online_cpu(cpu)
9332                perf_event_exit_cpu(cpu);
9333
9334        return NOTIFY_OK;
9335}
9336
9337/*
9338 * Run the perf reboot notifier at the very last possible moment so that
9339 * the generic watchdog code runs as long as possible.
9340 */
9341static struct notifier_block perf_reboot_notifier = {
9342        .notifier_call = perf_reboot,
9343        .priority = INT_MIN,
9344};
9345
9346static int
9347perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
9348{
9349        unsigned int cpu = (long)hcpu;
9350
9351        switch (action & ~CPU_TASKS_FROZEN) {
9352
9353        case CPU_UP_PREPARE:
9354        case CPU_DOWN_FAILED:
9355                perf_event_init_cpu(cpu);
9356                break;
9357
9358        case CPU_UP_CANCELED:
9359        case CPU_DOWN_PREPARE:
9360                perf_event_exit_cpu(cpu);
9361                break;
9362        default:
9363                break;
9364        }
9365
9366        return NOTIFY_OK;
9367}
9368
9369void __init perf_event_init(void)
9370{
9371        int ret;
9372
9373        idr_init(&pmu_idr);
9374
9375        perf_event_init_all_cpus();
9376        init_srcu_struct(&pmus_srcu);
9377        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
9378        perf_pmu_register(&perf_cpu_clock, NULL, -1);
9379        perf_pmu_register(&perf_task_clock, NULL, -1);
9380        perf_tp_register();
9381        perf_cpu_notifier(perf_cpu_notify);
9382        register_reboot_notifier(&perf_reboot_notifier);
9383
9384        ret = init_hw_breakpoint();
9385        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
9386
9387        /* do not patch jump label more than once per second */
9388        jump_label_rate_limit(&perf_sched_events, HZ);
9389
9390        /*
9391         * Build time assertion that we keep the data_head at the intended
9392         * location.  IOW, validation we got the __reserved[] size right.
9393         */
9394        BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
9395                     != 1024);
9396}
9397
9398ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
9399                              char *page)
9400{
9401        struct perf_pmu_events_attr *pmu_attr =
9402                container_of(attr, struct perf_pmu_events_attr, attr);
9403
9404        if (pmu_attr->event_str)
9405                return sprintf(page, "%s\n", pmu_attr->event_str);
9406
9407        return 0;
9408}
9409
9410static int __init perf_event_sysfs_init(void)
9411{
9412        struct pmu *pmu;
9413        int ret;
9414
9415        mutex_lock(&pmus_lock);
9416
9417        ret = bus_register(&pmu_bus);
9418        if (ret)
9419                goto unlock;
9420
9421        list_for_each_entry(pmu, &pmus, entry) {
9422                if (!pmu->name || pmu->type < 0)
9423                        continue;
9424
9425                ret = pmu_dev_alloc(pmu);
9426                WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
9427        }
9428        pmu_bus_running = 1;
9429        ret = 0;
9430
9431unlock:
9432        mutex_unlock(&pmus_lock);
9433
9434        return ret;
9435}
9436device_initcall(perf_event_sysfs_init);
9437
9438#ifdef CONFIG_CGROUP_PERF
9439static struct cgroup_subsys_state *
9440perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
9441{
9442        struct perf_cgroup *jc;
9443
9444        jc = kzalloc(sizeof(*jc), GFP_KERNEL);
9445        if (!jc)
9446                return ERR_PTR(-ENOMEM);
9447
9448        jc->info = alloc_percpu(struct perf_cgroup_info);
9449        if (!jc->info) {
9450                kfree(jc);
9451                return ERR_PTR(-ENOMEM);
9452        }
9453
9454        return &jc->css;
9455}
9456
9457static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
9458{
9459        struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
9460
9461        free_percpu(jc->info);
9462        kfree(jc);
9463}
9464
9465static int __perf_cgroup_move(void *info)
9466{
9467        struct task_struct *task = info;
9468        rcu_read_lock();
9469        perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
9470        rcu_read_unlock();
9471        return 0;
9472}
9473
9474static void perf_cgroup_attach(struct cgroup_taskset *tset)
9475{
9476        struct task_struct *task;
9477        struct cgroup_subsys_state *css;
9478
9479        cgroup_taskset_for_each(task, css, tset)
9480                task_function_call(task, __perf_cgroup_move, task);
9481}
9482
9483struct cgroup_subsys perf_event_cgrp_subsys = {
9484        .css_alloc      = perf_cgroup_css_alloc,
9485        .css_free       = perf_cgroup_css_free,
9486        .attach         = perf_cgroup_attach,
9487};
9488#endif /* CONFIG_CGROUP_PERF */
9489