linux/kernel/perf_event.c
   1/*
   2 * Performance events core code:
   3 *
   4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8 *
   9 * For licensing details see kernel-base/COPYING
  10 */
  11
  12#include <linux/fs.h>
  13#include <linux/mm.h>
  14#include <linux/cpu.h>
  15#include <linux/smp.h>
  16#include <linux/file.h>
  17#include <linux/poll.h>
  18#include <linux/sysfs.h>
  19#include <linux/dcache.h>
  20#include <linux/percpu.h>
  21#include <linux/ptrace.h>
  22#include <linux/vmstat.h>
  23#include <linux/vmalloc.h>
  24#include <linux/hardirq.h>
  25#include <linux/rculist.h>
  26#include <linux/uaccess.h>
  27#include <linux/syscalls.h>
  28#include <linux/anon_inodes.h>
  29#include <linux/kernel_stat.h>
  30#include <linux/perf_event.h>
  31
  32#include <asm/irq_regs.h>
  33
  34/*
  35 * Each CPU has a list of per CPU events:
  36 */
  37DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  38
  39int perf_max_events __read_mostly = 1;
  40static int perf_reserved_percpu __read_mostly;
  41static int perf_overcommit __read_mostly = 1;
  42
  43static atomic_t nr_events __read_mostly;
  44static atomic_t nr_mmap_events __read_mostly;
  45static atomic_t nr_comm_events __read_mostly;
  46static atomic_t nr_task_events __read_mostly;
  47
  48/*
  49 * perf event paranoia level:
  50 *  -1 - not paranoid at all
  51 *   0 - disallow raw tracepoint access for unpriv
  52 *   1 - disallow cpu events for unpriv
  53 *   2 - disallow kernel profiling for unpriv
  54 */
  55int sysctl_perf_event_paranoid __read_mostly = 1;
  56
  57static inline bool perf_paranoid_tracepoint_raw(void)
  58{
  59        return sysctl_perf_event_paranoid > -1;
  60}
  61
  62static inline bool perf_paranoid_cpu(void)
  63{
  64        return sysctl_perf_event_paranoid > 0;
  65}
  66
  67static inline bool perf_paranoid_kernel(void)
  68{
  69        return sysctl_perf_event_paranoid > 1;
  70}
  71
  72int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
  73
  74/*
  75 * max perf event sample rate
  76 */
  77int sysctl_perf_event_sample_rate __read_mostly = 100000;
  78
  79static atomic64_t perf_event_id;
  80
  81/*
  82 * Lock for (sysadmin-configurable) event reservations:
  83 */
  84static DEFINE_SPINLOCK(perf_resource_lock);
  85
  86/*
  87 * Architecture provided APIs - weak aliases:
  88 */
  89extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  90{
  91        return NULL;
  92}
  93
  94void __weak hw_perf_disable(void)               { barrier(); }
  95void __weak hw_perf_enable(void)                { barrier(); }
  96
  97void __weak hw_perf_event_setup(int cpu)        { barrier(); }
  98void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
  99
 100int __weak
 101hw_perf_group_sched_in(struct perf_event *group_leader,
 102               struct perf_cpu_context *cpuctx,
 103               struct perf_event_context *ctx, int cpu)
 104{
 105        return 0;
 106}
 107
 108void __weak perf_event_print_debug(void)        { }
 109
 110static DEFINE_PER_CPU(int, perf_disable_count);
 111
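     /*
      * perf_disable()/perf_enable() calls nest: each disable bumps the
      * per-cpu perf_disable_count and the PMU is only re-enabled by
      * hw_perf_enable() once the count drops back to zero.
      */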
 112void __perf_disable(void)
 113{
 114        __get_cpu_var(perf_disable_count)++;
 115}
 116
 117bool __perf_enable(void)
 118{
 119        return !--__get_cpu_var(perf_disable_count);
 120}
 121
 122void perf_disable(void)
 123{
 124        __perf_disable();
 125        hw_perf_disable();
 126}
 127
 128void perf_enable(void)
 129{
 130        if (__perf_enable())
 131                hw_perf_enable();
 132}
 133
 134static void get_ctx(struct perf_event_context *ctx)
 135{
 136        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 137}
 138
 139static void free_ctx(struct rcu_head *head)
 140{
 141        struct perf_event_context *ctx;
 142
 143        ctx = container_of(head, struct perf_event_context, rcu_head);
 144        kfree(ctx);
 145}
 146
 147static void put_ctx(struct perf_event_context *ctx)
 148{
 149        if (atomic_dec_and_test(&ctx->refcount)) {
 150                if (ctx->parent_ctx)
 151                        put_ctx(ctx->parent_ctx);
 152                if (ctx->task)
 153                        put_task_struct(ctx->task);
 154                call_rcu(&ctx->rcu_head, free_ctx);
 155        }
 156}
 157
 158static void unclone_ctx(struct perf_event_context *ctx)
 159{
 160        if (ctx->parent_ctx) {
 161                put_ctx(ctx->parent_ctx);
 162                ctx->parent_ctx = NULL;
 163        }
 164}
 165
 166/*
 167 * If we inherit events we want to return the parent event id
 168 * to userspace.
 169 */
 170static u64 primary_event_id(struct perf_event *event)
 171{
 172        u64 id = event->id;
 173
 174        if (event->parent)
 175                id = event->parent->id;
 176
 177        return id;
 178}
 179
 180/*
 181 * Get the perf_event_context for a task and lock it.
  182 * This has to cope with the fact that until it is locked,
 183 * the context could get moved to another task.
 184 */
 185static struct perf_event_context *
 186perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 187{
 188        struct perf_event_context *ctx;
 189
 190        rcu_read_lock();
 191 retry:
 192        ctx = rcu_dereference(task->perf_event_ctxp);
 193        if (ctx) {
 194                /*
 195                 * If this context is a clone of another, it might
 196                 * get swapped for another underneath us by
 197                 * perf_event_task_sched_out, though the
 198                 * rcu_read_lock() protects us from any context
 199                 * getting freed.  Lock the context and check if it
 200                 * got swapped before we could get the lock, and retry
 201                 * if so.  If we locked the right context, then it
 202                 * can't get swapped on us any more.
 203                 */
 204                spin_lock_irqsave(&ctx->lock, *flags);
 205                if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 206                        spin_unlock_irqrestore(&ctx->lock, *flags);
 207                        goto retry;
 208                }
 209
 210                if (!atomic_inc_not_zero(&ctx->refcount)) {
 211                        spin_unlock_irqrestore(&ctx->lock, *flags);
 212                        ctx = NULL;
 213                }
 214        }
 215        rcu_read_unlock();
 216        return ctx;
 217}
 218
 219/*
 220 * Get the context for a task and increment its pin_count so it
 221 * can't get swapped to another task.  This also increments its
 222 * reference count so that the context can't get freed.
 223 */
 224static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 225{
 226        struct perf_event_context *ctx;
 227        unsigned long flags;
 228
 229        ctx = perf_lock_task_context(task, &flags);
 230        if (ctx) {
 231                ++ctx->pin_count;
 232                spin_unlock_irqrestore(&ctx->lock, flags);
 233        }
 234        return ctx;
 235}
 236
 237static void perf_unpin_context(struct perf_event_context *ctx)
 238{
 239        unsigned long flags;
 240
 241        spin_lock_irqsave(&ctx->lock, flags);
 242        --ctx->pin_count;
 243        spin_unlock_irqrestore(&ctx->lock, flags);
 244        put_ctx(ctx);
 245}
 246
 247/*
  248 * Add an event to the lists for its context.
 249 * Must be called with ctx->mutex and ctx->lock held.
 250 */
 251static void
 252list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 253{
 254        struct perf_event *group_leader = event->group_leader;
 255
 256        /*
 257         * Depending on whether it is a standalone or sibling event,
 258         * add it straight to the context's event list, or to the group
 259         * leader's sibling list:
 260         */
 261        if (group_leader == event)
 262                list_add_tail(&event->group_entry, &ctx->group_list);
 263        else {
 264                list_add_tail(&event->group_entry, &group_leader->sibling_list);
 265                group_leader->nr_siblings++;
 266        }
 267
 268        list_add_rcu(&event->event_entry, &ctx->event_list);
 269        ctx->nr_events++;
 270        if (event->attr.inherit_stat)
 271                ctx->nr_stat++;
 272}
 273
 274/*
  275 * Remove an event from the lists for its context.
 276 * Must be called with ctx->mutex and ctx->lock held.
 277 */
 278static void
 279list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 280{
 281        struct perf_event *sibling, *tmp;
 282
 283        if (list_empty(&event->group_entry))
 284                return;
 285        ctx->nr_events--;
 286        if (event->attr.inherit_stat)
 287                ctx->nr_stat--;
 288
 289        list_del_init(&event->group_entry);
 290        list_del_rcu(&event->event_entry);
 291
 292        if (event->group_leader != event)
 293                event->group_leader->nr_siblings--;
 294
 295        /*
 296         * If this was a group event with sibling events then
 297         * upgrade the siblings to singleton events by adding them
 298         * to the context list directly:
 299         */
 300        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 301
 302                list_move_tail(&sibling->group_entry, &ctx->group_list);
 303                sibling->group_leader = sibling;
 304        }
 305}
 306
 307static void
 308event_sched_out(struct perf_event *event,
 309                  struct perf_cpu_context *cpuctx,
 310                  struct perf_event_context *ctx)
 311{
 312        if (event->state != PERF_EVENT_STATE_ACTIVE)
 313                return;
 314
 315        event->state = PERF_EVENT_STATE_INACTIVE;
 316        if (event->pending_disable) {
 317                event->pending_disable = 0;
 318                event->state = PERF_EVENT_STATE_OFF;
 319        }
 320        event->tstamp_stopped = ctx->time;
 321        event->pmu->disable(event);
 322        event->oncpu = -1;
 323
 324        if (!is_software_event(event))
 325                cpuctx->active_oncpu--;
 326        ctx->nr_active--;
 327        if (event->attr.exclusive || !cpuctx->active_oncpu)
 328                cpuctx->exclusive = 0;
 329}
 330
 331static void
 332group_sched_out(struct perf_event *group_event,
 333                struct perf_cpu_context *cpuctx,
 334                struct perf_event_context *ctx)
 335{
 336        struct perf_event *event;
 337
 338        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 339                return;
 340
 341        event_sched_out(group_event, cpuctx, ctx);
 342
 343        /*
 344         * Schedule out siblings (if any):
 345         */
 346        list_for_each_entry(event, &group_event->sibling_list, group_entry)
 347                event_sched_out(event, cpuctx, ctx);
 348
 349        if (group_event->attr.exclusive)
 350                cpuctx->exclusive = 0;
 351}
 352
 353/*
 354 * Cross CPU call to remove a performance event
 355 *
 356 * We disable the event on the hardware level first. After that we
 357 * remove it from the context list.
 358 */
 359static void __perf_event_remove_from_context(void *info)
 360{
 361        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 362        struct perf_event *event = info;
 363        struct perf_event_context *ctx = event->ctx;
 364
 365        /*
 366         * If this is a task context, we need to check whether it is
 367         * the current task context of this cpu. If not it has been
 368         * scheduled out before the smp call arrived.
 369         */
 370        if (ctx->task && cpuctx->task_ctx != ctx)
 371                return;
 372
 373        spin_lock(&ctx->lock);
 374        /*
 375         * Protect the list operation against NMI by disabling the
 376         * events on a global level.
 377         */
 378        perf_disable();
 379
 380        event_sched_out(event, cpuctx, ctx);
 381
 382        list_del_event(event, ctx);
 383
 384        if (!ctx->task) {
 385                /*
 386                 * Allow more per task events with respect to the
 387                 * reservation:
 388                 */
 389                cpuctx->max_pertask =
 390                        min(perf_max_events - ctx->nr_events,
 391                            perf_max_events - perf_reserved_percpu);
 392        }
 393
 394        perf_enable();
 395        spin_unlock(&ctx->lock);
 396}
 397
 398
 399/*
 400 * Remove the event from a task's (or a CPU's) list of events.
 401 *
 402 * Must be called with ctx->mutex held.
 403 *
 404 * CPU events are removed with a smp call. For task events we only
 405 * call when the task is on a CPU.
 406 *
 407 * If event->ctx is a cloned context, callers must make sure that
 408 * every task struct that event->ctx->task could possibly point to
 409 * remains valid.  This is OK when called from perf_release since
 410 * that only calls us on the top-level context, which can't be a clone.
 411 * When called from perf_event_exit_task, it's OK because the
 412 * context has been detached from its task.
 413 */
 414static void perf_event_remove_from_context(struct perf_event *event)
 415{
 416        struct perf_event_context *ctx = event->ctx;
 417        struct task_struct *task = ctx->task;
 418
 419        if (!task) {
 420                /*
 421                 * Per cpu events are removed via an smp call and
  422                 * the removal is always successful.
 423                 */
 424                smp_call_function_single(event->cpu,
 425                                         __perf_event_remove_from_context,
 426                                         event, 1);
 427                return;
 428        }
 429
 430retry:
 431        task_oncpu_function_call(task, __perf_event_remove_from_context,
 432                                 event);
 433
 434        spin_lock_irq(&ctx->lock);
 435        /*
 436         * If the context is active we need to retry the smp call.
 437         */
 438        if (ctx->nr_active && !list_empty(&event->group_entry)) {
 439                spin_unlock_irq(&ctx->lock);
 440                goto retry;
 441        }
 442
 443        /*
  444         * The lock prevents this context from being scheduled in, so we
  445         * can remove the event safely if the call above did not
 446         * succeed.
 447         */
 448        if (!list_empty(&event->group_entry)) {
 449                list_del_event(event, ctx);
 450        }
 451        spin_unlock_irq(&ctx->lock);
 452}
 453
 454static inline u64 perf_clock(void)
 455{
 456        return cpu_clock(smp_processor_id());
 457}
 458
 459/*
 460 * Update the record of the current time in a context.
 461 */
 462static void update_context_time(struct perf_event_context *ctx)
 463{
 464        u64 now = perf_clock();
 465
 466        ctx->time += now - ctx->timestamp;
 467        ctx->timestamp = now;
 468}
 469
 470/*
  471 * Update the total_time_enabled and total_time_running fields for an event.
 472 */
 473static void update_event_times(struct perf_event *event)
 474{
 475        struct perf_event_context *ctx = event->ctx;
 476        u64 run_end;
 477
 478        if (event->state < PERF_EVENT_STATE_INACTIVE ||
 479            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 480                return;
 481
 482        event->total_time_enabled = ctx->time - event->tstamp_enabled;
 483
 484        if (event->state == PERF_EVENT_STATE_INACTIVE)
 485                run_end = event->tstamp_stopped;
 486        else
 487                run_end = ctx->time;
 488
 489        event->total_time_running = run_end - event->tstamp_running;
 490}
 491
 492/*
 493 * Update total_time_enabled and total_time_running for all events in a group.
 494 */
 495static void update_group_times(struct perf_event *leader)
 496{
 497        struct perf_event *event;
 498
 499        update_event_times(leader);
 500        list_for_each_entry(event, &leader->sibling_list, group_entry)
 501                update_event_times(event);
 502}
 503
 504/*
 505 * Cross CPU call to disable a performance event
 506 */
 507static void __perf_event_disable(void *info)
 508{
 509        struct perf_event *event = info;
 510        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 511        struct perf_event_context *ctx = event->ctx;
 512
 513        /*
  514         * If this is a per-task event, we need to check whether this
 515         * event's task is the current task on this cpu.
 516         */
 517        if (ctx->task && cpuctx->task_ctx != ctx)
 518                return;
 519
 520        spin_lock(&ctx->lock);
 521
 522        /*
 523         * If the event is on, turn it off.
 524         * If it is in error state, leave it in error state.
 525         */
 526        if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 527                update_context_time(ctx);
 528                update_group_times(event);
 529                if (event == event->group_leader)
 530                        group_sched_out(event, cpuctx, ctx);
 531                else
 532                        event_sched_out(event, cpuctx, ctx);
 533                event->state = PERF_EVENT_STATE_OFF;
 534        }
 535
 536        spin_unlock(&ctx->lock);
 537}
 538
 539/*
  540 * Disable an event.
 541 *
 542 * If event->ctx is a cloned context, callers must make sure that
 543 * every task struct that event->ctx->task could possibly point to
  544 * remains valid.  This condition is satisfied when called through
 545 * perf_event_for_each_child or perf_event_for_each because they
 546 * hold the top-level event's child_mutex, so any descendant that
 547 * goes to exit will block in sync_child_event.
 548 * When called from perf_pending_event it's OK because event->ctx
 549 * is the current context on this CPU and preemption is disabled,
 550 * hence we can't get into perf_event_task_sched_out for this context.
 551 */
 552static void perf_event_disable(struct perf_event *event)
 553{
 554        struct perf_event_context *ctx = event->ctx;
 555        struct task_struct *task = ctx->task;
 556
 557        if (!task) {
 558                /*
 559                 * Disable the event on the cpu that it's on
 560                 */
 561                smp_call_function_single(event->cpu, __perf_event_disable,
 562                                         event, 1);
 563                return;
 564        }
 565
 566 retry:
 567        task_oncpu_function_call(task, __perf_event_disable, event);
 568
 569        spin_lock_irq(&ctx->lock);
 570        /*
 571         * If the event is still active, we need to retry the cross-call.
 572         */
 573        if (event->state == PERF_EVENT_STATE_ACTIVE) {
 574                spin_unlock_irq(&ctx->lock);
 575                goto retry;
 576        }
 577
 578        /*
 579         * Since we have the lock this context can't be scheduled
 580         * in, so we can change the state safely.
 581         */
 582        if (event->state == PERF_EVENT_STATE_INACTIVE) {
 583                update_group_times(event);
 584                event->state = PERF_EVENT_STATE_OFF;
 585        }
 586
 587        spin_unlock_irq(&ctx->lock);
 588}
 589
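     /*
      * Schedule a single event onto the PMU.  Returns 0 on success and
      * -EAGAIN when the pmu refuses the event, in which case the event
      * is left INACTIVE with ->oncpu = -1.
      */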
 590static int
 591event_sched_in(struct perf_event *event,
 592                 struct perf_cpu_context *cpuctx,
 593                 struct perf_event_context *ctx,
 594                 int cpu)
 595{
 596        if (event->state <= PERF_EVENT_STATE_OFF)
 597                return 0;
 598
 599        event->state = PERF_EVENT_STATE_ACTIVE;
 600        event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
 601        /*
 602         * The new state must be visible before we turn it on in the hardware:
 603         */
 604        smp_wmb();
 605
 606        if (event->pmu->enable(event)) {
 607                event->state = PERF_EVENT_STATE_INACTIVE;
 608                event->oncpu = -1;
 609                return -EAGAIN;
 610        }
 611
 612        event->tstamp_running += ctx->time - event->tstamp_stopped;
 613
 614        if (!is_software_event(event))
 615                cpuctx->active_oncpu++;
 616        ctx->nr_active++;
 617
 618        if (event->attr.exclusive)
 619                cpuctx->exclusive = 1;
 620
 621        return 0;
 622}
 623
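     /*
      * Schedule a group onto the PMU as a single unit: either the
      * leader and all of its siblings go on together, or the partially
      * scheduled group is torn down again and an error is returned.
      */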
 624static int
 625group_sched_in(struct perf_event *group_event,
 626               struct perf_cpu_context *cpuctx,
 627               struct perf_event_context *ctx,
 628               int cpu)
 629{
 630        struct perf_event *event, *partial_group;
 631        int ret;
 632
 633        if (group_event->state == PERF_EVENT_STATE_OFF)
 634                return 0;
 635
 636        ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
 637        if (ret)
 638                return ret < 0 ? ret : 0;
 639
 640        if (event_sched_in(group_event, cpuctx, ctx, cpu))
 641                return -EAGAIN;
 642
 643        /*
 644         * Schedule in siblings as one group (if any):
 645         */
 646        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 647                if (event_sched_in(event, cpuctx, ctx, cpu)) {
 648                        partial_group = event;
 649                        goto group_error;
 650                }
 651        }
 652
 653        return 0;
 654
 655group_error:
 656        /*
 657         * Groups can be scheduled in as one unit only, so undo any
 658         * partial group before returning:
 659         */
 660        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 661                if (event == partial_group)
 662                        break;
 663                event_sched_out(event, cpuctx, ctx);
 664        }
 665        event_sched_out(group_event, cpuctx, ctx);
 666
 667        return -EAGAIN;
 668}
 669
 670/*
 671 * Return 1 for a group consisting entirely of software events,
 672 * 0 if the group contains any hardware events.
 673 */
 674static int is_software_only_group(struct perf_event *leader)
 675{
 676        struct perf_event *event;
 677
 678        if (!is_software_event(leader))
 679                return 0;
 680
 681        list_for_each_entry(event, &leader->sibling_list, group_entry)
 682                if (!is_software_event(event))
 683                        return 0;
 684
 685        return 1;
 686}
 687
 688/*
 689 * Work out whether we can put this event group on the CPU now.
 690 */
 691static int group_can_go_on(struct perf_event *event,
 692                           struct perf_cpu_context *cpuctx,
 693                           int can_add_hw)
 694{
 695        /*
 696         * Groups consisting entirely of software events can always go on.
 697         */
 698        if (is_software_only_group(event))
 699                return 1;
 700        /*
 701         * If an exclusive group is already on, no other hardware
 702         * events can go on.
 703         */
 704        if (cpuctx->exclusive)
 705                return 0;
 706        /*
 707         * If this group is exclusive and there are already
 708         * events on the CPU, it can't go on.
 709         */
 710        if (event->attr.exclusive && cpuctx->active_oncpu)
 711                return 0;
 712        /*
 713         * Otherwise, try to add it if all previous groups were able
 714         * to go on.
 715         */
 716        return can_add_hw;
 717}
 718
 719static void add_event_to_ctx(struct perf_event *event,
 720                               struct perf_event_context *ctx)
 721{
 722        list_add_event(event, ctx);
 723        event->tstamp_enabled = ctx->time;
 724        event->tstamp_running = ctx->time;
 725        event->tstamp_stopped = ctx->time;
 726}
 727
 728/*
 729 * Cross CPU call to install and enable a performance event
 730 *
 731 * Must be called with ctx->mutex held
 732 */
 733static void __perf_install_in_context(void *info)
 734{
 735        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 736        struct perf_event *event = info;
 737        struct perf_event_context *ctx = event->ctx;
 738        struct perf_event *leader = event->group_leader;
 739        int cpu = smp_processor_id();
 740        int err;
 741
 742        /*
 743         * If this is a task context, we need to check whether it is
 744         * the current task context of this cpu. If not it has been
 745         * scheduled out before the smp call arrived.
 746         * Or possibly this is the right context but it isn't
 747         * on this cpu because it had no events.
 748         */
 749        if (ctx->task && cpuctx->task_ctx != ctx) {
 750                if (cpuctx->task_ctx || ctx->task != current)
 751                        return;
 752                cpuctx->task_ctx = ctx;
 753        }
 754
 755        spin_lock(&ctx->lock);
 756        ctx->is_active = 1;
 757        update_context_time(ctx);
 758
 759        /*
 760         * Protect the list operation against NMI by disabling the
 761         * events on a global level. NOP for non NMI based events.
 762         */
 763        perf_disable();
 764
 765        add_event_to_ctx(event, ctx);
 766
 767        /*
 768         * Don't put the event on if it is disabled or if
 769         * it is in a group and the group isn't on.
 770         */
 771        if (event->state != PERF_EVENT_STATE_INACTIVE ||
 772            (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 773                goto unlock;
 774
 775        /*
 776         * An exclusive event can't go on if there are already active
 777         * hardware events, and no hardware event can go on if there
 778         * is already an exclusive event on.
 779         */
 780        if (!group_can_go_on(event, cpuctx, 1))
 781                err = -EEXIST;
 782        else
 783                err = event_sched_in(event, cpuctx, ctx, cpu);
 784
 785        if (err) {
 786                /*
 787                 * This event couldn't go on.  If it is in a group
 788                 * then we have to pull the whole group off.
 789                 * If the event group is pinned then put it in error state.
 790                 */
 791                if (leader != event)
 792                        group_sched_out(leader, cpuctx, ctx);
 793                if (leader->attr.pinned) {
 794                        update_group_times(leader);
 795                        leader->state = PERF_EVENT_STATE_ERROR;
 796                }
 797        }
 798
 799        if (!err && !ctx->task && cpuctx->max_pertask)
 800                cpuctx->max_pertask--;
 801
 802 unlock:
 803        perf_enable();
 804
 805        spin_unlock(&ctx->lock);
 806}
 807
 808/*
 809 * Attach a performance event to a context
 810 *
 811 * First we add the event to the list with the hardware enable bit
 812 * in event->hw_config cleared.
 813 *
 814 * If the event is attached to a task which is on a CPU we use a smp
 815 * call to enable it in the task context. The task might have been
 816 * scheduled away, but we check this in the smp call again.
 817 *
 818 * Must be called with ctx->mutex held.
 819 */
 820static void
 821perf_install_in_context(struct perf_event_context *ctx,
 822                        struct perf_event *event,
 823                        int cpu)
 824{
 825        struct task_struct *task = ctx->task;
 826
 827        if (!task) {
 828                /*
 829                 * Per cpu events are installed via an smp call and
  830                 * the install is always successful.
 831                 */
 832                smp_call_function_single(cpu, __perf_install_in_context,
 833                                         event, 1);
 834                return;
 835        }
 836
 837retry:
 838        task_oncpu_function_call(task, __perf_install_in_context,
 839                                 event);
 840
 841        spin_lock_irq(&ctx->lock);
 842        /*
  843         * If the context is active and the event hasn't been added, we need to retry the smp call.
 844         */
 845        if (ctx->is_active && list_empty(&event->group_entry)) {
 846                spin_unlock_irq(&ctx->lock);
 847                goto retry;
 848        }
 849
 850        /*
  851         * The lock prevents this context from being scheduled in, so we
  852         * can add the event safely if the call above did not
 853         * succeed.
 854         */
 855        if (list_empty(&event->group_entry))
 856                add_event_to_ctx(event, ctx);
 857        spin_unlock_irq(&ctx->lock);
 858}
 859
 860/*
  861 * Put an event into inactive state and update time fields.
 862 * Enabling the leader of a group effectively enables all
 863 * the group members that aren't explicitly disabled, so we
 864 * have to update their ->tstamp_enabled also.
 865 * Note: this works for group members as well as group leaders
 866 * since the non-leader members' sibling_lists will be empty.
 867 */
 868static void __perf_event_mark_enabled(struct perf_event *event,
 869                                        struct perf_event_context *ctx)
 870{
 871        struct perf_event *sub;
 872
 873        event->state = PERF_EVENT_STATE_INACTIVE;
 874        event->tstamp_enabled = ctx->time - event->total_time_enabled;
 875        list_for_each_entry(sub, &event->sibling_list, group_entry)
 876                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 877                        sub->tstamp_enabled =
 878                                ctx->time - sub->total_time_enabled;
 879}
 880
 881/*
 882 * Cross CPU call to enable a performance event
 883 */
 884static void __perf_event_enable(void *info)
 885{
 886        struct perf_event *event = info;
 887        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 888        struct perf_event_context *ctx = event->ctx;
 889        struct perf_event *leader = event->group_leader;
 890        int err;
 891
 892        /*
  893         * If this is a per-task event, we need to check whether this
 894         * event's task is the current task on this cpu.
 895         */
 896        if (ctx->task && cpuctx->task_ctx != ctx) {
 897                if (cpuctx->task_ctx || ctx->task != current)
 898                        return;
 899                cpuctx->task_ctx = ctx;
 900        }
 901
 902        spin_lock(&ctx->lock);
 903        ctx->is_active = 1;
 904        update_context_time(ctx);
 905
 906        if (event->state >= PERF_EVENT_STATE_INACTIVE)
 907                goto unlock;
 908        __perf_event_mark_enabled(event, ctx);
 909
 910        /*
 911         * If the event is in a group and isn't the group leader,
 912         * then don't put it on unless the group is on.
 913         */
 914        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 915                goto unlock;
 916
 917        if (!group_can_go_on(event, cpuctx, 1)) {
 918                err = -EEXIST;
 919        } else {
 920                perf_disable();
 921                if (event == leader)
 922                        err = group_sched_in(event, cpuctx, ctx,
 923                                             smp_processor_id());
 924                else
 925                        err = event_sched_in(event, cpuctx, ctx,
 926                                               smp_processor_id());
 927                perf_enable();
 928        }
 929
 930        if (err) {
 931                /*
 932                 * If this event can't go on and it's part of a
 933                 * group, then the whole group has to come off.
 934                 */
 935                if (leader != event)
 936                        group_sched_out(leader, cpuctx, ctx);
 937                if (leader->attr.pinned) {
 938                        update_group_times(leader);
 939                        leader->state = PERF_EVENT_STATE_ERROR;
 940                }
 941        }
 942
 943 unlock:
 944        spin_unlock(&ctx->lock);
 945}
 946
 947/*
  948 * Enable an event.
 949 *
 950 * If event->ctx is a cloned context, callers must make sure that
 951 * every task struct that event->ctx->task could possibly point to
 952 * remains valid.  This condition is satisfied when called through
 953 * perf_event_for_each_child or perf_event_for_each as described
 954 * for perf_event_disable.
 955 */
 956static void perf_event_enable(struct perf_event *event)
 957{
 958        struct perf_event_context *ctx = event->ctx;
 959        struct task_struct *task = ctx->task;
 960
 961        if (!task) {
 962                /*
 963                 * Enable the event on the cpu that it's on
 964                 */
 965                smp_call_function_single(event->cpu, __perf_event_enable,
 966                                         event, 1);
 967                return;
 968        }
 969
 970        spin_lock_irq(&ctx->lock);
 971        if (event->state >= PERF_EVENT_STATE_INACTIVE)
 972                goto out;
 973
 974        /*
 975         * If the event is in error state, clear that first.
 976         * That way, if we see the event in error state below, we
 977         * know that it has gone back into error state, as distinct
 978         * from the task having been scheduled away before the
 979         * cross-call arrived.
 980         */
 981        if (event->state == PERF_EVENT_STATE_ERROR)
 982                event->state = PERF_EVENT_STATE_OFF;
 983
 984 retry:
 985        spin_unlock_irq(&ctx->lock);
 986        task_oncpu_function_call(task, __perf_event_enable, event);
 987
 988        spin_lock_irq(&ctx->lock);
 989
 990        /*
 991         * If the context is active and the event is still off,
 992         * we need to retry the cross-call.
 993         */
 994        if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
 995                goto retry;
 996
 997        /*
 998         * Since we have the lock this context can't be scheduled
 999         * in, so we can change the state safely.
1000         */
1001        if (event->state == PERF_EVENT_STATE_OFF)
1002                __perf_event_mark_enabled(event, ctx);
1003
1004 out:
1005        spin_unlock_irq(&ctx->lock);
1006}
1007
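     /*
      * Arm an event for a further 'refresh' overflows before it gets
      * disabled again, and (re-)enable it.  Not supported for
      * inherited events.
      */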
1008static int perf_event_refresh(struct perf_event *event, int refresh)
1009{
1010        /*
1011         * not supported on inherited events
1012         */
1013        if (event->attr.inherit)
1014                return -EINVAL;
1015
1016        atomic_add(refresh, &event->event_limit);
1017        perf_event_enable(event);
1018
1019        return 0;
1020}
1021
1022void __perf_event_sched_out(struct perf_event_context *ctx,
1023                              struct perf_cpu_context *cpuctx)
1024{
1025        struct perf_event *event;
1026
1027        spin_lock(&ctx->lock);
1028        ctx->is_active = 0;
1029        if (likely(!ctx->nr_events))
1030                goto out;
1031        update_context_time(ctx);
1032
1033        perf_disable();
1034        if (ctx->nr_active)
1035                list_for_each_entry(event, &ctx->group_list, group_entry)
1036                        group_sched_out(event, cpuctx, ctx);
1037
1038        perf_enable();
1039 out:
1040        spin_unlock(&ctx->lock);
1041}
1042
1043/*
1044 * Test whether two contexts are equivalent, i.e. whether they
1045 * have both been cloned from the same version of the same context
1046 * and they both have the same number of enabled events.
1047 * If the number of enabled events is the same, then the set
1048 * of enabled events should be the same, because these are both
1049 * inherited contexts, therefore we can't access individual events
1050 * in them directly with an fd; we can only enable/disable all
1051 * events via prctl, or enable/disable all events in a family
1052 * via ioctl, which will have the same effect on both contexts.
1053 */
1054static int context_equiv(struct perf_event_context *ctx1,
1055                         struct perf_event_context *ctx2)
1056{
1057        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1058                && ctx1->parent_gen == ctx2->parent_gen
1059                && !ctx1->pin_count && !ctx2->pin_count;
1060}
1061
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event,
1065                                     struct perf_event *next_event)
1066{
1067        u64 value;
1068
1069        if (!event->attr.inherit_stat)
1070                return;
1071
1072        /*
1073         * Update the event value, we cannot use perf_event_read()
1074         * because we're in the middle of a context switch and have IRQs
1075         * disabled, which upsets smp_call_function_single(), however
1076         * we know the event must be on the current CPU, therefore we
1077         * don't need to use it.
1078         */
1079        switch (event->state) {
1080        case PERF_EVENT_STATE_ACTIVE:
1081                __perf_event_read(event);
1082                break;
1083
1084        case PERF_EVENT_STATE_INACTIVE:
1085                update_event_times(event);
1086                break;
1087
1088        default:
1089                break;
1090        }
1091
1092        /*
1093         * In order to keep per-task stats reliable we need to flip the event
1094         * values when we flip the contexts.
1095         */
1096        value = atomic64_read(&next_event->count);
1097        value = atomic64_xchg(&event->count, value);
1098        atomic64_set(&next_event->count, value);
1099
1100        swap(event->total_time_enabled, next_event->total_time_enabled);
1101        swap(event->total_time_running, next_event->total_time_running);
1102
1103        /*
1104         * Since we swizzled the values, update the user visible data too.
1105         */
1106        perf_event_update_userpage(event);
1107        perf_event_update_userpage(next_event);
1108}
1109
1110#define list_next_entry(pos, member) \
1111        list_entry(pos->member.next, typeof(*pos), member)
1112
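     /*
      * Walk the event lists of the two cloned contexts in lock-step and
      * sync the inherit_stat counts of corresponding events, so per-task
      * statistics stay with the task across the optimized context switch
      * in perf_event_task_sched_out().
      */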
1113static void perf_event_sync_stat(struct perf_event_context *ctx,
1114                                   struct perf_event_context *next_ctx)
1115{
1116        struct perf_event *event, *next_event;
1117
1118        if (!ctx->nr_stat)
1119                return;
1120
1121        event = list_first_entry(&ctx->event_list,
1122                                   struct perf_event, event_entry);
1123
1124        next_event = list_first_entry(&next_ctx->event_list,
1125                                        struct perf_event, event_entry);
1126
1127        while (&event->event_entry != &ctx->event_list &&
1128               &next_event->event_entry != &next_ctx->event_list) {
1129
1130                __perf_event_sync_stat(event, next_event);
1131
1132                event = list_next_entry(event, event_entry);
1133                next_event = list_next_entry(next_event, event_entry);
1134        }
1135}
1136
1137/*
1138 * Called from scheduler to remove the events of the current task,
1139 * with interrupts disabled.
1140 *
1141 * We stop each event and update the event value in event->count.
1142 *
1143 * This does not protect us against NMI, but disable()
1144 * sets the disabled bit in the control field of event _before_
1145 * accessing the event control register. If a NMI hits, then it will
1146 * not restart the event.
1147 */
1148void perf_event_task_sched_out(struct task_struct *task,
1149                                 struct task_struct *next, int cpu)
1150{
1151        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1152        struct perf_event_context *ctx = task->perf_event_ctxp;
1153        struct perf_event_context *next_ctx;
1154        struct perf_event_context *parent;
1155        struct pt_regs *regs;
1156        int do_switch = 1;
1157
1158        regs = task_pt_regs(task);
1159        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1160
1161        if (likely(!ctx || !cpuctx->task_ctx))
1162                return;
1163
1164        update_context_time(ctx);
1165
1166        rcu_read_lock();
1167        parent = rcu_dereference(ctx->parent_ctx);
1168        next_ctx = next->perf_event_ctxp;
1169        if (parent && next_ctx &&
1170            rcu_dereference(next_ctx->parent_ctx) == parent) {
1171                /*
1172                 * Looks like the two contexts are clones, so we might be
1173                 * able to optimize the context switch.  We lock both
1174                 * contexts and check that they are clones under the
1175                 * lock (including re-checking that neither has been
1176                 * uncloned in the meantime).  It doesn't matter which
1177                 * order we take the locks because no other cpu could
1178                 * be trying to lock both of these tasks.
1179                 */
1180                spin_lock(&ctx->lock);
1181                spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1182                if (context_equiv(ctx, next_ctx)) {
1183                        /*
1184                         * XXX do we need a memory barrier of sorts
1185                         * wrt to rcu_dereference() of perf_event_ctxp
1186                         */
1187                        task->perf_event_ctxp = next_ctx;
1188                        next->perf_event_ctxp = ctx;
1189                        ctx->task = next;
1190                        next_ctx->task = task;
1191                        do_switch = 0;
1192
1193                        perf_event_sync_stat(ctx, next_ctx);
1194                }
1195                spin_unlock(&next_ctx->lock);
1196                spin_unlock(&ctx->lock);
1197        }
1198        rcu_read_unlock();
1199
1200        if (do_switch) {
1201                __perf_event_sched_out(ctx, cpuctx);
1202                cpuctx->task_ctx = NULL;
1203        }
1204}
1205
1206/*
1207 * Called with IRQs disabled
1208 */
1209static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1210{
1211        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1212
1213        if (!cpuctx->task_ctx)
1214                return;
1215
1216        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1217                return;
1218
1219        __perf_event_sched_out(ctx, cpuctx);
1220        cpuctx->task_ctx = NULL;
1221}
1222
1223/*
1224 * Called with IRQs disabled
1225 */
1226static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1227{
1228        __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1229}
1230
1231static void
1232__perf_event_sched_in(struct perf_event_context *ctx,
1233                        struct perf_cpu_context *cpuctx, int cpu)
1234{
1235        struct perf_event *event;
1236        int can_add_hw = 1;
1237
1238        spin_lock(&ctx->lock);
1239        ctx->is_active = 1;
1240        if (likely(!ctx->nr_events))
1241                goto out;
1242
1243        ctx->timestamp = perf_clock();
1244
1245        perf_disable();
1246
1247        /*
1248         * First go through the list and put on any pinned groups
1249         * in order to give them the best chance of going on.
1250         */
1251        list_for_each_entry(event, &ctx->group_list, group_entry) {
1252                if (event->state <= PERF_EVENT_STATE_OFF ||
1253                    !event->attr.pinned)
1254                        continue;
1255                if (event->cpu != -1 && event->cpu != cpu)
1256                        continue;
1257
1258                if (group_can_go_on(event, cpuctx, 1))
1259                        group_sched_in(event, cpuctx, ctx, cpu);
1260
1261                /*
1262                 * If this pinned group hasn't been scheduled,
1263                 * put it in error state.
1264                 */
1265                if (event->state == PERF_EVENT_STATE_INACTIVE) {
1266                        update_group_times(event);
1267                        event->state = PERF_EVENT_STATE_ERROR;
1268                }
1269        }
1270
1271        list_for_each_entry(event, &ctx->group_list, group_entry) {
1272                /*
1273                 * Ignore events in OFF or ERROR state, and
1274                 * ignore pinned events since we did them already.
1275                 */
1276                if (event->state <= PERF_EVENT_STATE_OFF ||
1277                    event->attr.pinned)
1278                        continue;
1279
1280                /*
1281                 * Listen to the 'cpu' scheduling filter constraint
1282                 * of events:
1283                 */
1284                if (event->cpu != -1 && event->cpu != cpu)
1285                        continue;
1286
1287                if (group_can_go_on(event, cpuctx, can_add_hw))
1288                        if (group_sched_in(event, cpuctx, ctx, cpu))
1289                                can_add_hw = 0;
1290        }
1291        perf_enable();
1292 out:
1293        spin_unlock(&ctx->lock);
1294}
1295
1296/*
1297 * Called from scheduler to add the events of the current task
1298 * with interrupts disabled.
1299 *
1300 * We restore the event value and then enable it.
1301 *
1302 * This does not protect us against NMI, but enable()
1303 * sets the enabled bit in the control field of event _before_
1304 * accessing the event control register. If a NMI hits, then it will
1305 * keep the event running.
1306 */
1307void perf_event_task_sched_in(struct task_struct *task, int cpu)
1308{
1309        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1310        struct perf_event_context *ctx = task->perf_event_ctxp;
1311
1312        if (likely(!ctx))
1313                return;
1314        if (cpuctx->task_ctx == ctx)
1315                return;
1316        __perf_event_sched_in(ctx, cpuctx, cpu);
1317        cpuctx->task_ctx = ctx;
1318}
1319
1320static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1321{
1322        struct perf_event_context *ctx = &cpuctx->ctx;
1323
1324        __perf_event_sched_in(ctx, cpuctx, cpu);
1325}
1326
1327#define MAX_INTERRUPTS (~0ULL)
1328
1329static void perf_log_throttle(struct perf_event *event, int enable);
1330
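     /*
      * Re-derive the sample period from the observed sample rate:
      * 'events' approximates the number of samples per second, so
      * events * sample_period estimates the counter rate, and dividing
      * by the requested sample_freq gives the period needed to hit the
      * target frequency.  Only 1/8th of the difference is applied per
      * adjustment, as a simple low-pass filter.
      */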
1331static void perf_adjust_period(struct perf_event *event, u64 events)
1332{
1333        struct hw_perf_event *hwc = &event->hw;
1334        u64 period, sample_period;
1335        s64 delta;
1336
1337        events *= hwc->sample_period;
1338        period = div64_u64(events, event->attr.sample_freq);
1339
1340        delta = (s64)(period - hwc->sample_period);
1341        delta = (delta + 7) / 8; /* low pass filter */
1342
1343        sample_period = hwc->sample_period + delta;
1344
1345        if (!sample_period)
1346                sample_period = 1;
1347
1348        hwc->sample_period = sample_period;
1349}
1350
1351static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1352{
1353        struct perf_event *event;
1354        struct hw_perf_event *hwc;
1355        u64 interrupts, freq;
1356
1357        spin_lock(&ctx->lock);
1358        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1359                if (event->state != PERF_EVENT_STATE_ACTIVE)
1360                        continue;
1361
1362                hwc = &event->hw;
1363
1364                interrupts = hwc->interrupts;
1365                hwc->interrupts = 0;
1366
1367                /*
1368                 * unthrottle events on the tick
1369                 */
1370                if (interrupts == MAX_INTERRUPTS) {
1371                        perf_log_throttle(event, 1);
1372                        event->pmu->unthrottle(event);
1373                        interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1374                }
1375
1376                if (!event->attr.freq || !event->attr.sample_freq)
1377                        continue;
1378
1379                /*
1380                 * if the specified freq < HZ then we need to skip ticks
1381                 */
1382                if (event->attr.sample_freq < HZ) {
1383                        freq = event->attr.sample_freq;
1384
1385                        hwc->freq_count += freq;
1386                        hwc->freq_interrupts += interrupts;
1387
1388                        if (hwc->freq_count < HZ)
1389                                continue;
1390
1391                        interrupts = hwc->freq_interrupts;
1392                        hwc->freq_interrupts = 0;
1393                        hwc->freq_count -= HZ;
1394                } else
1395                        freq = HZ;
1396
1397                perf_adjust_period(event, freq * interrupts);
1398
1399                /*
1400                 * In order to avoid being stalled by an (accidental) huge
1401                 * sample period, force reset the sample period if we didn't
1402                 * get any events in this freq period.
1403                 */
1404                if (!interrupts) {
1405                        perf_disable();
1406                        event->pmu->disable(event);
1407                        atomic64_set(&hwc->period_left, 0);
1408                        event->pmu->enable(event);
1409                        perf_enable();
1410                }
1411        }
1412        spin_unlock(&ctx->lock);
1413}
1414
1415/*
1416 * Round-robin a context's events:
1417 */
1418static void rotate_ctx(struct perf_event_context *ctx)
1419{
1420        struct perf_event *event;
1421
1422        if (!ctx->nr_events)
1423                return;
1424
1425        spin_lock(&ctx->lock);
1426        /*
1427         * Rotate the first entry last (works just fine for group events too):
1428         */
1429        perf_disable();
1430        list_for_each_entry(event, &ctx->group_list, group_entry) {
1431                list_move_tail(&event->group_entry, &ctx->group_list);
1432                break;
1433        }
1434        perf_enable();
1435
1436        spin_unlock(&ctx->lock);
1437}
1438
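     /*
      * Timer tick: adjust the periods of frequency-based events, then
      * unschedule, rotate and reschedule both the per-cpu context and
      * the current task's context so that groups which cannot fit on
      * the PMU at the same time get round-robined.
      */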
1439void perf_event_task_tick(struct task_struct *curr, int cpu)
1440{
1441        struct perf_cpu_context *cpuctx;
1442        struct perf_event_context *ctx;
1443
1444        if (!atomic_read(&nr_events))
1445                return;
1446
1447        cpuctx = &per_cpu(perf_cpu_context, cpu);
1448        ctx = curr->perf_event_ctxp;
1449
1450        perf_ctx_adjust_freq(&cpuctx->ctx);
1451        if (ctx)
1452                perf_ctx_adjust_freq(ctx);
1453
1454        perf_event_cpu_sched_out(cpuctx);
1455        if (ctx)
1456                __perf_event_task_sched_out(ctx);
1457
1458        rotate_ctx(&cpuctx->ctx);
1459        if (ctx)
1460                rotate_ctx(ctx);
1461
1462        perf_event_cpu_sched_in(cpuctx, cpu);
1463        if (ctx)
1464                perf_event_task_sched_in(curr, cpu);
1465}
1466
1467/*
1468 * Enable all of a task's events that have been marked enable-on-exec.
1469 * This expects task == current.
1470 */
1471static void perf_event_enable_on_exec(struct task_struct *task)
1472{
1473        struct perf_event_context *ctx;
1474        struct perf_event *event;
1475        unsigned long flags;
1476        int enabled = 0;
1477
1478        local_irq_save(flags);
1479        ctx = task->perf_event_ctxp;
1480        if (!ctx || !ctx->nr_events)
1481                goto out;
1482
1483        __perf_event_task_sched_out(ctx);
1484
1485        spin_lock(&ctx->lock);
1486
1487        list_for_each_entry(event, &ctx->group_list, group_entry) {
1488                if (!event->attr.enable_on_exec)
1489                        continue;
1490                event->attr.enable_on_exec = 0;
1491                if (event->state >= PERF_EVENT_STATE_INACTIVE)
1492                        continue;
1493                __perf_event_mark_enabled(event, ctx);
1494                enabled = 1;
1495        }
1496
1497        /*
1498         * Unclone this context if we enabled any event.
1499         */
1500        if (enabled)
1501                unclone_ctx(ctx);
1502
1503        spin_unlock(&ctx->lock);
1504
1505        perf_event_task_sched_in(task, smp_processor_id());
1506 out:
1507        local_irq_restore(flags);
1508}
1509
1510/*
1511 * Cross CPU call to read the hardware event
1512 */
1513static void __perf_event_read(void *info)
1514{
1515        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516        struct perf_event *event = info;
1517        struct perf_event_context *ctx = event->ctx;
1518        unsigned long flags;
1519
1520        /*
1521         * If this is a task context, we need to check whether it is
1522         * the current task context of this cpu.  If not it has been
1523         * scheduled out before the smp call arrived.  In that case
1524         * event->count would have been updated to a recent sample
1525         * when the event was scheduled out.
1526         */
1527        if (ctx->task && cpuctx->task_ctx != ctx)
1528                return;
1529
1530        local_irq_save(flags);
1531        if (ctx->is_active)
1532                update_context_time(ctx);
1533        event->pmu->read(event);
1534        update_event_times(event);
1535        local_irq_restore(flags);
1536}
1537
1538static u64 perf_event_read(struct perf_event *event)
1539{
1540        /*
1541         * If event is enabled and currently active on a CPU, update the
1542         * value in the event structure:
1543         */
1544        if (event->state == PERF_EVENT_STATE_ACTIVE) {
1545                smp_call_function_single(event->oncpu,
1546                                         __perf_event_read, event, 1);
1547        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1548                update_event_times(event);
1549        }
1550
1551        return atomic64_read(&event->count);
1552}
1553
1554/*
1555 * Initialize the perf_event context in a task_struct:
1556 */
1557static void
1558__perf_event_init_context(struct perf_event_context *ctx,
1559                            struct task_struct *task)
1560{
1561        memset(ctx, 0, sizeof(*ctx));
1562        spin_lock_init(&ctx->lock);
1563        mutex_init(&ctx->mutex);
1564        INIT_LIST_HEAD(&ctx->group_list);
1565        INIT_LIST_HEAD(&ctx->event_list);
1566        atomic_set(&ctx->refcount, 1);
1567        ctx->task = task;
1568}
1569
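     /*
      * Find (and reference) the context a new event should be attached
      * to: the per-cpu context when cpu != -1, otherwise the target
      * task's context, allocating one if the task has none yet.
      */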
1570static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1571{
1572        struct perf_event_context *ctx;
1573        struct perf_cpu_context *cpuctx;
1574        struct task_struct *task;
1575        unsigned long flags;
1576        int err;
1577
1578        /*
1579         * If cpu is not a wildcard then this is a percpu event:
1580         */
1581        if (cpu != -1) {
1582                /* Must be root to operate on a CPU event: */
1583                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1584                        return ERR_PTR(-EACCES);
1585
1586                if (cpu < 0 || cpu > num_possible_cpus())
1587                        return ERR_PTR(-EINVAL);
1588
1589                /*
 1590                 * We could be clever and allow attaching an event to an
1591                 * offline CPU and activate it when the CPU comes up, but
1592                 * that's for later.
1593                 */
1594                if (!cpu_isset(cpu, cpu_online_map))
1595                        return ERR_PTR(-ENODEV);
1596
1597                cpuctx = &per_cpu(perf_cpu_context, cpu);
1598                ctx = &cpuctx->ctx;
1599                get_ctx(ctx);
1600
1601                return ctx;
1602        }
1603
1604        rcu_read_lock();
1605        if (!pid)
1606                task = current;
1607        else
1608                task = find_task_by_vpid(pid);
1609        if (task)
1610                get_task_struct(task);
1611        rcu_read_unlock();
1612
1613        if (!task)
1614                return ERR_PTR(-ESRCH);
1615
1616        /*
1617         * Can't attach events to a dying task.
1618         */
1619        err = -ESRCH;
1620        if (task->flags & PF_EXITING)
1621                goto errout;
1622
1623        /* Reuse ptrace permission checks for now. */
1624        err = -EACCES;
1625        if (!ptrace_may_access(task, PTRACE_MODE_READ))
1626                goto errout;
1627
1628 retry:
1629        ctx = perf_lock_task_context(task, &flags);
1630        if (ctx) {
1631                unclone_ctx(ctx);
1632                spin_unlock_irqrestore(&ctx->lock, flags);
1633        }
1634
1635        if (!ctx) {
1636                ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1637                err = -ENOMEM;
1638                if (!ctx)
1639                        goto errout;
1640                __perf_event_init_context(ctx, task);
1641                get_ctx(ctx);
1642                if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1643                        /*
1644                         * We raced with some other task; use
1645                         * the context they set.
1646                         */
1647                        kfree(ctx);
1648                        goto retry;
1649                }
1650                get_task_struct(task);
1651        }
1652
1653        put_task_struct(task);
1654        return ctx;
1655
1656 errout:
1657        put_task_struct(task);
1658        return ERR_PTR(err);
1659}
1660
1661static void free_event_rcu(struct rcu_head *head)
1662{
1663        struct perf_event *event;
1664
1665        event = container_of(head, struct perf_event, rcu_head);
1666        if (event->ns)
1667                put_pid_ns(event->ns);
1668        kfree(event);
1669}
1670
1671static void perf_pending_sync(struct perf_event *event);
1672
1673static void free_event(struct perf_event *event)
1674{
1675        perf_pending_sync(event);
1676
1677        if (!event->parent) {
1678                atomic_dec(&nr_events);
1679                if (event->attr.mmap)
1680                        atomic_dec(&nr_mmap_events);
1681                if (event->attr.comm)
1682                        atomic_dec(&nr_comm_events);
1683                if (event->attr.task)
1684                        atomic_dec(&nr_task_events);
1685        }
1686
1687        if (event->output) {
1688                fput(event->output->filp);
1689                event->output = NULL;
1690        }
1691
1692        if (event->destroy)
1693                event->destroy(event);
1694
1695        put_ctx(event->ctx);
1696        call_rcu(&event->rcu_head, free_event_rcu);
1697}
1698
1699/*
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{
1704        struct perf_event *event = file->private_data;
1705        struct perf_event_context *ctx = event->ctx;
1706
1707        file->private_data = NULL;
1708
1709        WARN_ON_ONCE(ctx->parent_ctx);
1710        mutex_lock(&ctx->mutex);
1711        perf_event_remove_from_context(event);
1712        mutex_unlock(&ctx->mutex);
1713
1714        mutex_lock(&event->owner->perf_event_mutex);
1715        list_del_init(&event->owner_entry);
1716        mutex_unlock(&event->owner->perf_event_mutex);
1717        put_task_struct(event->owner);
1718
1719        free_event(event);
1720
1721        return 0;
1722}
1723
1724static int perf_event_read_size(struct perf_event *event)
1725{
1726        int entry = sizeof(u64); /* value */
1727        int size = 0;
1728        int nr = 1;
1729
1730        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1731                size += sizeof(u64);
1732
1733        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1734                size += sizeof(u64);
1735
1736        if (event->attr.read_format & PERF_FORMAT_ID)
1737                entry += sizeof(u64);
1738
1739        if (event->attr.read_format & PERF_FORMAT_GROUP) {
1740                nr += event->group_leader->nr_siblings;
1741                size += sizeof(u64);
1742        }
1743
1744        size += entry * nr;
1745
1746        return size;
1747}
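    /*
     * The layout this size corresponds to (see perf_event_read_one() and
     * perf_event_read_group() below), roughly:
     *
     *   !PERF_FORMAT_GROUP:
     *     u64 value; [u64 time_enabled;] [u64 time_running;] [u64 id;]
     *
     *   PERF_FORMAT_GROUP:
     *     u64 nr; [u64 time_enabled;] [u64 time_running;]
     *     { u64 value; [u64 id;] }   -- once per group member
     */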
1748
1749static u64 perf_event_read_value(struct perf_event *event)
1750{
1751        struct perf_event *child;
1752        u64 total = 0;
1753
1754        total += perf_event_read(event);
1755        list_for_each_entry(child, &event->child_list, child_list)
1756                total += perf_event_read(child);
1757
1758        return total;
1759}
1760
1761static int perf_event_read_entry(struct perf_event *event,
1762                                   u64 read_format, char __user *buf)
1763{
1764        int n = 0, count = 0;
1765        u64 values[2];
1766
1767        values[n++] = perf_event_read_value(event);
1768        if (read_format & PERF_FORMAT_ID)
1769                values[n++] = primary_event_id(event);
1770
1771        count = n * sizeof(u64);
1772
1773        if (copy_to_user(buf, values, count))
1774                return -EFAULT;
1775
1776        return count;
1777}
1778
1779static int perf_event_read_group(struct perf_event *event,
1780                                   u64 read_format, char __user *buf)
1781{
1782        struct perf_event *leader = event->group_leader, *sub;
1783        int n = 0, size = 0, err = -EFAULT;
1784        u64 values[3];
1785
1786        values[n++] = 1 + leader->nr_siblings;
1787        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1788                values[n++] = leader->total_time_enabled +
1789                        atomic64_read(&leader->child_total_time_enabled);
1790        }
1791        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1792                values[n++] = leader->total_time_running +
1793                        atomic64_read(&leader->child_total_time_running);
1794        }
1795
1796        size = n * sizeof(u64);
1797
1798        if (copy_to_user(buf, values, size))
1799                return -EFAULT;
1800
1801        err = perf_event_read_entry(leader, read_format, buf + size);
1802        if (err < 0)
1803                return err;
1804
1805        size += err;
1806
1807        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808                err = perf_event_read_entry(sub, read_format,
1809                                buf + size);
1810                if (err < 0)
1811                        return err;
1812
1813                size += err;
1814        }
1815
1816        return size;
1817}
1818
1819static int perf_event_read_one(struct perf_event *event,
1820                                 u64 read_format, char __user *buf)
1821{
1822        u64 values[4];
1823        int n = 0;
1824
1825        values[n++] = perf_event_read_value(event);
1826        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1827                values[n++] = event->total_time_enabled +
1828                        atomic64_read(&event->child_total_time_enabled);
1829        }
1830        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831                values[n++] = event->total_time_running +
1832                        atomic64_read(&event->child_total_time_running);
1833        }
1834        if (read_format & PERF_FORMAT_ID)
1835                values[n++] = primary_event_id(event);
1836
1837        if (copy_to_user(buf, values, n * sizeof(u64)))
1838                return -EFAULT;
1839
1840        return n * sizeof(u64);
1841}
1842
1843/*
1844 * Read the performance event - simple non-blocking version for now
1845 */
1846static ssize_t
1847perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1848{
1849        u64 read_format = event->attr.read_format;
1850        int ret;
1851
1852        /*
1853         * Return end-of-file for a read on an event that is in
1854         * error state (i.e. because it was pinned but it couldn't be
1855         * scheduled on to the CPU at some point).
1856         */
1857        if (event->state == PERF_EVENT_STATE_ERROR)
1858                return 0;
1859
1860        if (count < perf_event_read_size(event))
1861                return -ENOSPC;
1862
1863        WARN_ON_ONCE(event->ctx->parent_ctx);
1864        mutex_lock(&event->child_mutex);
1865        if (read_format & PERF_FORMAT_GROUP)
1866                ret = perf_event_read_group(event, read_format, buf);
1867        else
1868                ret = perf_event_read_one(event, read_format, buf);
1869        mutex_unlock(&event->child_mutex);
1870
1871        return ret;
1872}
1873
1874static ssize_t
1875perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1876{
1877        struct perf_event *event = file->private_data;
1878
1879        return perf_read_hw(event, buf, count);
1880}
1881
1882static unsigned int perf_poll(struct file *file, poll_table *wait)
1883{
1884        struct perf_event *event = file->private_data;
1885        struct perf_mmap_data *data;
1886        unsigned int events = POLLHUP;
1887
1888        rcu_read_lock();
1889        data = rcu_dereference(event->data);
1890        if (data)
1891                events = atomic_xchg(&data->poll, 0);
1892        rcu_read_unlock();
1893
1894        poll_wait(file, &event->waitq, wait);
1895
1896        return events;
1897}
1898
1899static void perf_event_reset(struct perf_event *event)
1900{
1901        (void)perf_event_read(event);
1902        atomic64_set(&event->count, 0);
1903        perf_event_update_userpage(event);
1904}
1905
1906/*
1907 * Holding the top-level event's child_mutex means that any
1908 * descendant process that has inherited this event will block
1909 * in sync_child_event if it goes to exit, thus satisfying the
1910 * task existence requirements of perf_event_enable/disable.
1911 */
1912static void perf_event_for_each_child(struct perf_event *event,
1913                                        void (*func)(struct perf_event *))
1914{
1915        struct perf_event *child;
1916
1917        WARN_ON_ONCE(event->ctx->parent_ctx);
1918        mutex_lock(&event->child_mutex);
1919        func(event);
1920        list_for_each_entry(child, &event->child_list, child_list)
1921                func(child);
1922        mutex_unlock(&event->child_mutex);
1923}
1924
1925static void perf_event_for_each(struct perf_event *event,
1926                                  void (*func)(struct perf_event *))
1927{
1928        struct perf_event_context *ctx = event->ctx;
1929        struct perf_event *sibling;
1930
1931        WARN_ON_ONCE(ctx->parent_ctx);
1932        mutex_lock(&ctx->mutex);
1933        event = event->group_leader;
1934
1935        perf_event_for_each_child(event, func);
1936        /* the leader and its inherited children are handled above */
1937        list_for_each_entry(sibling, &event->sibling_list, group_entry)
1938                perf_event_for_each_child(sibling, func);
1939        mutex_unlock(&ctx->mutex);
1940}
1941
1942static int perf_event_period(struct perf_event *event, u64 __user *arg)
1943{
1944        struct perf_event_context *ctx = event->ctx;
1945        unsigned long size;
1946        int ret = 0;
1947        u64 value;
1948
1949        if (!event->attr.sample_period)
1950                return -EINVAL;
1951
1952        size = copy_from_user(&value, arg, sizeof(value));
1953        if (size)       /* copy_from_user() returns bytes *not* copied */
1954                return -EFAULT;
1955
1956        if (!value)
1957                return -EINVAL;
1958
1959        spin_lock_irq(&ctx->lock);
1960        if (event->attr.freq) {
1961                if (value > sysctl_perf_event_sample_rate) {
1962                        ret = -EINVAL;
1963                        goto unlock;
1964                }
1965
1966                event->attr.sample_freq = value;
1967        } else {
1968                event->attr.sample_period = value;
1969                event->hw.sample_period = value;
1970        }
1971unlock:
1972        spin_unlock_irq(&ctx->lock);
1973
1974        return ret;
1975}
1976
1977int perf_event_set_output(struct perf_event *event, int output_fd);
1978
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{
1981        struct perf_event *event = file->private_data;
1982        void (*func)(struct perf_event *);
1983        u32 flags = arg;
1984
1985        switch (cmd) {
1986        case PERF_EVENT_IOC_ENABLE:
1987                func = perf_event_enable;
1988                break;
1989        case PERF_EVENT_IOC_DISABLE:
1990                func = perf_event_disable;
1991                break;
1992        case PERF_EVENT_IOC_RESET:
1993                func = perf_event_reset;
1994                break;
1995
1996        case PERF_EVENT_IOC_REFRESH:
1997                return perf_event_refresh(event, arg);
1998
1999        case PERF_EVENT_IOC_PERIOD:
2000                return perf_event_period(event, (u64 __user *)arg);
2001
2002        case PERF_EVENT_IOC_SET_OUTPUT:
2003                return perf_event_set_output(event, arg);
2004
2005        default:
2006                return -ENOTTY;
2007        }
2008
2009        if (flags & PERF_IOC_FLAG_GROUP)
2010                perf_event_for_each(event, func);
2011        else
2012                perf_event_for_each_child(event, func);
2013
2014        return 0;
2015}
2016
2017int perf_event_task_enable(void)
2018{
2019        struct perf_event *event;
2020
2021        mutex_lock(&current->perf_event_mutex);
2022        list_for_each_entry(event, &current->perf_event_list, owner_entry)
2023                perf_event_for_each_child(event, perf_event_enable);
2024        mutex_unlock(&current->perf_event_mutex);
2025
2026        return 0;
2027}
2028
2029int perf_event_task_disable(void)
2030{
2031        struct perf_event *event;
2032
2033        mutex_lock(&current->perf_event_mutex);
2034        list_for_each_entry(event, &current->perf_event_list, owner_entry)
2035                perf_event_for_each_child(event, perf_event_disable);
2036        mutex_unlock(&current->perf_event_mutex);
2037
2038        return 0;
2039}
2040
2041#ifndef PERF_EVENT_INDEX_OFFSET
2042# define PERF_EVENT_INDEX_OFFSET 0
2043#endif
2044
2045static int perf_event_index(struct perf_event *event)
2046{
2047        if (event->state != PERF_EVENT_STATE_ACTIVE)
2048                return 0;
2049
2050        return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2051}
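    /*
     * The result lands in userpg->index below: 0 means the count cannot be
     * read directly from user space, anything else selects the hardware
     * counter user space may read itself (on x86, for instance, via
     * rdpmc(index - 1), adding userpg->offset to the result).
     */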
2052
2053/*
2054 * Callers need to ensure there can be no nesting of this function, otherwise
2055 * the seqlock logic goes bad. We cannot serialize this because the arch
2056 * code calls this from NMI context.
2057 */
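    /*
     * The user-space side pairs with the ->lock/barrier() sequence below
     * using a seqcount-style retry loop, roughly:
     *
     *     do {
     *         seq = pc->lock;
     *         barrier();
     *         index  = pc->index;
     *         offset = pc->offset;
     *         ... read pc->time_enabled, pc->time_running ...
     *         barrier();
     *     } while (pc->lock != seq);
     */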
2058void perf_event_update_userpage(struct perf_event *event)
2059{
2060        struct perf_event_mmap_page *userpg;
2061        struct perf_mmap_data *data;
2062
2063        rcu_read_lock();
2064        data = rcu_dereference(event->data);
2065        if (!data)
2066                goto unlock;
2067
2068        userpg = data->user_page;
2069
2070        /*
2071         * Disable preemption so as to not let the corresponding user-space
2072         * spin too long if we get preempted.
2073         */
2074        preempt_disable();
2075        ++userpg->lock;
2076        barrier();
2077        userpg->index = perf_event_index(event);
2078        userpg->offset = atomic64_read(&event->count);
2079        if (event->state == PERF_EVENT_STATE_ACTIVE)
2080                userpg->offset -= atomic64_read(&event->hw.prev_count);
2081
2082        userpg->time_enabled = event->total_time_enabled +
2083                        atomic64_read(&event->child_total_time_enabled);
2084
2085        userpg->time_running = event->total_time_running +
2086                        atomic64_read(&event->child_total_time_running);
2087
2088        barrier();
2089        ++userpg->lock;
2090        preempt_enable();
2091unlock:
2092        rcu_read_unlock();
2093}
2094
2095static unsigned long perf_data_size(struct perf_mmap_data *data)
2096{
2097        return data->nr_pages << (PAGE_SHIFT + data->data_order);
2098}
2099
2100#ifndef CONFIG_PERF_USE_VMALLOC
2101
2102/*
2103 * Back perf_mmap() with regular order-0 GFP_KERNEL pages.
2104 */
2105
2106static struct page *
2107perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2108{
2109        if (pgoff > data->nr_pages)
2110                return NULL;
2111
2112        if (pgoff == 0)
2113                return virt_to_page(data->user_page);
2114
2115        return virt_to_page(data->data_pages[pgoff - 1]);
2116}
2117
2118static struct perf_mmap_data *
2119perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2120{
2121        struct perf_mmap_data *data;
2122        unsigned long size;
2123        int i;
2124
2125        WARN_ON(atomic_read(&event->mmap_count));
2126
2127        size = sizeof(struct perf_mmap_data);
2128        size += nr_pages * sizeof(void *);
2129
2130        data = kzalloc(size, GFP_KERNEL);
2131        if (!data)
2132                goto fail;
2133
2134        data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2135        if (!data->user_page)
2136                goto fail_user_page;
2137
2138        for (i = 0; i < nr_pages; i++) {
2139                data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2140                if (!data->data_pages[i])
2141                        goto fail_data_pages;
2142        }
2143
2144        data->data_order = 0;
2145        data->nr_pages = nr_pages;
2146
2147        return data;
2148
2149fail_data_pages:
2150        for (i--; i >= 0; i--)
2151                free_page((unsigned long)data->data_pages[i]);
2152
2153        free_page((unsigned long)data->user_page);
2154
2155fail_user_page:
2156        kfree(data);
2157
2158fail:
2159        return NULL;
2160}
2161
2162static void perf_mmap_free_page(unsigned long addr)
2163{
2164        struct page *page = virt_to_page((void *)addr);
2165
2166        page->mapping = NULL;
2167        __free_page(page);
2168}
2169
2170static void perf_mmap_data_free(struct perf_mmap_data *data)
2171{
2172        int i;
2173
2174        perf_mmap_free_page((unsigned long)data->user_page);
2175        for (i = 0; i < data->nr_pages; i++)
2176                perf_mmap_free_page((unsigned long)data->data_pages[i]);
2177}
2178
2179#else
2180
2181/*
2182 * Back perf_mmap() with vmalloc memory.
2183 *
2184 * Required for architectures that have d-cache aliasing issues.
2185 */
2186
2187static struct page *
2188perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2189{
2190        if (pgoff > (1UL << data->data_order))
2191                return NULL;
2192
2193        return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2194}
2195
2196static void perf_mmap_unmark_page(void *addr)
2197{
2198        struct page *page = vmalloc_to_page(addr);
2199
2200        page->mapping = NULL;
2201}
2202
2203static void perf_mmap_data_free_work(struct work_struct *work)
2204{
2205        struct perf_mmap_data *data;
2206        void *base;
2207        int i, nr;
2208
2209        data = container_of(work, struct perf_mmap_data, work);
2210        nr = 1 << data->data_order;
2211
2212        base = data->user_page;
2213        for (i = 0; i < nr + 1; i++)
2214                perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215
2216        vfree(base);
2217}
2218
2219static void perf_mmap_data_free(struct perf_mmap_data *data)
2220{
2221        schedule_work(&data->work);
2222}
2223
2224static struct perf_mmap_data *
2225perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2226{
2227        struct perf_mmap_data *data;
2228        unsigned long size;
2229        void *all_buf;
2230
2231        WARN_ON(atomic_read(&event->mmap_count));
2232
2233        size = sizeof(struct perf_mmap_data);
2234        size += sizeof(void *);
2235
2236        data = kzalloc(size, GFP_KERNEL);
2237        if (!data)
2238                goto fail;
2239
2240        INIT_WORK(&data->work, perf_mmap_data_free_work);
2241
2242        all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2243        if (!all_buf)
2244                goto fail_all_buf;
2245
2246        data->user_page = all_buf;
2247        data->data_pages[0] = all_buf + PAGE_SIZE;
2248        data->data_order = ilog2(nr_pages);
2249        data->nr_pages = 1;
2250
2251        return data;
2252
2253fail_all_buf:
2254        kfree(data);
2255
2256fail:
2257        return NULL;
2258}
2259
2260#endif
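    /*
     * Both variants above provide the same interface: perf_mmap_data_alloc(),
     * perf_mmap_data_free() and perf_mmap_to_page(). The vmalloc variant backs
     * the whole buffer (control page plus data) with one contiguous vmalloc
     * area and resolves every pgoff through vmalloc_to_page().
     */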
2261
2262static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2263{
2264        struct perf_event *event = vma->vm_file->private_data;
2265        struct perf_mmap_data *data;
2266        int ret = VM_FAULT_SIGBUS;
2267
2268        if (vmf->flags & FAULT_FLAG_MKWRITE) {
2269                if (vmf->pgoff == 0)
2270                        ret = 0;
2271                return ret;
2272        }
2273
2274        rcu_read_lock();
2275        data = rcu_dereference(event->data);
2276        if (!data)
2277                goto unlock;
2278
2279        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2280                goto unlock;
2281
2282        vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2283        if (!vmf->page)
2284                goto unlock;
2285
2286        get_page(vmf->page);
2287        vmf->page->mapping = vma->vm_file->f_mapping;
2288        vmf->page->index   = vmf->pgoff;
2289
2290        ret = 0;
2291unlock:
2292        rcu_read_unlock();
2293
2294        return ret;
2295}
2296
2297static void
2298perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2299{
2300        long max_size = perf_data_size(data);
2301
2302        atomic_set(&data->lock, -1);
2303
2304        if (event->attr.watermark) {
2305                data->watermark = min_t(long, max_size,
2306                                        event->attr.wakeup_watermark);
2307        }
2308
2309        if (!data->watermark)
2310                data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
2311
2312
2313        rcu_assign_pointer(event->data, data);
2314}
2315
2316static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2317{
2318        struct perf_mmap_data *data;
2319
2320        data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321        perf_mmap_data_free(data);
2322        kfree(data);
2323}
2324
2325static void perf_mmap_data_release(struct perf_event *event)
2326{
2327        struct perf_mmap_data *data = event->data;
2328
2329        WARN_ON(atomic_read(&event->mmap_count));
2330
2331        rcu_assign_pointer(event->data, NULL);
2332        call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2333}
2334
2335static void perf_mmap_open(struct vm_area_struct *vma)
2336{
2337        struct perf_event *event = vma->vm_file->private_data;
2338
2339        atomic_inc(&event->mmap_count);
2340}
2341
2342static void perf_mmap_close(struct vm_area_struct *vma)
2343{
2344        struct perf_event *event = vma->vm_file->private_data;
2345
2346        WARN_ON_ONCE(event->ctx->parent_ctx);
2347        if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2348                unsigned long size = perf_data_size(event->data);
2349                struct user_struct *user = current_user();
2350
2351                atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2352                vma->vm_mm->locked_vm -= event->data->nr_locked;
2353                perf_mmap_data_release(event);
2354                mutex_unlock(&event->mmap_mutex);
2355        }
2356}
2357
2358static const struct vm_operations_struct perf_mmap_vmops = {
2359        .open           = perf_mmap_open,
2360        .close          = perf_mmap_close,
2361        .fault          = perf_mmap_fault,
2362        .page_mkwrite   = perf_mmap_fault,
2363};
2364
2365static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2366{
2367        struct perf_event *event = file->private_data;
2368        unsigned long user_locked, user_lock_limit;
2369        struct user_struct *user = current_user();
2370        unsigned long locked, lock_limit;
2371        struct perf_mmap_data *data;
2372        unsigned long vma_size;
2373        unsigned long nr_pages;
2374        long user_extra, extra;
2375        int ret = 0;
2376
2377        if (!(vma->vm_flags & VM_SHARED))
2378                return -EINVAL;
2379
2380        vma_size = vma->vm_end - vma->vm_start;
2381        nr_pages = (vma_size / PAGE_SIZE) - 1;
2382
2383        /*
2384         * If we have data pages, ensure their number is a power of two so we
2385         * can use bitmasks instead of modulo.
2386         */
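            /*
             * I.e. user space is expected to map one control page plus a
             * power-of-two number of data pages, along the lines of:
             *
             *     mmap(NULL, (1 + 2^n) * page_size,
             *          PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);
             */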
2387        if (nr_pages != 0 && !is_power_of_2(nr_pages))
2388                return -EINVAL;
2389
2390        if (vma_size != PAGE_SIZE * (1 + nr_pages))
2391                return -EINVAL;
2392
2393        if (vma->vm_pgoff != 0)
2394                return -EINVAL;
2395
2396        WARN_ON_ONCE(event->ctx->parent_ctx);
2397        mutex_lock(&event->mmap_mutex);
2398        if (event->output) {
2399                ret = -EINVAL;
2400                goto unlock;
2401        }
2402
2403        if (atomic_inc_not_zero(&event->mmap_count)) {
2404                if (nr_pages != event->data->nr_pages)
2405                        ret = -EINVAL;
2406                goto unlock;
2407        }
2408
2409        user_extra = nr_pages + 1;
2410        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2411
2412        /*
2413         * Increase the limit linearly with more CPUs:
2414         */
2415        user_lock_limit *= num_online_cpus();
2416
2417        user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2418
2419        extra = 0;
2420        if (user_locked > user_lock_limit)
2421                extra = user_locked - user_lock_limit;
2422
2423        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2424        lock_limit >>= PAGE_SHIFT;
2425        locked = vma->vm_mm->locked_vm + extra;
2426
2427        if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2428                !capable(CAP_IPC_LOCK)) {
2429                ret = -EPERM;
2430                goto unlock;
2431        }
2432
2433        WARN_ON(event->data);
2434
2435        data = perf_mmap_data_alloc(event, nr_pages);
2436        ret = -ENOMEM;
2437        if (!data)
2438                goto unlock;
2439
2440        ret = 0;
2441        perf_mmap_data_init(event, data);
2442
2443        atomic_set(&event->mmap_count, 1);
2444        atomic_long_add(user_extra, &user->locked_vm);
2445        vma->vm_mm->locked_vm += extra;
2446        event->data->nr_locked = extra;
2447        if (vma->vm_flags & VM_WRITE)
2448                event->data->writable = 1;
2449
2450unlock:
2451        mutex_unlock(&event->mmap_mutex);
2452
2453        vma->vm_flags |= VM_RESERVED;
2454        vma->vm_ops = &perf_mmap_vmops;
2455
2456        return ret;
2457}
2458
2459static int perf_fasync(int fd, struct file *filp, int on)
2460{
2461        struct inode *inode = filp->f_path.dentry->d_inode;
2462        struct perf_event *event = filp->private_data;
2463        int retval;
2464
2465        mutex_lock(&inode->i_mutex);
2466        retval = fasync_helper(fd, filp, on, &event->fasync);
2467        mutex_unlock(&inode->i_mutex);
2468
2469        if (retval < 0)
2470                return retval;
2471
2472        return 0;
2473}
2474
2475static const struct file_operations perf_fops = {
2476        .release                = perf_release,
2477        .read                   = perf_read,
2478        .poll                   = perf_poll,
2479        .unlocked_ioctl         = perf_ioctl,
2480        .compat_ioctl           = perf_ioctl,
2481        .mmap                   = perf_mmap,
2482        .fasync                 = perf_fasync,
2483};
2484
2485/*
2486 * Perf event wakeup
2487 *
2488 * If there's data, ensure we set the poll() state and publish everything
2489 * to user-space before waking everybody up.
2490 */
2491
2492void perf_event_wakeup(struct perf_event *event)
2493{
2494        wake_up_all(&event->waitq);
2495
2496        if (event->pending_kill) {
2497                kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2498                event->pending_kill = 0;
2499        }
2500}
2501
2502/*
2503 * Pending wakeups
2504 *
2505 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2506 *
2507 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2508 * single linked list and use cmpxchg() to add entries lockless.
2509 */
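    /*
     * Roughly: producers claim an entry with
     *     cmpxchg(&entry->next, NULL, PENDING_TAIL)
     * and push it with
     *     entry->next = *head; cmpxchg(head, entry->next, entry);
     * the consumer takes the whole list at once with
     *     xchg(head, PENDING_TAIL);
     * PENDING_TAIL terminates the list, while a NULL ->next means the entry
     * is not queued anywhere.
     */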
2510
2511static void perf_pending_event(struct perf_pending_entry *entry)
2512{
2513        struct perf_event *event = container_of(entry,
2514                        struct perf_event, pending);
2515
2516        if (event->pending_disable) {
2517                event->pending_disable = 0;
2518                __perf_event_disable(event);
2519        }
2520
2521        if (event->pending_wakeup) {
2522                event->pending_wakeup = 0;
2523                perf_event_wakeup(event);
2524        }
2525}
2526
2527#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2528
2529static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2530        PENDING_TAIL,
2531};
2532
2533static void perf_pending_queue(struct perf_pending_entry *entry,
2534                               void (*func)(struct perf_pending_entry *))
2535{
2536        struct perf_pending_entry **head;
2537
2538        if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2539                return;
2540
2541        entry->func = func;
2542
2543        head = &get_cpu_var(perf_pending_head);
2544
2545        do {
2546                entry->next = *head;
2547        } while (cmpxchg(head, entry->next, entry) != entry->next);
2548
2549        set_perf_event_pending();
2550
2551        put_cpu_var(perf_pending_head);
2552}
2553
2554static int __perf_pending_run(void)
2555{
2556        struct perf_pending_entry *list;
2557        int nr = 0;
2558
2559        list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2560        while (list != PENDING_TAIL) {
2561                void (*func)(struct perf_pending_entry *);
2562                struct perf_pending_entry *entry = list;
2563
2564                list = list->next;
2565
2566                func = entry->func;
2567                entry->next = NULL;
2568                /*
2569                 * Make the unqueue visible before we issue the wakeup,
2570                 * so that the waiter won't be left waiting forever.
2571                 * -- see perf_not_pending().
2572                 */
2573                smp_wmb();
2574
2575                func(entry);
2576                nr++;
2577        }
2578
2579        return nr;
2580}
2581
2582static inline int perf_not_pending(struct perf_event *event)
2583{
2584        /*
2585         * If we flush on whatever cpu we run, there is a chance we don't
2586         * need to wait.
2587         */
2588        get_cpu();
2589        __perf_pending_run();
2590        put_cpu();
2591
2592        /*
2593         * Ensure we see the proper queue state before going to sleep
2594         * so that we do not miss the wakeup. -- see __perf_pending_run()
2595         */
2596        smp_rmb();
2597        return event->pending.next == NULL;
2598}
2599
2600static void perf_pending_sync(struct perf_event *event)
2601{
2602        wait_event(event->waitq, perf_not_pending(event));
2603}
2604
2605void perf_event_do_pending(void)
2606{
2607        __perf_pending_run();
2608}
2609
2610/*
2611 * Callchain support -- arch specific
2612 */
2613
2614__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2615{
2616        return NULL;
2617}
2618
2619/*
2620 * Output
2621 */
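    /*
     * perf_output_space() checks whether an [offset, head) reservation still
     * fits. For read-only mappings the buffer simply wraps and overwrites;
     * for writable mappings (where user space advances data_tail) the new
     * head must not overtake the consumer's tail, hence the tail-relative
     * comparison below.
     */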
2622static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2623                              unsigned long offset, unsigned long head)
2624{
2625        unsigned long mask;
2626
2627        if (!data->writable)
2628                return true;
2629
2630        mask = perf_data_size(data) - 1;
2631
2632        offset = (offset - tail) & mask;
2633        head   = (head   - tail) & mask;
2634
2635        if ((int)(head - offset) < 0)
2636                return false;
2637
2638        return true;
2639}
2640
2641static void perf_output_wakeup(struct perf_output_handle *handle)
2642{
2643        atomic_set(&handle->data->poll, POLLIN);
2644
2645        if (handle->nmi) {
2646                handle->event->pending_wakeup = 1;
2647                perf_pending_queue(&handle->event->pending,
2648                                   perf_pending_event);
2649        } else
2650                perf_event_wakeup(handle->event);
2651}
2652
2653/*
2654 * Curious locking construct.
2655 *
2656 * We need to ensure a later writer doesn't publish a head while an earlier
2657 * writer isn't done writing. However since we need to deal with NMIs we
2658 * cannot fully serialize things.
2659 *
2660 * What we do is serialize between CPUs so we only have to deal with NMI
2661 * nesting on a single CPU.
2662 *
2663 * We only publish the head (and generate a wakeup) when the outer-most
2664 * writer completes.
2665 */
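    /*
     * data->lock holds the owning CPU number while locked, or -1 when free
     * (see perf_mmap_data_init()); an NMI hitting the lock owner is let
     * straight through in perf_output_lock() below.
     */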
2666static void perf_output_lock(struct perf_output_handle *handle)
2667{
2668        struct perf_mmap_data *data = handle->data;
2669        int cpu;
2670
2671        handle->locked = 0;
2672
2673        local_irq_save(handle->flags);
2674        cpu = smp_processor_id();
2675
2676        if (in_nmi() && atomic_read(&data->lock) == cpu)
2677                return;
2678
2679        while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680                cpu_relax();
2681
2682        handle->locked = 1;
2683}
2684
2685static void perf_output_unlock(struct perf_output_handle *handle)
2686{
2687        struct perf_mmap_data *data = handle->data;
2688        unsigned long head;
2689        int cpu;
2690
2691        data->done_head = data->head;
2692
2693        if (!handle->locked)
2694                goto out;
2695
2696again:
2697        /*
2698         * The xchg implies a full barrier that ensures all writes are done
2699         * before we publish the new head, matched by a rmb() in userspace when
2700         * reading this position.
2701         */
2702        while ((head = atomic_long_xchg(&data->done_head, 0)))
2703                data->user_page->data_head = head;
2704
2705        /*
2706         * NMI can happen here, which means we can miss a done_head update.
2707         */
2708
2709        cpu = atomic_xchg(&data->lock, -1);
2710        WARN_ON_ONCE(cpu != smp_processor_id());
2711
2712        /*
2713         * Therefore we have to validate we did not indeed do so.
2714         */
2715        if (unlikely(atomic_long_read(&data->done_head))) {
2716                /*
2717                 * Since we had it locked, we can lock it again.
2718                 */
2719                while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2720                        cpu_relax();
2721
2722                goto again;
2723        }
2724
2725        if (atomic_xchg(&data->wakeup, 0))
2726                perf_output_wakeup(handle);
2727out:
2728        local_irq_restore(handle->flags);
2729}
2730
2731void perf_output_copy(struct perf_output_handle *handle,
2732                      const void *buf, unsigned int len)
2733{
2734        unsigned int pages_mask;
2735        unsigned long offset;
2736        unsigned int size;
2737        void **pages;
2738
2739        offset          = handle->offset;
2740        pages_mask      = handle->data->nr_pages - 1;
2741        pages           = handle->data->data_pages;
2742
2743        do {
2744                unsigned long page_offset;
2745                unsigned long page_size;
2746                int nr;
2747
2748                nr          = (offset >> PAGE_SHIFT) & pages_mask;
2749                page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2750                page_offset = offset & (page_size - 1);
2751                size        = min_t(unsigned int, page_size - page_offset, len);
2752
2753                memcpy(pages[nr] + page_offset, buf, size);
2754
2755                len         -= size;
2756                buf         += size;
2757                offset      += size;
2758        } while (len);
2759
2760        handle->offset = offset;
2761
2762        /*
2763         * Check we didn't copy past our reservation window, taking the
2764         * possible unsigned int wrap into account.
2765         */
2766        WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2767}
2768
2769int perf_output_begin(struct perf_output_handle *handle,
2770                      struct perf_event *event, unsigned int size,
2771                      int nmi, int sample)
2772{
2773        struct perf_event *output_event;
2774        struct perf_mmap_data *data;
2775        unsigned long tail, offset, head;
2776        int have_lost;
2777        struct {
2778                struct perf_event_header header;
2779                u64                      id;
2780                u64                      lost;
2781        } lost_event;
2782
2783        rcu_read_lock();
2784        /*
2785         * For inherited events we send all the output towards the parent.
2786         */
2787        if (event->parent)
2788                event = event->parent;
2789
2790        output_event = rcu_dereference(event->output);
2791        if (output_event)
2792                event = output_event;
2793
2794        data = rcu_dereference(event->data);
2795        if (!data)
2796                goto out;
2797
2798        handle->data    = data;
2799        handle->event   = event;
2800        handle->nmi     = nmi;
2801        handle->sample  = sample;
2802
2803        if (!data->nr_pages)
2804                goto fail;
2805
2806        have_lost = atomic_read(&data->lost);
2807        if (have_lost)
2808                size += sizeof(lost_event);
2809
2810        perf_output_lock(handle);
2811
2812        do {
2813                /*
2814                 * Userspace could choose to issue an mb() before updating the
2815                 * tail pointer, so that all reads are completed before the
2816                 * write is issued.
2817                 */
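                    /*
                     * A user-space consumer does the mirror image, roughly:
                     *
                     *     head = pg->data_head;
                     *     rmb();
                     *     ... consume records in [pg->data_tail, head) ...
                     *     mb();
                     *     pg->data_tail = head;
                     */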
2818                tail = ACCESS_ONCE(data->user_page->data_tail);
2819                smp_rmb();
2820                offset = head = atomic_long_read(&data->head);
2821                head += size;
2822                if (unlikely(!perf_output_space(data, tail, offset, head)))
2823                        goto fail;
2824        } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2825
2826        handle->offset  = offset;
2827        handle->head    = head;
2828
2829        if (head - tail > data->watermark)
2830                atomic_set(&data->wakeup, 1);
2831
2832        if (have_lost) {
2833                lost_event.header.type = PERF_RECORD_LOST;
2834                lost_event.header.misc = 0;
2835                lost_event.header.size = sizeof(lost_event);
2836                lost_event.id          = event->id;
2837                lost_event.lost        = atomic_xchg(&data->lost, 0);
2838
2839                perf_output_put(handle, lost_event);
2840        }
2841
2842        return 0;
2843
2844fail:
2845        atomic_inc(&data->lost);
2846        perf_output_unlock(handle);
2847out:
2848        rcu_read_unlock();
2849
2850        return -ENOSPC;
2851}
2852
2853void perf_output_end(struct perf_output_handle *handle)
2854{
2855        struct perf_event *event = handle->event;
2856        struct perf_mmap_data *data = handle->data;
2857
2858        int wakeup_events = event->attr.wakeup_events;
2859
2860        if (handle->sample && wakeup_events) {
2861                int events = atomic_inc_return(&data->events);
2862                if (events >= wakeup_events) {
2863                        atomic_sub(wakeup_events, &data->events);
2864                        atomic_set(&data->wakeup, 1);
2865                }
2866        }
2867
2868        perf_output_unlock(handle);
2869        rcu_read_unlock();
2870}
2871
2872static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2873{
2874        /*
2875         * only top level events have the pid namespace they were created in
2876         */
2877        if (event->parent)
2878                event = event->parent;
2879
2880        return task_tgid_nr_ns(p, event->ns);
2881}
2882
2883static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2884{
2885        /*
2886         * only top level events have the pid namespace they were created in
2887         */
2888        if (event->parent)
2889                event = event->parent;
2890
2891        return task_pid_nr_ns(p, event->ns);
2892}
2893
2894static void perf_output_read_one(struct perf_output_handle *handle,
2895                                 struct perf_event *event)
2896{
2897        u64 read_format = event->attr.read_format;
2898        u64 values[4];
2899        int n = 0;
2900
2901        values[n++] = atomic64_read(&event->count);
2902        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2903                values[n++] = event->total_time_enabled +
2904                        atomic64_read(&event->child_total_time_enabled);
2905        }
2906        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2907                values[n++] = event->total_time_running +
2908                        atomic64_read(&event->child_total_time_running);
2909        }
2910        if (read_format & PERF_FORMAT_ID)
2911                values[n++] = primary_event_id(event);
2912
2913        perf_output_copy(handle, values, n * sizeof(u64));
2914}
2915
2916/*
2917 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2918 */
2919static void perf_output_read_group(struct perf_output_handle *handle,
2920                            struct perf_event *event)
2921{
2922        struct perf_event *leader = event->group_leader, *sub;
2923        u64 read_format = event->attr.read_format;
2924        u64 values[5];
2925        int n = 0;
2926
2927        values[n++] = 1 + leader->nr_siblings;
2928
2929        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2930                values[n++] = leader->total_time_enabled;
2931
2932        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2933                values[n++] = leader->total_time_running;
2934
2935        if (leader != event)
2936                leader->pmu->read(leader);
2937
2938        values[n++] = atomic64_read(&leader->count);
2939        if (read_format & PERF_FORMAT_ID)
2940                values[n++] = primary_event_id(leader);
2941
2942        perf_output_copy(handle, values, n * sizeof(u64));
2943
2944        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2945                n = 0;
2946
2947                if (sub != event)
2948                        sub->pmu->read(sub);
2949
2950                values[n++] = atomic64_read(&sub->count);
2951                if (read_format & PERF_FORMAT_ID)
2952                        values[n++] = primary_event_id(sub);
2953
2954                perf_output_copy(handle, values, n * sizeof(u64));
2955        }
2956}
2957
2958static void perf_output_read(struct perf_output_handle *handle,
2959                             struct perf_event *event)
2960{
2961        if (event->attr.read_format & PERF_FORMAT_GROUP)
2962                perf_output_read_group(handle, event);
2963        else
2964                perf_output_read_one(handle, event);
2965}
2966
2967void perf_output_sample(struct perf_output_handle *handle,
2968                        struct perf_event_header *header,
2969                        struct perf_sample_data *data,
2970                        struct perf_event *event)
2971{
2972        u64 sample_type = data->type;
2973
2974        perf_output_put(handle, *header);
2975
2976        if (sample_type & PERF_SAMPLE_IP)
2977                perf_output_put(handle, data->ip);
2978
2979        if (sample_type & PERF_SAMPLE_TID)
2980                perf_output_put(handle, data->tid_entry);
2981
2982        if (sample_type & PERF_SAMPLE_TIME)
2983                perf_output_put(handle, data->time);
2984
2985        if (sample_type & PERF_SAMPLE_ADDR)
2986                perf_output_put(handle, data->addr);
2987
2988        if (sample_type & PERF_SAMPLE_ID)
2989                perf_output_put(handle, data->id);
2990
2991        if (sample_type & PERF_SAMPLE_STREAM_ID)
2992                perf_output_put(handle, data->stream_id);
2993
2994        if (sample_type & PERF_SAMPLE_CPU)
2995                perf_output_put(handle, data->cpu_entry);
2996
2997        if (sample_type & PERF_SAMPLE_PERIOD)
2998                perf_output_put(handle, data->period);
2999
3000        if (sample_type & PERF_SAMPLE_READ)
3001                perf_output_read(handle, event);
3002
3003        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3004                if (data->callchain) {
3005                        int size = 1;
3006
3007                        if (data->callchain)
3008                                size += data->callchain->nr;
3009
3010                        size *= sizeof(u64);
3011
3012                        perf_output_copy(handle, data->callchain, size);
3013                } else {
3014                        u64 nr = 0;
3015                        perf_output_put(handle, nr);
3016                }
3017        }
3018
3019        if (sample_type & PERF_SAMPLE_RAW) {
3020                if (data->raw) {
3021                        perf_output_put(handle, data->raw->size);
3022                        perf_output_copy(handle, data->raw->data,
3023                                         data->raw->size);
3024                } else {
3025                        struct {
3026                                u32     size;
3027                                u32     data;
3028                        } raw = {
3029                                .size = sizeof(u32),
3030                                .data = 0,
3031                        };
3032                        perf_output_put(handle, raw);
3033                }
3034        }
3035}
3036
3037void perf_prepare_sample(struct perf_event_header *header,
3038                         struct perf_sample_data *data,
3039                         struct perf_event *event,
3040                         struct pt_regs *regs)
3041{
3042        u64 sample_type = event->attr.sample_type;
3043
3044        data->type = sample_type;
3045
3046        header->type = PERF_RECORD_SAMPLE;
3047        header->size = sizeof(*header);
3048
3049        header->misc = 0;
3050        header->misc |= perf_misc_flags(regs);
3051
3052        if (sample_type & PERF_SAMPLE_IP) {
3053                data->ip = perf_instruction_pointer(regs);
3054
3055                header->size += sizeof(data->ip);
3056        }
3057
3058        if (sample_type & PERF_SAMPLE_TID) {
3059                /* namespace issues */
3060                data->tid_entry.pid = perf_event_pid(event, current);
3061                data->tid_entry.tid = perf_event_tid(event, current);
3062
3063                header->size += sizeof(data->tid_entry);
3064        }
3065
3066        if (sample_type & PERF_SAMPLE_TIME) {
3067                data->time = perf_clock();
3068
3069                header->size += sizeof(data->time);
3070        }
3071
3072        if (sample_type & PERF_SAMPLE_ADDR)
3073                header->size += sizeof(data->addr);
3074
3075        if (sample_type & PERF_SAMPLE_ID) {
3076                data->id = primary_event_id(event);
3077
3078                header->size += sizeof(data->id);
3079        }
3080
3081        if (sample_type & PERF_SAMPLE_STREAM_ID) {
3082                data->stream_id = event->id;
3083
3084                header->size += sizeof(data->stream_id);
3085        }
3086
3087        if (sample_type & PERF_SAMPLE_CPU) {
3088                data->cpu_entry.cpu             = raw_smp_processor_id();
3089                data->cpu_entry.reserved        = 0;
3090
3091                header->size += sizeof(data->cpu_entry);
3092        }
3093
3094        if (sample_type & PERF_SAMPLE_PERIOD)
3095                header->size += sizeof(data->period);
3096
3097        if (sample_type & PERF_SAMPLE_READ)
3098                header->size += perf_event_read_size(event);
3099
3100        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3101                int size = 1;
3102
3103                data->callchain = perf_callchain(regs);
3104
3105                if (data->callchain)
3106                        size += data->callchain->nr;
3107
3108                header->size += size * sizeof(u64);
3109        }
3110
3111        if (sample_type & PERF_SAMPLE_RAW) {
3112                int size = sizeof(u32);
3113
3114                if (data->raw)
3115                        size += data->raw->size;
3116                else
3117                        size += sizeof(u32);
3118
3119                WARN_ON_ONCE(size & (sizeof(u64)-1));
3120                header->size += size;
3121        }
3122}
3123
3124static void perf_event_output(struct perf_event *event, int nmi,
3125                                struct perf_sample_data *data,
3126                                struct pt_regs *regs)
3127{
3128        struct perf_output_handle handle;
3129        struct perf_event_header header;
3130
3131        perf_prepare_sample(&header, data, event, regs);
3132
3133        if (perf_output_begin(&handle, event, header.size, nmi, 1))
3134                return;
3135
3136        perf_output_sample(&handle, &header, data, event);
3137
3138        perf_output_end(&handle);
3139}
3140
3141/*
3142 * read event (PERF_RECORD_READ)
3143 */
3144
3145struct perf_read_event {
3146        struct perf_event_header        header;
3147
3148        u32                             pid;
3149        u32                             tid;
3150};
3151
3152static void
3153perf_event_read_event(struct perf_event *event,
3154                        struct task_struct *task)
3155{
3156        struct perf_output_handle handle;
3157        struct perf_read_event read_event = {
3158                .header = {
3159                        .type = PERF_RECORD_READ,
3160                        .misc = 0,
3161                        .size = sizeof(read_event) + perf_event_read_size(event),
3162                },
3163                .pid = perf_event_pid(event, task),
3164                .tid = perf_event_tid(event, task),
3165        };
3166        int ret;
3167
3168        ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3169        if (ret)
3170                return;
3171
3172        perf_output_put(&handle, read_event);
3173        perf_output_read(&handle, event);
3174
3175        perf_output_end(&handle);
3176}
3177
3178/*
3179 * task tracking -- fork/exit
3180 *
3181 * enabled by: attr.comm | attr.mmap | attr.task
3182 */
3183
3184struct perf_task_event {
3185        struct task_struct              *task;
3186        struct perf_event_context       *task_ctx;
3187
3188        struct {
3189                struct perf_event_header        header;
3190
3191                u32                             pid;
3192                u32                             ppid;
3193                u32                             tid;
3194                u32                             ptid;
3195                u64                             time;
3196        } event_id;
3197};
3198
3199static void perf_event_task_output(struct perf_event *event,
3200                                     struct perf_task_event *task_event)
3201{
3202        struct perf_output_handle handle;
3203        int size;
3204        struct task_struct *task = task_event->task;
3205        int ret;
3206
3207        size  = task_event->event_id.header.size;
3208        ret = perf_output_begin(&handle, event, size, 0, 0);
3209
3210        if (ret)
3211                return;
3212
3213        task_event->event_id.pid = perf_event_pid(event, task);
3214        task_event->event_id.ppid = perf_event_pid(event, current);
3215
3216        task_event->event_id.tid = perf_event_tid(event, task);
3217        task_event->event_id.ptid = perf_event_tid(event, current);
3218
3219        task_event->event_id.time = perf_clock();
3220
3221        perf_output_put(&handle, task_event->event_id);
3222
3223        perf_output_end(&handle);
3224}
3225
3226static int perf_event_task_match(struct perf_event *event)
3227{
3228        if (event->attr.comm || event->attr.mmap || event->attr.task)
3229                return 1;
3230
3231        return 0;
3232}
3233
3234static void perf_event_task_ctx(struct perf_event_context *ctx,
3235                                  struct perf_task_event *task_event)
3236{
3237        struct perf_event *event;
3238
3239        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240                return;
3241
3242        rcu_read_lock();
3243        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244                if (perf_event_task_match(event))
3245                        perf_event_task_output(event, task_event);
3246        }
3247        rcu_read_unlock();
3248}
3249
3250static void perf_event_task_event(struct perf_task_event *task_event)
3251{
3252        struct perf_cpu_context *cpuctx;
3253        struct perf_event_context *ctx = task_event->task_ctx;
3254
3255        cpuctx = &get_cpu_var(perf_cpu_context);
3256        perf_event_task_ctx(&cpuctx->ctx, task_event);
3257        put_cpu_var(perf_cpu_context);
3258
3259        rcu_read_lock();
3260        if (!ctx)
3261                ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3262        if (ctx)
3263                perf_event_task_ctx(ctx, task_event);
3264        rcu_read_unlock();
3265}
3266
3267static void perf_event_task(struct task_struct *task,
3268                              struct perf_event_context *task_ctx,
3269                              int new)
3270{
3271        struct perf_task_event task_event;
3272
3273        if (!atomic_read(&nr_comm_events) &&
3274            !atomic_read(&nr_mmap_events) &&
3275            !atomic_read(&nr_task_events))
3276                return;
3277
3278        task_event = (struct perf_task_event){
3279                .task     = task,
3280                .task_ctx = task_ctx,
3281                .event_id    = {
3282                        .header = {
3283                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3284                                .misc = 0,
3285                                .size = sizeof(task_event.event_id),
3286                        },
3287                        /* .pid  */
3288                        /* .ppid */
3289                        /* .tid  */
3290                        /* .ptid */
3291                },
3292        };
3293
3294        perf_event_task_event(&task_event);
3295}
3296
3297void perf_event_fork(struct task_struct *task)
3298{
3299        perf_event_task(task, NULL, 1);
3300}
3301
3302/*
3303 * comm tracking
3304 */
3305
3306struct perf_comm_event {
3307        struct task_struct      *task;
3308        char                    *comm;
3309        int                     comm_size;
3310
3311        struct {
3312                struct perf_event_header        header;
3313
3314                u32                             pid;
3315                u32                             tid;
3316        } event_id;
3317};
3318
3319static void perf_event_comm_output(struct perf_event *event,
3320                                     struct perf_comm_event *comm_event)
3321{
3322        struct perf_output_handle handle;
3323        int size = comm_event->event_id.header.size;
3324        int ret = perf_output_begin(&handle, event, size, 0, 0);
3325
3326        if (ret)
3327                return;
3328
3329        comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3330        comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3331
3332        perf_output_put(&handle, comm_event->event_id);
3333        perf_output_copy(&handle, comm_event->comm,
3334                                   comm_event->comm_size);
3335        perf_output_end(&handle);
3336}
3337
3338static int perf_event_comm_match(struct perf_event *event)
3339{
3340        if (event->attr.comm)
3341                return 1;
3342
3343        return 0;
3344}
3345
3346static void perf_event_comm_ctx(struct perf_event_context *ctx,
3347                                  struct perf_comm_event *comm_event)
3348{
3349        struct perf_event *event;
3350
3351        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352                return;
3353
3354        rcu_read_lock();
3355        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356                if (perf_event_comm_match(event))
3357                        perf_event_comm_output(event, comm_event);
3358        }
3359        rcu_read_unlock();
3360}
3361
3362static void perf_event_comm_event(struct perf_comm_event *comm_event)
3363{
3364        struct perf_cpu_context *cpuctx;
3365        struct perf_event_context *ctx;
3366        unsigned int size;
3367        char comm[TASK_COMM_LEN];
3368
3369        memset(comm, 0, sizeof(comm));
3370        strncpy(comm, comm_event->task->comm, sizeof(comm));
3371        size = ALIGN(strlen(comm)+1, sizeof(u64));
3372
3373        comm_event->comm = comm;
3374        comm_event->comm_size = size;
3375
3376        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377
3378        cpuctx = &get_cpu_var(perf_cpu_context);
3379        perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380        put_cpu_var(perf_cpu_context);
3381
3382        rcu_read_lock();
3383        /*
3384         * doesn't really matter which of the child contexts the
3385         * event ends up in.
3386         */
3387        ctx = rcu_dereference(current->perf_event_ctxp);
3388        if (ctx)
3389                perf_event_comm_ctx(ctx, comm_event);
3390        rcu_read_unlock();
3391}
3392
3393void perf_event_comm(struct task_struct *task)
3394{
3395        struct perf_comm_event comm_event;
3396
3397        if (task->perf_event_ctxp)
3398                perf_event_enable_on_exec(task);
3399
3400        if (!atomic_read(&nr_comm_events))
3401                return;
3402
3403        comm_event = (struct perf_comm_event){
3404                .task   = task,
3405                /* .comm      */
3406                /* .comm_size */
3407                .event_id  = {
3408                        .header = {
3409                                .type = PERF_RECORD_COMM,
3410                                .misc = 0,
3411                                /* .size */
3412                        },
3413                        /* .pid */
3414                        /* .tid */
3415                },
3416        };
3417
3418        perf_event_comm_event(&comm_event);
3419}
3420
3421/*
3422 * mmap tracking
3423 */
3424
3425struct perf_mmap_event {
3426        struct vm_area_struct   *vma;
3427
3428        const char              *file_name;
3429        int                     file_size;
3430
3431        struct {
3432                struct perf_event_header        header;
3433
3434                u32                             pid;
3435                u32                             tid;
3436                u64                             start;
3437                u64                             len;
3438                u64                             pgoff;
3439        } event_id;
3440};
3441
3442static void perf_event_mmap_output(struct perf_event *event,
3443                                     struct perf_mmap_event *mmap_event)
3444{
3445        struct perf_output_handle handle;
3446        int size = mmap_event->event_id.header.size;
3447        int ret = perf_output_begin(&handle, event, size, 0, 0);
3448
3449        if (ret)
3450                return;
3451
3452        mmap_event->event_id.pid = perf_event_pid(event, current);
3453        mmap_event->event_id.tid = perf_event_tid(event, current);
3454
3455        perf_output_put(&handle, mmap_event->event_id);
3456        perf_output_copy(&handle, mmap_event->file_name,
3457                                   mmap_event->file_size);
3458        perf_output_end(&handle);
3459}
3460
3461static int perf_event_mmap_match(struct perf_event *event,
3462                                   struct perf_mmap_event *mmap_event)
3463{
3464        if (event->attr.mmap)
3465                return 1;
3466
3467        return 0;
3468}
3469
3470static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3471                                  struct perf_mmap_event *mmap_event)
3472{
3473        struct perf_event *event;
3474
3475        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476                return;
3477
3478        rcu_read_lock();
3479        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480                if (perf_event_mmap_match(event, mmap_event))
3481                        perf_event_mmap_output(event, mmap_event);
3482        }
3483        rcu_read_unlock();
3484}
3485
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3487{
3488        struct perf_cpu_context *cpuctx;
3489        struct perf_event_context *ctx;
3490        struct vm_area_struct *vma = mmap_event->vma;
3491        struct file *file = vma->vm_file;
3492        unsigned int size;
3493        char tmp[16];
3494        char *buf = NULL;
3495        const char *name;
3496
3497        memset(tmp, 0, sizeof(tmp));
3498
3499        if (file) {
3500                /*
3501                 * d_path works from the end of the buffer backwards, so we
3502                 * need to add enough zero bytes after the string to handle
3503                 * the 64bit alignment we do later.
3504                 */
3505                buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3506                if (!buf) {
3507                        name = strncpy(tmp, "//enomem", sizeof(tmp));
3508                        goto got_name;
3509                }
3510                name = d_path(&file->f_path, buf, PATH_MAX);
3511                if (IS_ERR(name)) {
3512                        name = strncpy(tmp, "//toolong", sizeof(tmp));
3513                        goto got_name;
3514                }
3515        } else {
3516                if (arch_vma_name(mmap_event->vma)) {
3517                        name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3518                                       sizeof(tmp));
3519                        goto got_name;
3520                }
3521
3522                if (!vma->vm_mm) {
3523                        name = strncpy(tmp, "[vdso]", sizeof(tmp));
3524                        goto got_name;
3525                }
3526
3527                name = strncpy(tmp, "//anon", sizeof(tmp));
3528                goto got_name;
3529        }
3530
3531got_name:
3532        size = ALIGN(strlen(name)+1, sizeof(u64));
3533
3534        mmap_event->file_name = name;
3535        mmap_event->file_size = size;
3536
3537        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538
3539        cpuctx = &get_cpu_var(perf_cpu_context);
3540        perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541        put_cpu_var(perf_cpu_context);
3542
3543        rcu_read_lock();
3544        /*
3545         * doesn't really matter which of the child contexts the
3546         * event ends up in.
3547         */
3548        ctx = rcu_dereference(current->perf_event_ctxp);
3549        if (ctx)
3550                perf_event_mmap_ctx(ctx, mmap_event);
3551        rcu_read_unlock();
3552
3553        kfree(buf);
3554}
3555
3556void __perf_event_mmap(struct vm_area_struct *vma)
3557{
3558        struct perf_mmap_event mmap_event;
3559
3560        if (!atomic_read(&nr_mmap_events))
3561                return;
3562
3563        mmap_event = (struct perf_mmap_event){
3564                .vma    = vma,
3565                /* .file_name */
3566                /* .file_size */
3567                .event_id  = {
3568                        .header = {
3569                                .type = PERF_RECORD_MMAP,
3570                                .misc = 0,
3571                                /* .size */
3572                        },
3573                        /* .pid */
3574                        /* .tid */
3575                        .start  = vma->vm_start,
3576                        .len    = vma->vm_end - vma->vm_start,
3577                        .pgoff  = vma->vm_pgoff,
3578                },
3579        };
3580
3581        perf_event_mmap_event(&mmap_event);
3582}
3583
3584/*
3585 * IRQ throttle logging
3586 */
3587
3588static void perf_log_throttle(struct perf_event *event, int enable)
3589{
3590        struct perf_output_handle handle;
3591        int ret;
3592
3593        struct {
3594                struct perf_event_header        header;
3595                u64                             time;
3596                u64                             id;
3597                u64                             stream_id;
3598        } throttle_event = {
3599                .header = {
3600                        .type = PERF_RECORD_THROTTLE,
3601                        .misc = 0,
3602                        .size = sizeof(throttle_event),
3603                },
3604                .time           = perf_clock(),
3605                .id             = primary_event_id(event),
3606                .stream_id      = event->id,
3607        };
3608
3609        if (enable)
3610                throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3611
3612        ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3613        if (ret)
3614                return;
3615
3616        perf_output_put(&handle, throttle_event);
3617        perf_output_end(&handle);
3618}
3619
3620/*
3621 * Generic event overflow handling, sampling.
3622 */
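
/*
 * A worked example of the throttling check in __perf_event_overflow()
 * below, assuming HZ == 1000 and the default
 * sysctl_perf_event_sample_rate of 100000: the event gets marked
 * throttled once it takes more than 100 overflow interrupts within a
 * single tick, since 100 interrupts/tick * 1000 ticks/sec hits the
 * 100000 samples/sec limit.  perf_log_throttle(event, 0) then emits a
 * PERF_RECORD_THROTTLE record so user space can tell that sampling was
 * throttled.
 */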
3623
3624static int __perf_event_overflow(struct perf_event *event, int nmi,
3625                                   int throttle, struct perf_sample_data *data,
3626                                   struct pt_regs *regs)
3627{
3628        int events = atomic_read(&event->event_limit);
3629        struct hw_perf_event *hwc = &event->hw;
3630        int ret = 0;
3631
3632        throttle = (throttle && event->pmu->unthrottle != NULL);
3633
3634        if (!throttle) {
3635                hwc->interrupts++;
3636        } else {
3637                if (hwc->interrupts != MAX_INTERRUPTS) {
3638                        hwc->interrupts++;
3639                        if (HZ * hwc->interrupts >
3640                                        (u64)sysctl_perf_event_sample_rate) {
3641                                hwc->interrupts = MAX_INTERRUPTS;
3642                                perf_log_throttle(event, 0);
3643                                ret = 1;
3644                        }
3645                } else {
3646                        /*
3647                         * Keep re-disabling the event even though we disabled it
3648                         * on the previous pass - just in case we raced with a
3649                         * sched-in and the event got enabled again:
3650                         */
3651                        ret = 1;
3652                }
3653        }
3654
3655        if (event->attr.freq) {
3656                u64 now = perf_clock();
3657                s64 delta = now - hwc->freq_stamp;
3658
3659                hwc->freq_stamp = now;
3660
3661                if (delta > 0 && delta < TICK_NSEC)
3662                        perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3663        }
3664
3665        /*
3666         * XXX event_limit might not quite work as expected on inherited
3667         * events
3668         */
3669
3670        event->pending_kill = POLL_IN;
3671        if (events && atomic_dec_and_test(&event->event_limit)) {
3672                ret = 1;
3673                event->pending_kill = POLL_HUP;
3674                if (nmi) {
3675                        event->pending_disable = 1;
3676                        perf_pending_queue(&event->pending,
3677                                           perf_pending_event);
3678                } else
3679                        perf_event_disable(event);
3680        }
3681
3682        perf_event_output(event, nmi, data, regs);
3683        return ret;
3684}
3685
3686int perf_event_overflow(struct perf_event *event, int nmi,
3687                          struct perf_sample_data *data,
3688                          struct pt_regs *regs)
3689{
3690        return __perf_event_overflow(event, nmi, 1, data, regs);
3691}
3692
3693/*
3694 * Generic software event infrastructure
3695 */
3696
3697/*
3698 * We directly increment event->count and keep a second value in
3699 * event->hw.period_left to count intervals. This period value
3700 * is kept in the range [-sample_period, 0] so that we can use the
3701 * sign as trigger.
3702 */
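
/*
 * A short worked example of the scheme above, for an event with
 * sample_period == 4: after perf_swevent_enable() the period_left value
 * sits at -4.  perf_swevent_add() adds each increment to it, and as soon
 * as it turns non-negative the overflow path runs; if period_left has
 * reached 0, perf_swevent_set_period() computes nr = (4 + 0) / 4 = 1
 * elapsed period, rewinds period_left to 0 - 1 * 4 = -4, and one sample
 * is emitted.  A single large increment can span several periods: adding
 * 9 from -4 leaves 5, giving nr = (4 + 5) / 4 = 2 overflows and a new
 * period_left of 5 - 2 * 4 = -3.
 */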
3703
3704static u64 perf_swevent_set_period(struct perf_event *event)
3705{
3706        struct hw_perf_event *hwc = &event->hw;
3707        u64 period = hwc->last_period;
3708        u64 nr, offset;
3709        s64 old, val;
3710
3711        hwc->last_period = hwc->sample_period;
3712
3713again:
3714        old = val = atomic64_read(&hwc->period_left);
3715        if (val < 0)
3716                return 0;
3717
3718        nr = div64_u64(period + val, period);
3719        offset = nr * period;
3720        val -= offset;
3721        if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3722                goto again;
3723
3724        return nr;
3725}
3726
3727static void perf_swevent_overflow(struct perf_event *event,
3728                                    int nmi, struct perf_sample_data *data,
3729                                    struct pt_regs *regs)
3730{
3731        struct hw_perf_event *hwc = &event->hw;
3732        int throttle = 0;
3733        u64 overflow;
3734
3735        data->period = event->hw.last_period;
3736        overflow = perf_swevent_set_period(event);
3737
3738        if (hwc->interrupts == MAX_INTERRUPTS)
3739                return;
3740
3741        for (; overflow; overflow--) {
3742                if (__perf_event_overflow(event, nmi, throttle,
3743                                            data, regs)) {
3744                        /*
3745                         * We inhibit the overflow from happening when
3746                         * hwc->interrupts == MAX_INTERRUPTS.
3747                         */
3748                        break;
3749                }
3750                throttle = 1;
3751        }
3752}
3753
3754static void perf_swevent_unthrottle(struct perf_event *event)
3755{
3756        /*
3757         * Nothing to do, we already reset hwc->interrupts.
3758         */
3759}
3760
3761static void perf_swevent_add(struct perf_event *event, u64 nr,
3762                               int nmi, struct perf_sample_data *data,
3763                               struct pt_regs *regs)
3764{
3765        struct hw_perf_event *hwc = &event->hw;
3766
3767        atomic64_add(nr, &event->count);
3768
3769        if (!hwc->sample_period)
3770                return;
3771
3772        if (!regs)
3773                return;
3774
3775        if (!atomic64_add_negative(nr, &hwc->period_left))
3776                perf_swevent_overflow(event, nmi, data, regs);
3777}
3778
3779static int perf_swevent_is_counting(struct perf_event *event)
3780{
3781        /*
3782         * The event is active, we're good!
3783         */
3784        if (event->state == PERF_EVENT_STATE_ACTIVE)
3785                return 1;
3786
3787        /*
3788         * The event is off/error, not counting.
3789         */
3790        if (event->state != PERF_EVENT_STATE_INACTIVE)
3791                return 0;
3792
3793        /*
3794         * The event is inactive; if the context is active
3795         * we're part of a group that didn't make it onto the 'pmu',
3796         * so we're not counting.
3797         */
3798        if (event->ctx->is_active)
3799                return 0;
3800
3801        /*
3802         * We're inactive and the context is too; this means the
3803         * task is scheduled out and we're counting events that happen
3804         * to us, like migration events.
3805         */
3806        return 1;
3807}
3808
3809static int perf_swevent_match(struct perf_event *event,
3810                                enum perf_type_id type,
3811                                u32 event_id, struct pt_regs *regs)
3812{
3813        if (!perf_swevent_is_counting(event))
3814                return 0;
3815
3816        if (event->attr.type != type)
3817                return 0;
3818        if (event->attr.config != event_id)
3819                return 0;
3820
3821        if (regs) {
3822                if (event->attr.exclude_user && user_mode(regs))
3823                        return 0;
3824
3825                if (event->attr.exclude_kernel && !user_mode(regs))
3826                        return 0;
3827        }
3828
3829        return 1;
3830}
3831
3832static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3833                                     enum perf_type_id type,
3834                                     u32 event_id, u64 nr, int nmi,
3835                                     struct perf_sample_data *data,
3836                                     struct pt_regs *regs)
3837{
3838        struct perf_event *event;
3839
3840        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841                return;
3842
3843        rcu_read_lock();
3844        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845                if (perf_swevent_match(event, type, event_id, regs))
3846                        perf_swevent_add(event, nr, nmi, data, regs);
3847        }
3848        rcu_read_unlock();
3849}
3850
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3852{
3853        if (in_nmi())
3854                return &cpuctx->recursion[3];
3855
3856        if (in_irq())
3857                return &cpuctx->recursion[2];
3858
3859        if (in_softirq())
3860                return &cpuctx->recursion[1];
3861
3862        return &cpuctx->recursion[0];
3863}
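
/*
 * Each of the four recursion slots above covers one execution context:
 * task, softirq, hardirq and NMI.  So, for example, a software event
 * raised from a hardirq that interrupted softirq code uses recursion[2]
 * and is not blocked by an event already in flight at the softirq
 * level; only a second event at the same level gets dropped by
 * do_perf_sw_event() below.
 */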
3864
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866                                    u64 nr, int nmi,
3867                                    struct perf_sample_data *data,
3868                                    struct pt_regs *regs)
3869{
3870        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3871        int *recursion = perf_swevent_recursion_context(cpuctx);
3872        struct perf_event_context *ctx;
3873
3874        if (*recursion)
3875                goto out;
3876
3877        (*recursion)++;
3878        barrier();
3879
3880        perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881                                 nr, nmi, data, regs);
3882        rcu_read_lock();
3883        /*
3884         * doesn't really matter which of the child contexts the
3885         * event ends up in.
3886         */
3887        ctx = rcu_dereference(current->perf_event_ctxp);
3888        if (ctx)
3889                perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890        rcu_read_unlock();
3891
3892        barrier();
3893        (*recursion)--;
3894
3895out:
3896        put_cpu_var(perf_cpu_context);
3897}
3898
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900                            struct pt_regs *regs, u64 addr)
3901{
3902        struct perf_sample_data data = {
3903                .addr = addr,
3904        };
3905
3906        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3907                                &data, regs);
3908}
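
/*
 * Callers normally go through the perf_sw_event() wrapper, which only
 * falls into __perf_sw_event() when perf_swevent_enabled[] shows that
 * the event id is in use.  As a sketch of a typical call site, a fault
 * handler would account a page fault roughly as:
 *
 *      perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 *
 * i.e. a count of 1, not in NMI context, with the faulting address
 * passed as the data address.
 */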
3909
3910static void perf_swevent_read(struct perf_event *event)
3911{
3912}
3913
3914static int perf_swevent_enable(struct perf_event *event)
3915{
3916        struct hw_perf_event *hwc = &event->hw;
3917
3918        if (hwc->sample_period) {
3919                hwc->last_period = hwc->sample_period;
3920                perf_swevent_set_period(event);
3921        }
3922        return 0;
3923}
3924
3925static void perf_swevent_disable(struct perf_event *event)
3926{
3927}
3928
3929static const struct pmu perf_ops_generic = {
3930        .enable         = perf_swevent_enable,
3931        .disable        = perf_swevent_disable,
3932        .read           = perf_swevent_read,
3933        .unthrottle     = perf_swevent_unthrottle,
3934};
3935
3936/*
3937 * hrtimer based swevent callback
3938 */
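
/*
 * The cpu-clock and task-clock software events below sample by arming a
 * periodic hrtimer instead of relying on hardware overflow interrupts.
 * Note that the period is clamped to a minimum of 10000ns in both
 * perf_swevent_hrtimer() and perf_swevent_start_hrtimer(), presumably to
 * keep a tiny sample_period from degenerating into a timer interrupt
 * storm.
 */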
3939
3940static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3941{
3942        enum hrtimer_restart ret = HRTIMER_RESTART;
3943        struct perf_sample_data data;
3944        struct pt_regs *regs;
3945        struct perf_event *event;
3946        u64 period;
3947
3948        event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
3949        event->pmu->read(event);
3950
3951        data.addr = 0;
3952        regs = get_irq_regs();
3953        /*
3954         * In case we exclude kernel IPs or are somehow not in interrupt
3955         * context, provide the next best thing, the user IP.
3956         */
3957        if ((event->attr.exclude_kernel || !regs) &&
3958                        !event->attr.exclude_user)
3959                regs = task_pt_regs(current);
3960
3961        if (regs) {
3962                if (!(event->attr.exclude_idle && current->pid == 0))
3963                        if (perf_event_overflow(event, 0, &data, regs))
3964                                ret = HRTIMER_NORESTART;
3965        }
3966
3967        period = max_t(u64, 10000, event->hw.sample_period);
3968        hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3969
3970        return ret;
3971}
3972
3973static void perf_swevent_start_hrtimer(struct perf_event *event)
3974{
3975        struct hw_perf_event *hwc = &event->hw;
3976
3977        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3978        hwc->hrtimer.function = perf_swevent_hrtimer;
3979        if (hwc->sample_period) {
3980                u64 period;
3981
3982                if (hwc->remaining) {
3983                        if (hwc->remaining < 0)
3984                                period = 10000;
3985                        else
3986                                period = hwc->remaining;
3987                        hwc->remaining = 0;
3988                } else {
3989                        period = max_t(u64, 10000, hwc->sample_period);
3990                }
3991                __hrtimer_start_range_ns(&hwc->hrtimer,
3992                                ns_to_ktime(period), 0,
3993                                HRTIMER_MODE_REL, 0);
3994        }
3995}
3996
3997static void perf_swevent_cancel_hrtimer(struct perf_event *event)
3998{
3999        struct hw_perf_event *hwc = &event->hw;
4000
4001        if (hwc->sample_period) {
4002                ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4003                hwc->remaining = ktime_to_ns(remaining);
4004
4005                hrtimer_cancel(&hwc->hrtimer);
4006        }
4007}
4008
4009/*
4010 * Software event: cpu wall time clock
4011 */
4012
4013static void cpu_clock_perf_event_update(struct perf_event *event)
4014{
4015        int cpu = raw_smp_processor_id();
4016        s64 prev;
4017        u64 now;
4018
4019        now = cpu_clock(cpu);
4020        prev = atomic64_read(&event->hw.prev_count);
4021        atomic64_set(&event->hw.prev_count, now);
4022        atomic64_add(now - prev, &event->count);
4023}
4024
4025static int cpu_clock_perf_event_enable(struct perf_event *event)
4026{
4027        struct hw_perf_event *hwc = &event->hw;
4028        int cpu = raw_smp_processor_id();
4029
4030        atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4031        perf_swevent_start_hrtimer(event);
4032
4033        return 0;
4034}
4035
4036static void cpu_clock_perf_event_disable(struct perf_event *event)
4037{
4038        perf_swevent_cancel_hrtimer(event);
4039        cpu_clock_perf_event_update(event);
4040}
4041
4042static void cpu_clock_perf_event_read(struct perf_event *event)
4043{
4044        cpu_clock_perf_event_update(event);
4045}
4046
4047static const struct pmu perf_ops_cpu_clock = {
4048        .enable         = cpu_clock_perf_event_enable,
4049        .disable        = cpu_clock_perf_event_disable,
4050        .read           = cpu_clock_perf_event_read,
4051};
4052
4053/*
4054 * Software event: task time clock
4055 */
4056
4057static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4058{
4059        u64 prev;
4060        s64 delta;
4061
4062        prev = atomic64_xchg(&event->hw.prev_count, now);
4063        delta = now - prev;
4064        atomic64_add(delta, &event->count);
4065}
4066
4067static int task_clock_perf_event_enable(struct perf_event *event)
4068{
4069        struct hw_perf_event *hwc = &event->hw;
4070        u64 now;
4071
4072        now = event->ctx->time;
4073
4074        atomic64_set(&hwc->prev_count, now);
4075
4076        perf_swevent_start_hrtimer(event);
4077
4078        return 0;
4079}
4080
4081static void task_clock_perf_event_disable(struct perf_event *event)
4082{
4083        perf_swevent_cancel_hrtimer(event);
4084        task_clock_perf_event_update(event, event->ctx->time);
4085
4086}
4087
4088static void task_clock_perf_event_read(struct perf_event *event)
4089{
4090        u64 time;
4091
4092        if (!in_nmi()) {
4093                update_context_time(event->ctx);
4094                time = event->ctx->time;
4095        } else {
4096                u64 now = perf_clock();
4097                u64 delta = now - event->ctx->timestamp;
4098                time = event->ctx->time + delta;
4099        }
4100
4101        task_clock_perf_event_update(event, time);
4102}
4103
4104static const struct pmu perf_ops_task_clock = {
4105        .enable         = task_clock_perf_event_enable,
4106        .disable        = task_clock_perf_event_disable,
4107        .read           = task_clock_perf_event_read,
4108};
4109
4110#ifdef CONFIG_EVENT_PROFILE
4111void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4112                          int entry_size)
4113{
4114        struct perf_raw_record raw = {
4115                .size = entry_size,
4116                .data = record,
4117        };
4118
4119        struct perf_sample_data data = {
4120                .addr = addr,
4121                .raw = &raw,
4122        };
4123
4124        struct pt_regs *regs = get_irq_regs();
4125
4126        if (!regs)
4127                regs = task_pt_regs(current);
4128
4129        do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4130                                &data, regs);
4131}
4132EXPORT_SYMBOL_GPL(perf_tp_event);
4133
4134extern int ftrace_profile_enable(int);
4135extern void ftrace_profile_disable(int);
4136
4137static void tp_perf_event_destroy(struct perf_event *event)
4138{
4139        ftrace_profile_disable(event->attr.config);
4140}
4141
4142static const struct pmu *tp_perf_event_init(struct perf_event *event)
4143{
4144        /*
4145         * Raw tracepoint data is a severe data leak; only allow root to
4146         * have these.
4147         */
4148        if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4149                        perf_paranoid_tracepoint_raw() &&
4150                        !capable(CAP_SYS_ADMIN))
4151                return ERR_PTR(-EPERM);
4152
4153        if (ftrace_profile_enable(event->attr.config))
4154                return NULL;
4155
4156        event->destroy = tp_perf_event_destroy;
4157
4158        return &perf_ops_generic;
4159}
4160#else
4161static const struct pmu *tp_perf_event_init(struct perf_event *event)
4162{
4163        return NULL;
4164}
4165#endif
4166
4167atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4168
4169static void sw_perf_event_destroy(struct perf_event *event)
4170{
4171        u64 event_id = event->attr.config;
4172
4173        WARN_ON(event->parent);
4174
4175        atomic_dec(&perf_swevent_enabled[event_id]);
4176}
4177
4178static const struct pmu *sw_perf_event_init(struct perf_event *event)
4179{
4180        const struct pmu *pmu = NULL;
4181        u64 event_id = event->attr.config;
4182
4183        /*
4184         * Software events (currently) can't in general distinguish
4185         * between user, kernel and hypervisor events.
4186         * However, context switches and cpu migrations are considered
4187         * to be kernel events, and page faults are never hypervisor
4188         * events.
4189         */
4190        switch (event_id) {
4191        case PERF_COUNT_SW_CPU_CLOCK:
4192                pmu = &perf_ops_cpu_clock;
4193
4194                break;
4195        case PERF_COUNT_SW_TASK_CLOCK:
4196                /*
4197                 * If the user instantiates this as a per-cpu event,
4198                 * use the cpu_clock event instead.
4199                 */
4200                if (event->ctx->task)
4201                        pmu = &perf_ops_task_clock;
4202                else
4203                        pmu = &perf_ops_cpu_clock;
4204
4205                break;
4206        case PERF_COUNT_SW_PAGE_FAULTS:
4207        case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4208        case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4209        case PERF_COUNT_SW_CONTEXT_SWITCHES:
4210        case PERF_COUNT_SW_CPU_MIGRATIONS:
4211                if (!event->parent) {
4212                        atomic_inc(&perf_swevent_enabled[event_id]);
4213                        event->destroy = sw_perf_event_destroy;
4214                }
4215                pmu = &perf_ops_generic;
4216                break;
4217        }
4218
4219        return pmu;
4220}
4221
4222/*
4223 * Allocate and initialize an event structure
4224 */
4225static struct perf_event *
4226perf_event_alloc(struct perf_event_attr *attr,
4227                   int cpu,
4228                   struct perf_event_context *ctx,
4229                   struct perf_event *group_leader,
4230                   struct perf_event *parent_event,
4231                   gfp_t gfpflags)
4232{
4233        const struct pmu *pmu;
4234        struct perf_event *event;
4235        struct hw_perf_event *hwc;
4236        long err;
4237
4238        event = kzalloc(sizeof(*event), gfpflags);
4239        if (!event)
4240                return ERR_PTR(-ENOMEM);
4241
4242        /*
4243         * Single events are their own group leaders, with an
4244         * empty sibling list:
4245         */
4246        if (!group_leader)
4247                group_leader = event;
4248
4249        mutex_init(&event->child_mutex);
4250        INIT_LIST_HEAD(&event->child_list);
4251
4252        INIT_LIST_HEAD(&event->group_entry);
4253        INIT_LIST_HEAD(&event->event_entry);
4254        INIT_LIST_HEAD(&event->sibling_list);
4255        init_waitqueue_head(&event->waitq);
4256
4257        mutex_init(&event->mmap_mutex);
4258
4259        event->cpu              = cpu;
4260        event->attr             = *attr;
4261        event->group_leader     = group_leader;
4262        event->pmu              = NULL;
4263        event->ctx              = ctx;
4264        event->oncpu            = -1;
4265
4266        event->parent           = parent_event;
4267
4268        event->ns               = get_pid_ns(current->nsproxy->pid_ns);
4269        event->id               = atomic64_inc_return(&perf_event_id);
4270
4271        event->state            = PERF_EVENT_STATE_INACTIVE;
4272
4273        if (attr->disabled)
4274                event->state = PERF_EVENT_STATE_OFF;
4275
4276        pmu = NULL;
4277
4278        hwc = &event->hw;
4279        hwc->sample_period = attr->sample_period;
4280        if (attr->freq && attr->sample_freq)
4281                hwc->sample_period = 1;
4282        hwc->last_period = hwc->sample_period;
4283
4284        atomic64_set(&hwc->period_left, hwc->sample_period);
4285
4286        /*
4287         * we currently do not support PERF_FORMAT_GROUP on inherited events
4288         */
4289        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4290                goto done;
4291
4292        switch (attr->type) {
4293        case PERF_TYPE_RAW:
4294        case PERF_TYPE_HARDWARE:
4295        case PERF_TYPE_HW_CACHE:
4296                pmu = hw_perf_event_init(event);
4297                break;
4298
4299        case PERF_TYPE_SOFTWARE:
4300                pmu = sw_perf_event_init(event);
4301                break;
4302
4303        case PERF_TYPE_TRACEPOINT:
4304                pmu = tp_perf_event_init(event);
4305                break;
4306
4307        default:
4308                break;
4309        }
4310done:
4311        err = 0;
4312        if (!pmu)
4313                err = -EINVAL;
4314        else if (IS_ERR(pmu))
4315                err = PTR_ERR(pmu);
4316
4317        if (err) {
4318                if (event->ns)
4319                        put_pid_ns(event->ns);
4320                kfree(event);
4321                return ERR_PTR(err);
4322        }
4323
4324        event->pmu = pmu;
4325
4326        if (!event->parent) {
4327                atomic_inc(&nr_events);
4328                if (event->attr.mmap)
4329                        atomic_inc(&nr_mmap_events);
4330                if (event->attr.comm)
4331                        atomic_inc(&nr_comm_events);
4332                if (event->attr.task)
4333                        atomic_inc(&nr_task_events);
4334        }
4335
4336        return event;
4337}
4338
4339static int perf_copy_attr(struct perf_event_attr __user *uattr,
4340                          struct perf_event_attr *attr)
4341{
4342        u32 size;
4343        int ret;
4344
4345        if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4346                return -EFAULT;
4347
4348        /*
4349         * zero the full structure, so a short copy leaves the rest zeroed.
4350         */
4351        memset(attr, 0, sizeof(*attr));
4352
4353        ret = get_user(size, &uattr->size);
4354        if (ret)
4355                return ret;
4356
4357        if (size > PAGE_SIZE)   /* silly large */
4358                goto err_size;
4359
4360        if (!size)              /* abi compat */
4361                size = PERF_ATTR_SIZE_VER0;
4362
4363        if (size < PERF_ATTR_SIZE_VER0)
4364                goto err_size;
4365
4366        /*
4367         * If we're handed a bigger struct than we know of,
4368         * ensure all the unknown bits are 0 - i.e. new
4369         * user-space does not rely on any kernel feature
4370         * extensions we don't know about yet.
4371         */
4372        if (size > sizeof(*attr)) {
4373                unsigned char __user *addr;
4374                unsigned char __user *end;
4375                unsigned char val;
4376
4377                addr = (void __user *)uattr + sizeof(*attr);
4378                end  = (void __user *)uattr + size;
4379
4380                for (; addr < end; addr++) {
4381                        ret = get_user(val, addr);
4382                        if (ret)
4383                                return ret;
4384                        if (val)
4385                                goto err_size;
4386                }
4387                size = sizeof(*attr);
4388        }
4389
4390        ret = copy_from_user(attr, uattr, size);
4391        if (ret)
4392                return -EFAULT;
4393
4394        /*
4395         * If the type is known, the corresponding type-specific init
4396         * will verify attr->config.
4397         */
4398        if (attr->type >= PERF_TYPE_MAX)
4399                return -EINVAL;
4400
4401        if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4402                return -EINVAL;
4403
4404        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4405                return -EINVAL;
4406
4407        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4408                return -EINVAL;
4409
4410out:
4411        return ret;
4412
4413err_size:
4414        put_user(sizeof(*attr), &uattr->size);
4415        ret = -E2BIG;
4416        goto out;
4417}
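
/*
 * The size handling above is what keeps the attr ABI extensible in both
 * directions: an old binary passing attr.size == PERF_ATTR_SIZE_VER0
 * simply gets any newer trailing fields zeroed, while a newer binary
 * running on this kernel may pass a larger structure only if every byte
 * beyond sizeof(struct perf_event_attr) is zero - otherwise it gets
 * -E2BIG with the size this kernel understands written back into
 * uattr->size.
 */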
4418
4419int perf_event_set_output(struct perf_event *event, int output_fd)
4420{
4421        struct perf_event *output_event = NULL;
4422        struct file *output_file = NULL;
4423        struct perf_event *old_output;
4424        int fput_needed = 0;
4425        int ret = -EINVAL;
4426
4427        if (!output_fd)
4428                goto set;
4429
4430        output_file = fget_light(output_fd, &fput_needed);
4431        if (!output_file)
4432                return -EBADF;
4433
4434        if (output_file->f_op != &perf_fops)
4435                goto out;
4436
4437        output_event = output_file->private_data;
4438
4439        /* Don't chain output fds */
4440        if (output_event->output)
4441                goto out;
4442
4443        /* Don't set an output fd when we already have an output channel */
4444        if (event->data)
4445                goto out;
4446
4447        atomic_long_inc(&output_file->f_count);
4448
4449set:
4450        mutex_lock(&event->mmap_mutex);
4451        old_output = event->output;
4452        rcu_assign_pointer(event->output, output_event);
4453        mutex_unlock(&event->mmap_mutex);
4454
4455        if (old_output) {
4456                /*
4457                 * we need to make sure no existing perf_output_*()
4458                 * is still referencing this event.
4459                 */
4460                synchronize_rcu();
4461                fput(old_output->filp);
4462        }
4463
4464        ret = 0;
4465out:
4466        fput_light(output_file, fput_needed);
4467        return ret;
4468}
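
/*
 * A sketch of how this redirection is used from user space, assuming the
 * caller already has a perf fd 'leader_fd' whose ring buffer is
 * mmap()ed: opening a further event with
 *
 *      perf_event_open(&attr, pid, cpu, leader_fd,
 *                      PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT);
 *
 * makes the new event emit its records into leader_fd's buffer instead
 * of needing a second mmap(), subject to the checks above (no chaining
 * of outputs, and the new event must not already have a buffer of its
 * own).
 */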
4469
4470/**
4471 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4472 *
4473 * @attr_uptr:  event_id type attributes for monitoring/sampling
4474 * @pid:                target pid
4475 * @cpu:                target cpu
4476 * @group_fd:           group leader event fd
4477 */
4478SYSCALL_DEFINE5(perf_event_open,
4479                struct perf_event_attr __user *, attr_uptr,
4480                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4481{
4482        struct perf_event *event, *group_leader;
4483        struct perf_event_attr attr;
4484        struct perf_event_context *ctx;
4485        struct file *event_file = NULL;
4486        struct file *group_file = NULL;
4487        int fput_needed = 0;
4488        int fput_needed2 = 0;
4489        int err;
4490
4491        /* for future expandability... */
4492        if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4493                return -EINVAL;
4494
4495        err = perf_copy_attr(attr_uptr, &attr);
4496        if (err)
4497                return err;
4498
4499        if (!attr.exclude_kernel) {
4500                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4501                        return -EACCES;
4502        }
4503
4504        if (attr.freq) {
4505                if (attr.sample_freq > sysctl_perf_event_sample_rate)
4506                        return -EINVAL;
4507        }
4508
4509        /*
4510         * Get the target context (task or percpu):
4511         */
4512        ctx = find_get_context(pid, cpu);
4513        if (IS_ERR(ctx))
4514                return PTR_ERR(ctx);
4515
4516        /*
4517         * Look up the group leader (we will attach this event to it):
4518         */
4519        group_leader = NULL;
4520        if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4521                err = -EINVAL;
4522                group_file = fget_light(group_fd, &fput_needed);
4523                if (!group_file)
4524                        goto err_put_context;
4525                if (group_file->f_op != &perf_fops)
4526                        goto err_put_context;
4527
4528                group_leader = group_file->private_data;
4529                /*
4530                 * Do not allow a recursive hierarchy (this new sibling
4531                 * becoming part of another group-sibling):
4532                 */
4533                if (group_leader->group_leader != group_leader)
4534                        goto err_put_context;
4535                /*
4536                 * Do not allow attaching to a group in a different
4537                 * task or CPU context:
4538                 */
4539                if (group_leader->ctx != ctx)
4540                        goto err_put_context;
4541                /*
4542                 * Only a group leader can be exclusive or pinned
4543                 */
4544                if (attr.exclusive || attr.pinned)
4545                        goto err_put_context;
4546        }
4547
4548        event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4549                                     NULL, GFP_KERNEL);
4550        err = PTR_ERR(event);
4551        if (IS_ERR(event))
4552                goto err_put_context;
4553
4554        err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4555        if (err < 0)
4556                goto err_free_put_context;
4557
4558        event_file = fget_light(err, &fput_needed2);
4559        if (!event_file)
4560                goto err_free_put_context;
4561
4562        if (flags & PERF_FLAG_FD_OUTPUT) {
4563                err = perf_event_set_output(event, group_fd);
4564                if (err)
4565                        goto err_fput_free_put_context;
4566        }
4567
4568        event->filp = event_file;
4569        WARN_ON_ONCE(ctx->parent_ctx);
4570        mutex_lock(&ctx->mutex);
4571        perf_install_in_context(ctx, event, cpu);
4572        ++ctx->generation;
4573        mutex_unlock(&ctx->mutex);
4574
4575        event->owner = current;
4576        get_task_struct(current);
4577        mutex_lock(&current->perf_event_mutex);
4578        list_add_tail(&event->owner_entry, &current->perf_event_list);
4579        mutex_unlock(&current->perf_event_mutex);
4580
4581err_fput_free_put_context:
4582        fput_light(event_file, fput_needed2);
4583
4584err_free_put_context:
4585        if (err < 0)
4586                kfree(event);
4587
4588err_put_context:
4589        if (err < 0)
4590                put_ctx(ctx);
4591
4592        fput_light(group_file, fput_needed);
4593
4594        return err;
4595}
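
/*
 * A minimal user-space sketch of using the syscall above (there is no
 * glibc wrapper, so a raw syscall() with __NR_perf_event_open is
 * assumed), counting instructions in the calling task on any CPU and
 * reading the total back as a single u64:
 *
 *      struct perf_event_attr attr = {
 *              .type           = PERF_TYPE_HARDWARE,
 *              .size           = sizeof(attr),
 *              .config         = PERF_COUNT_HW_INSTRUCTIONS,
 *              .disabled       = 1,
 *      };
 *      int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *      unsigned long long count;
 *
 *      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *      ... workload ...
 *      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *      read(fd, &count, sizeof(count));
 *
 * pid == 0 targets the current task, cpu == -1 means any CPU, and
 * group_fd == -1 makes the new event its own group leader.
 */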
4596
4597/*
4598 * inherit an event from parent task to child task:
4599 */
4600static struct perf_event *
4601inherit_event(struct perf_event *parent_event,
4602              struct task_struct *parent,
4603              struct perf_event_context *parent_ctx,
4604              struct task_struct *child,
4605              struct perf_event *group_leader,
4606              struct perf_event_context *child_ctx)
4607{
4608        struct perf_event *child_event;
4609
4610        /*
4611         * Instead of creating recursive hierarchies of events,
4612         * we link inherited events back to the original parent,
4613         * which is guaranteed to have a filp that we use as the
4614         * reference count:
4615         */
4616        if (parent_event->parent)
4617                parent_event = parent_event->parent;
4618
4619        child_event = perf_event_alloc(&parent_event->attr,
4620                                           parent_event->cpu, child_ctx,
4621                                           group_leader, parent_event,
4622                                           GFP_KERNEL);
4623        if (IS_ERR(child_event))
4624                return child_event;
4625        get_ctx(child_ctx);
4626
4627        /*
4628         * Make the child state follow the state of the parent event,
4629         * not its attr.disabled bit.  We hold the parent's mutex,
4630         * so we won't race with perf_event_{en, dis}able_family.
4631         */
4632        if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4633                child_event->state = PERF_EVENT_STATE_INACTIVE;
4634        else
4635                child_event->state = PERF_EVENT_STATE_OFF;
4636
4637        if (parent_event->attr.freq)
4638                child_event->hw.sample_period = parent_event->hw.sample_period;
4639
4640        /*
4641         * Link it up in the child's context:
4642         */
4643        add_event_to_ctx(child_event, child_ctx);
4644
4645        /*
4646         * Get a reference to the parent filp - we will fput it
4647         * when the child event exits. This is safe to do because
4648         * we are in the parent and we know that the filp still
4649         * exists and has a nonzero count:
4650         */
4651        atomic_long_inc(&parent_event->filp->f_count);
4652
4653        /*
4654         * Link this into the parent event's child list
4655         */
4656        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4657        mutex_lock(&parent_event->child_mutex);
4658        list_add_tail(&child_event->child_list, &parent_event->child_list);
4659        mutex_unlock(&parent_event->child_mutex);
4660
4661        return child_event;
4662}
4663
4664static int inherit_group(struct perf_event *parent_event,
4665              struct task_struct *parent,
4666              struct perf_event_context *parent_ctx,
4667              struct task_struct *child,
4668              struct perf_event_context *child_ctx)
4669{
4670        struct perf_event *leader;
4671        struct perf_event *sub;
4672        struct perf_event *child_ctr;
4673
4674        leader = inherit_event(parent_event, parent, parent_ctx,
4675                                 child, NULL, child_ctx);
4676        if (IS_ERR(leader))
4677                return PTR_ERR(leader);
4678        list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4679                child_ctr = inherit_event(sub, parent, parent_ctx,
4680                                            child, leader, child_ctx);
4681                if (IS_ERR(child_ctr))
4682                        return PTR_ERR(child_ctr);
4683        }
4684        return 0;
4685}
4686
4687static void sync_child_event(struct perf_event *child_event,
4688                               struct task_struct *child)
4689{
4690        struct perf_event *parent_event = child_event->parent;
4691        u64 child_val;
4692
4693        if (child_event->attr.inherit_stat)
4694                perf_event_read_event(child_event, child);
4695
4696        child_val = atomic64_read(&child_event->count);
4697
4698        /*
4699         * Add back the child's count to the parent's count:
4700         */
4701        atomic64_add(child_val, &parent_event->count);
4702        atomic64_add(child_event->total_time_enabled,
4703                     &parent_event->child_total_time_enabled);
4704        atomic64_add(child_event->total_time_running,
4705                     &parent_event->child_total_time_running);
4706
4707        /*
4708         * Remove this event from the parent's list
4709         */
4710        WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4711        mutex_lock(&parent_event->child_mutex);
4712        list_del_init(&child_event->child_list);
4713        mutex_unlock(&parent_event->child_mutex);
4714
4715        /*
4716         * Release the parent event, if this was the last
4717         * reference to it.
4718         */
4719        fput(parent_event->filp);
4720}
4721
4722static void
4723__perf_event_exit_task(struct perf_event *child_event,
4724                         struct perf_event_context *child_ctx,
4725                         struct task_struct *child)
4726{
4727        struct perf_event *parent_event;
4728
4729        update_event_times(child_event);
4730        perf_event_remove_from_context(child_event);
4731
4732        parent_event = child_event->parent;
4733        /*
4734         * It can happen that the parent exits first, and has events
4735         * that are still around due to the child reference. These
4736         * events need to be zapped - but otherwise linger.
4737         */
4738        if (parent_event) {
4739                sync_child_event(child_event, child);
4740                free_event(child_event);
4741        }
4742}
4743
4744/*
4745 * When a child task exits, feed back event values to parent events.
4746 */
4747void perf_event_exit_task(struct task_struct *child)
4748{
4749        struct perf_event *child_event, *tmp;
4750        struct perf_event_context *child_ctx;
4751        unsigned long flags;
4752
4753        if (likely(!child->perf_event_ctxp)) {
4754                perf_event_task(child, NULL, 0);
4755                return;
4756        }
4757
4758        local_irq_save(flags);
4759        /*
4760         * We can't reschedule here because interrupts are disabled,
4761         * and either child is current or it is a task that can't be
4762         * scheduled, so we are now safe from rescheduling changing
4763         * our context.
4764         */
4765        child_ctx = child->perf_event_ctxp;
4766        __perf_event_task_sched_out(child_ctx);
4767
4768        /*
4769         * Take the context lock here so that if find_get_context is
4770         * reading child->perf_event_ctxp, we wait until it has
4771         * incremented the context's refcount before we do put_ctx below.
4772         */
4773        spin_lock(&child_ctx->lock);
4774        child->perf_event_ctxp = NULL;
4775        /*
4776         * If this context is a clone, unclone it so it can't get
4777         * swapped to another process while we're removing all
4778         * the events from it.
4779         */
4780        unclone_ctx(child_ctx);
4781        spin_unlock_irqrestore(&child_ctx->lock, flags);
4782
4783        /*
4784         * Report the task dead after unscheduling the events so that we
4785         * won't get any samples after PERF_RECORD_EXIT. We can however still
4786         * get a few PERF_RECORD_READ events.
4787         */
4788        perf_event_task(child, child_ctx, 0);
4789
4790        /*
4791         * We can recurse on the same lock type through:
4792         *
4793         *   __perf_event_exit_task()
4794         *     sync_child_event()
4795         *       fput(parent_event->filp)
4796         *         perf_release()
4797         *           mutex_lock(&ctx->mutex)
4798         *
4799         * But since it's the parent context it won't be the same instance.
4800         */
4801        mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4802
4803again:
4804        list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4805                                 group_entry)
4806                __perf_event_exit_task(child_event, child_ctx, child);
4807
4808        /*
4809         * If the last event was a group event, it will have appended all
4810         * its siblings to the list, but we obtained 'tmp' before that, which
4811         * will still point to the list head terminating the iteration.
4812         */
4813        if (!list_empty(&child_ctx->group_list))
4814                goto again;
4815
4816        mutex_unlock(&child_ctx->mutex);
4817
4818        put_ctx(child_ctx);
4819}
4820
4821/*
4822 * free an unexposed, unused context as created by inheritance by
4823 * perf_event_init_task() below, used by fork() in case of failure.
4824 */
4825void perf_event_free_task(struct task_struct *task)
4826{
4827        struct perf_event_context *ctx = task->perf_event_ctxp;
4828        struct perf_event *event, *tmp;
4829
4830        if (!ctx)
4831                return;
4832
4833        mutex_lock(&ctx->mutex);
4834again:
4835        list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4836                struct perf_event *parent = event->parent;
4837
4838                if (WARN_ON_ONCE(!parent))
4839                        continue;
4840
4841                mutex_lock(&parent->child_mutex);
4842                list_del_init(&event->child_list);
4843                mutex_unlock(&parent->child_mutex);
4844
4845                fput(parent->filp);
4846
4847                list_del_event(event, ctx);
4848                free_event(event);
4849        }
4850
4851        if (!list_empty(&ctx->group_list))
4852                goto again;
4853
4854        mutex_unlock(&ctx->mutex);
4855
4856        put_ctx(ctx);
4857}
4858
4859/*
4860 * Initialize the perf_event context in task_struct
4861 */
4862int perf_event_init_task(struct task_struct *child)
4863{
4864        struct perf_event_context *child_ctx, *parent_ctx;
4865        struct perf_event_context *cloned_ctx;
4866        struct perf_event *event;
4867        struct task_struct *parent = current;
4868        int inherited_all = 1;
4869        int ret = 0;
4870
4871        child->perf_event_ctxp = NULL;
4872
4873        mutex_init(&child->perf_event_mutex);
4874        INIT_LIST_HEAD(&child->perf_event_list);
4875
4876        if (likely(!parent->perf_event_ctxp))
4877                return 0;
4878
4879        /*
4880         * This is executed from the parent task context, so inherit
4881         * events that have been marked for cloning.
4882         * First allocate and initialize a context for the child.
4883         */
4884
4885        child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4886        if (!child_ctx)
4887                return -ENOMEM;
4888
4889        __perf_event_init_context(child_ctx, child);
4890        child->perf_event_ctxp = child_ctx;
4891        get_task_struct(child);
4892
4893        /*
4894         * If the parent's context is a clone, pin it so it won't get
4895         * swapped under us.
4896         */
4897        parent_ctx = perf_pin_task_context(parent);
4898
4899        /*
4900         * No need to check if parent_ctx != NULL here; since we saw
4901         * it non-NULL earlier, the only reason for it to become NULL
4902         * is if we exit, and since we're currently in the middle of
4903         * a fork we can't be exiting at the same time.
4904         */
4905
4906        /*
4907         * Lock the parent list. No need to lock the child - not PID
4908         * hashed yet and not running, so nobody can access it.
4909         */
4910        mutex_lock(&parent_ctx->mutex);
4911
4912        /*
4913         * We dont have to disable NMIs - we are only looking at
4914         * We don't have to disable NMIs - we are only looking at
4915         */
4916        list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
4917
4918                if (!event->attr.inherit) {
4919                        inherited_all = 0;
4920                        continue;
4921                }
4922
4923                ret = inherit_group(event, parent, parent_ctx,
4924                                             child, child_ctx);
4925                if (ret) {
4926                        inherited_all = 0;
4927                        break;
4928                }
4929        }
4930
4931        if (inherited_all) {
4932                /*
4933                 * Mark the child context as a clone of the parent
4934                 * context, or of whatever the parent is a clone of.
4935                 * Note that if the parent is a clone, it could get
4936                 * uncloned at any point, but that doesn't matter
4937                 * because the list of events and the generation
4938                 * count can't have changed since we took the mutex.
4939                 */
4940                cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4941                if (cloned_ctx) {
4942                        child_ctx->parent_ctx = cloned_ctx;
4943                        child_ctx->parent_gen = parent_ctx->parent_gen;
4944                } else {
4945                        child_ctx->parent_ctx = parent_ctx;
4946                        child_ctx->parent_gen = parent_ctx->generation;
4947                }
4948                get_ctx(child_ctx->parent_ctx);
4949        }
4950
4951        mutex_unlock(&parent_ctx->mutex);
4952
4953        perf_unpin_context(parent_ctx);
4954
4955        return ret;
4956}
4957
4958static void __cpuinit perf_event_init_cpu(int cpu)
4959{
4960        struct perf_cpu_context *cpuctx;
4961
4962        cpuctx = &per_cpu(perf_cpu_context, cpu);
4963        __perf_event_init_context(&cpuctx->ctx, NULL);
4964
4965        spin_lock(&perf_resource_lock);
4966        cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4967        spin_unlock(&perf_resource_lock);
4968
4969        hw_perf_event_setup(cpu);
4970}
4971
4972#ifdef CONFIG_HOTPLUG_CPU
4973static void __perf_event_exit_cpu(void *info)
4974{
4975        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4976        struct perf_event_context *ctx = &cpuctx->ctx;
4977        struct perf_event *event, *tmp;
4978
4979        list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
4980                __perf_event_remove_from_context(event);
4981}
4982static void perf_event_exit_cpu(int cpu)
4983{
4984        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4985        struct perf_event_context *ctx = &cpuctx->ctx;
4986
4987        mutex_lock(&ctx->mutex);
4988        smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
4989        mutex_unlock(&ctx->mutex);
4990}
4991#else
4992static inline void perf_event_exit_cpu(int cpu) { }
4993#endif
4994
4995static int __cpuinit
4996perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4997{
4998        unsigned int cpu = (long)hcpu;
4999
5000        switch (action) {
5001
5002        case CPU_UP_PREPARE:
5003        case CPU_UP_PREPARE_FROZEN:
5004                perf_event_init_cpu(cpu);
5005                break;
5006
5007        case CPU_ONLINE:
5008        case CPU_ONLINE_FROZEN:
5009                hw_perf_event_setup_online(cpu);
5010                break;
5011
5012        case CPU_DOWN_PREPARE:
5013        case CPU_DOWN_PREPARE_FROZEN:
5014                perf_event_exit_cpu(cpu);
5015                break;
5016
5017        default:
5018                break;
5019        }
5020
5021        return NOTIFY_OK;
5022}
5023
5024/*
5025 * This has to have a higher priority than migration_notifier in sched.c.
5026 */
5027static struct notifier_block __cpuinitdata perf_cpu_nb = {
5028        .notifier_call          = perf_cpu_notify,
5029        .priority               = 20,
5030};
5031
5032void __init perf_event_init(void)
5033{
5034        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5035                        (void *)(long)smp_processor_id());
5036        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5037                        (void *)(long)smp_processor_id());
5038        register_cpu_notifier(&perf_cpu_nb);
5039}
5040
5041static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5042{
5043        return sprintf(buf, "%d\n", perf_reserved_percpu);
5044}
5045
5046static ssize_t
5047perf_set_reserve_percpu(struct sysdev_class *class,
5048                        const char *buf,
5049                        size_t count)
5050{
5051        struct perf_cpu_context *cpuctx;
5052        unsigned long val;
5053        int err, cpu, mpt;
5054
5055        err = strict_strtoul(buf, 10, &val);
5056        if (err)
5057                return err;
5058        if (val > perf_max_events)
5059                return -EINVAL;
5060
5061        spin_lock(&perf_resource_lock);
5062        perf_reserved_percpu = val;
5063        for_each_online_cpu(cpu) {
5064                cpuctx = &per_cpu(perf_cpu_context, cpu);
5065                spin_lock_irq(&cpuctx->ctx.lock);
5066                mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5067                          perf_max_events - perf_reserved_percpu);
5068                cpuctx->max_pertask = mpt;
5069                spin_unlock_irq(&cpuctx->ctx.lock);
5070        }
5071        spin_unlock(&perf_resource_lock);
5072
5073        return count;
5074}
5075
5076static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5077{
5078        return sprintf(buf, "%d\n", perf_overcommit);
5079}
5080
5081static ssize_t
5082perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5083{
5084        unsigned long val;
5085        int err;
5086
5087        err = strict_strtoul(buf, 10, &val);
5088        if (err)
5089                return err;
5090        if (val > 1)
5091                return -EINVAL;
5092
5093        spin_lock(&perf_resource_lock);
5094        perf_overcommit = val;
5095        spin_unlock(&perf_resource_lock);
5096
5097        return count;
5098}
5099
5100static SYSDEV_CLASS_ATTR(
5101                                reserve_percpu,
5102                                0644,
5103                                perf_show_reserve_percpu,
5104                                perf_set_reserve_percpu
5105                        );
5106
5107static SYSDEV_CLASS_ATTR(
5108                                overcommit,
5109                                0644,
5110                                perf_show_overcommit,
5111                                perf_set_overcommit
5112                        );
5113
5114static struct attribute *perfclass_attrs[] = {
5115        &attr_reserve_percpu.attr,
5116        &attr_overcommit.attr,
5117        NULL
5118};
5119
5120static struct attribute_group perfclass_attr_group = {
5121        .attrs                  = perfclass_attrs,
5122        .name                   = "perf_events",
5123};
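
/*
 * With a sysdev class attribute group named "perf_events", these knobs
 * typically show up as /sys/devices/system/cpu/perf_events/reserve_percpu
 * and /sys/devices/system/cpu/perf_events/overcommit.  For example,
 * writing N to reserve_percpu lowers each CPU's max_pertask so that N
 * counters stay reserved for per-cpu events.
 */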
5124
5125static int __init perf_event_sysfs_init(void)
5126{
5127        return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5128                                  &perfclass_attr_group);
5129}
5130device_initcall(perf_event_sysfs_init);
5131