linux/arch/x86/kernel/cpu/perf_event.c
   1/*
   2 * Performance events x86 architecture code
   3 *
   4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6 *  Copyright (C) 2009 Jaswinder Singh Rajput
   7 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
   8 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   9 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
  10 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
  11 *
  12 *  For licencing details see kernel-base/COPYING
  13 */
  14
  15#include <linux/perf_event.h>
  16#include <linux/capability.h>
  17#include <linux/notifier.h>
  18#include <linux/hardirq.h>
  19#include <linux/kprobes.h>
  20#include <linux/module.h>
  21#include <linux/kdebug.h>
  22#include <linux/sched.h>
  23#include <linux/uaccess.h>
  24#include <linux/slab.h>
  25#include <linux/highmem.h>
  26#include <linux/cpu.h>
  27#include <linux/bitops.h>
  28
  29#include <asm/apic.h>
  30#include <asm/stacktrace.h>
  31#include <asm/nmi.h>
  32#include <asm/compat.h>
  33
  34#if 0
  35#undef wrmsrl
  36#define wrmsrl(msr, val)                                        \
  37do {                                                            \
  38        trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
  39                        (unsigned long)(val));                  \
  40        native_write_msr((msr), (u32)((u64)(val)),              \
  41                        (u32)((u64)(val) >> 32));               \
  42} while (0)
  43#endif
  44
  45/*
  46 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
  47 */
  48static unsigned long
  49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
  50{
  51        unsigned long offset, addr = (unsigned long)from;
  52        unsigned long size, len = 0;
  53        struct page *page;
  54        void *map;
  55        int ret;
  56
  57        do {
  58                ret = __get_user_pages_fast(addr, 1, 0, &page);
  59                if (!ret)
  60                        break;
  61
  62                offset = addr & (PAGE_SIZE - 1);
  63                size = min(PAGE_SIZE - offset, n - len);
  64
  65                map = kmap_atomic(page);
  66                memcpy(to, map+offset, size);
  67                kunmap_atomic(map);
  68                put_page(page);
  69
  70                len  += size;
  71                to   += size;
  72                addr += size;
  73
  74        } while (len < n);
  75
  76        return len;
  77}
  78
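/*
 * An event_constraint describes which counters an event may use:
 * @idxmsk/@idxmsk64: bitmask of counter indices the event may be placed on
 * @code/@cmask:      the constraint applies when (config & cmask) == code
 * @weight:           number of allowed counters (hweight of the index mask)
 */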
  79struct event_constraint {
  80        union {
  81                unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
  82                u64             idxmsk64;
  83        };
  84        u64     code;
  85        u64     cmask;
  86        int     weight;
  87};
  88
  89struct amd_nb {
  90        int nb_id;  /* NorthBridge id */
  91        int refcnt; /* reference count */
  92        struct perf_event *owners[X86_PMC_IDX_MAX];
  93        struct event_constraint event_constraints[X86_PMC_IDX_MAX];
  94};
  95
  96#define MAX_LBR_ENTRIES         16
  97
  98struct cpu_hw_events {
  99        /*
 100         * Generic x86 PMC bits
 101         */
 102        struct perf_event       *events[X86_PMC_IDX_MAX]; /* in counter order */
 103        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 104        unsigned long           running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 105        int                     enabled;
 106
 107        int                     n_events;
 108        int                     n_added;
 109        int                     n_txn;
 110        int                     assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 111        u64                     tags[X86_PMC_IDX_MAX];
 112        struct perf_event       *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
 113
 114        unsigned int            group_flag;
 115
 116        /*
 117         * Intel DebugStore bits
 118         */
 119        struct debug_store      *ds;
 120        u64                     pebs_enabled;
 121
 122        /*
 123         * Intel LBR bits
 124         */
 125        int                             lbr_users;
 126        void                            *lbr_context;
 127        struct perf_branch_stack        lbr_stack;
 128        struct perf_branch_entry        lbr_entries[MAX_LBR_ENTRIES];
 129
 130        /*
 131         * AMD specific bits
 132         */
 133        struct amd_nb           *amd_nb;
 134};
 135
 136#define __EVENT_CONSTRAINT(c, n, m, w) {\
 137        { .idxmsk64 = (n) },            \
 138        .code = (c),                    \
 139        .cmask = (m),                   \
 140        .weight = (w),                  \
 141}
 142
 143#define EVENT_CONSTRAINT(c, n, m)       \
 144        __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
 145
 146/*
 147 * Constraint on the Event code.
 148 */
 149#define INTEL_EVENT_CONSTRAINT(c, n)    \
 150        EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
 151
 152/*
 153 * Constraint on the Event code + UMask + fixed-mask
 154 *
 155 * filter mask to validate fixed counter events.
  156 * the following filters disqualify an event from fixed counters:
 157 *  - inv
 158 *  - edge
 159 *  - cnt-mask
 160 *  The other filters are supported by fixed counters.
 161 *  The any-thread option is supported starting with v3.
 162 */
 163#define FIXED_EVENT_CONSTRAINT(c, n)    \
 164        EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
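/*
 * The (1ULL << (32+n)) index mask selects fixed counter n: fixed counters
 * live at X86_PMC_IDX_FIXED (32) and up in the generic counter index space.
 * E.g. FIXED_EVENT_CONSTRAINT(0x00c0, 0) pins INST_RETIRED.ANY onto fixed
 * counter 0.
 */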
 165
 166/*
 167 * Constraint on the Event code + UMask
 168 */
 169#define PEBS_EVENT_CONSTRAINT(c, n)     \
 170        EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
 171
 172#define EVENT_CONSTRAINT_END            \
 173        EVENT_CONSTRAINT(0, 0, 0)
 174
 175#define for_each_event_constraint(e, c) \
 176        for ((e) = (c); (e)->weight; (e)++)
 177
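/*
 * Bit-field view of the read-only IA32_PERF_CAPABILITIES MSR (Intel);
 * it is read into x86_pmu.intel_cap by the Intel-specific setup code.
 */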
 178union perf_capabilities {
 179        struct {
 180                u64     lbr_format    : 6;
 181                u64     pebs_trap     : 1;
 182                u64     pebs_arch_reg : 1;
 183                u64     pebs_format   : 4;
 184                u64     smm_freeze    : 1;
 185        };
 186        u64     capabilities;
 187};
 188
 189/*
 190 * struct x86_pmu - generic x86 pmu
 191 */
 192struct x86_pmu {
 193        /*
 194         * Generic x86 PMC bits
 195         */
 196        const char      *name;
 197        int             version;
 198        int             (*handle_irq)(struct pt_regs *);
 199        void            (*disable_all)(void);
 200        void            (*enable_all)(int added);
 201        void            (*enable)(struct perf_event *);
 202        void            (*disable)(struct perf_event *);
 203        int             (*hw_config)(struct perf_event *event);
 204        int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 205        unsigned        eventsel;
 206        unsigned        perfctr;
 207        u64             (*event_map)(int);
 208        int             max_events;
 209        int             num_counters;
 210        int             num_counters_fixed;
 211        int             cntval_bits;
 212        u64             cntval_mask;
 213        int             apic;
 214        u64             max_period;
 215        struct event_constraint *
 216                        (*get_event_constraints)(struct cpu_hw_events *cpuc,
 217                                                 struct perf_event *event);
 218
 219        void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
 220                                                 struct perf_event *event);
 221        struct event_constraint *event_constraints;
 222        void            (*quirks)(void);
 223        int             perfctr_second_write;
 224
 225        int             (*cpu_prepare)(int cpu);
 226        void            (*cpu_starting)(int cpu);
 227        void            (*cpu_dying)(int cpu);
 228        void            (*cpu_dead)(int cpu);
 229
 230        /*
 231         * Intel Arch Perfmon v2+
 232         */
 233        u64                     intel_ctrl;
 234        union perf_capabilities intel_cap;
 235
 236        /*
 237         * Intel DebugStore bits
 238         */
 239        int             bts, pebs;
 240        int             bts_active, pebs_active;
 241        int             pebs_record_size;
 242        void            (*drain_pebs)(struct pt_regs *regs);
 243        struct event_constraint *pebs_constraints;
 244
 245        /*
 246         * Intel LBR
 247         */
 248        unsigned long   lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
 249        int             lbr_nr;                    /* hardware stack size */
 250};
 251
 252static struct x86_pmu x86_pmu __read_mostly;
 253
 254static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
 255        .enabled = 1,
 256};
 257
 258static int x86_perf_event_set_period(struct perf_event *event);
 259
 260/*
 261 * Generalized hw caching related hw_event table, filled
 262 * in on a per model basis. A value of 0 means
 263 * 'not supported', -1 means 'hw_event makes no sense on
 264 * this CPU', any other value means the raw hw_event
 265 * ID.
 266 */
 267
 268#define C(x) PERF_COUNT_HW_CACHE_##x
 269
 270static u64 __read_mostly hw_cache_event_ids
 271                                [PERF_COUNT_HW_CACHE_MAX]
 272                                [PERF_COUNT_HW_CACHE_OP_MAX]
 273                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
 274
 275/*
 276 * Propagate event elapsed time into the generic event.
 277 * Can only be executed on the CPU where the event is active.
 278 * Returns the delta events processed.
 279 */
 280static u64
 281x86_perf_event_update(struct perf_event *event)
 282{
 283        struct hw_perf_event *hwc = &event->hw;
 284        int shift = 64 - x86_pmu.cntval_bits;
 285        u64 prev_raw_count, new_raw_count;
 286        int idx = hwc->idx;
 287        s64 delta;
 288
 289        if (idx == X86_PMC_IDX_FIXED_BTS)
 290                return 0;
 291
 292        /*
 293         * Careful: an NMI might modify the previous event value.
 294         *
 295         * Our tactic to handle this is to first atomically read and
 296         * exchange a new raw count - then add that new-prev delta
 297         * count to the generic event atomically:
 298         */
 299again:
 300        prev_raw_count = local64_read(&hwc->prev_count);
 301        rdmsrl(hwc->event_base + idx, new_raw_count);
 302
 303        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 304                                        new_raw_count) != prev_raw_count)
 305                goto again;
 306
 307        /*
 308         * Now we have the new raw value and have updated the prev
 309         * timestamp already. We can now calculate the elapsed delta
 310         * (event-)time and add that to the generic event.
 311         *
 312         * Careful, not all hw sign-extends above the physical width
 313         * of the count.
 314         */
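        /*
         * The shift pair masks off bits above cntval_bits and handles
         * counter wrap-around.  E.g. with 48-bit counters (shift == 16),
         * prev == 0xffffffffffff and new == 0x2 give
         * (0x2 << 16) - (0xffffffffffff << 16) == 0x30000 as an s64,
         * and delta >>= 16 recovers the true delta of 3.
         */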
 315        delta = (new_raw_count << shift) - (prev_raw_count << shift);
 316        delta >>= shift;
 317
 318        local64_add(delta, &event->count);
 319        local64_sub(delta, &hwc->period_left);
 320
 321        return new_raw_count;
 322}
 323
 324static atomic_t active_events;
 325static DEFINE_MUTEX(pmc_reserve_mutex);
 326
 327#ifdef CONFIG_X86_LOCAL_APIC
 328
 329static bool reserve_pmc_hardware(void)
 330{
 331        int i;
 332
 333        for (i = 0; i < x86_pmu.num_counters; i++) {
 334                if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 335                        goto perfctr_fail;
 336        }
 337
 338        for (i = 0; i < x86_pmu.num_counters; i++) {
 339                if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 340                        goto eventsel_fail;
 341        }
 342
 343        return true;
 344
 345eventsel_fail:
 346        for (i--; i >= 0; i--)
 347                release_evntsel_nmi(x86_pmu.eventsel + i);
 348
 349        i = x86_pmu.num_counters;
 350
 351perfctr_fail:
 352        for (i--; i >= 0; i--)
 353                release_perfctr_nmi(x86_pmu.perfctr + i);
 354
 355        return false;
 356}
 357
 358static void release_pmc_hardware(void)
 359{
 360        int i;
 361
 362        for (i = 0; i < x86_pmu.num_counters; i++) {
 363                release_perfctr_nmi(x86_pmu.perfctr + i);
 364                release_evntsel_nmi(x86_pmu.eventsel + i);
 365        }
 366}
 367
 368#else
 369
 370static bool reserve_pmc_hardware(void) { return true; }
 371static void release_pmc_hardware(void) {}
 372
 373#endif
 374
 375static bool check_hw_exists(void)
 376{
 377        u64 val, val_new = 0;
 378        int i, reg, ret = 0;
 379
 380        /*
  381         * Check to see if the BIOS enabled any of the counters; if so,
 382         * complain and bail.
 383         */
 384        for (i = 0; i < x86_pmu.num_counters; i++) {
 385                reg = x86_pmu.eventsel + i;
 386                ret = rdmsrl_safe(reg, &val);
 387                if (ret)
 388                        goto msr_fail;
 389                if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
 390                        goto bios_fail;
 391        }
 392
 393        if (x86_pmu.num_counters_fixed) {
 394                reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
 395                ret = rdmsrl_safe(reg, &val);
 396                if (ret)
 397                        goto msr_fail;
 398                for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
 399                        if (val & (0x03 << i*4))
 400                                goto bios_fail;
 401                }
 402        }
 403
 404        /*
  405         * Now write a value and read it back to see if it matches;
 406         * this is needed to detect certain hardware emulators (qemu/kvm)
 407         * that don't trap on the MSR access and always return 0s.
 408         */
 409        val = 0xabcdUL;
 410        ret = checking_wrmsrl(x86_pmu.perfctr, val);
 411        ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
 412        if (ret || val != val_new)
 413                goto msr_fail;
 414
 415        return true;
 416
 417bios_fail:
 418        printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
 419        printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
 420        return false;
 421
 422msr_fail:
 423        printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
 424        return false;
 425}
 426
 427static void reserve_ds_buffers(void);
 428static void release_ds_buffers(void);
 429
 430static void hw_perf_event_destroy(struct perf_event *event)
 431{
 432        if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
 433                release_pmc_hardware();
 434                release_ds_buffers();
 435                mutex_unlock(&pmc_reserve_mutex);
 436        }
 437}
 438
 439static inline int x86_pmu_initialized(void)
 440{
 441        return x86_pmu.handle_irq != NULL;
 442}
 443
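/*
 * Translate a PERF_TYPE_HW_CACHE attr->config, encoded by the perf ABI as
 * (cache_id) | (op_id << 8) | (result_id << 16), into the model-specific
 * raw event stored in hw_cache_event_ids[][][].
 */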
 444static inline int
 445set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 446{
 447        unsigned int cache_type, cache_op, cache_result;
 448        u64 config, val;
 449
 450        config = attr->config;
 451
 452        cache_type = (config >>  0) & 0xff;
 453        if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
 454                return -EINVAL;
 455
 456        cache_op = (config >>  8) & 0xff;
 457        if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
 458                return -EINVAL;
 459
 460        cache_result = (config >> 16) & 0xff;
 461        if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
 462                return -EINVAL;
 463
 464        val = hw_cache_event_ids[cache_type][cache_op][cache_result];
 465
 466        if (val == 0)
 467                return -ENOENT;
 468
 469        if (val == -1)
 470                return -EINVAL;
 471
 472        hwc->config |= val;
 473
 474        return 0;
 475}
 476
 477static int x86_setup_perfctr(struct perf_event *event)
 478{
 479        struct perf_event_attr *attr = &event->attr;
 480        struct hw_perf_event *hwc = &event->hw;
 481        u64 config;
 482
 483        if (!is_sampling_event(event)) {
 484                hwc->sample_period = x86_pmu.max_period;
 485                hwc->last_period = hwc->sample_period;
 486                local64_set(&hwc->period_left, hwc->sample_period);
 487        } else {
 488                /*
 489                 * If we have a PMU initialized but no APIC
 490                 * interrupts, we cannot sample hardware
 491                 * events (user-space has to fall back and
 492                 * sample via a hrtimer based software event):
 493                 */
 494                if (!x86_pmu.apic)
 495                        return -EOPNOTSUPP;
 496        }
 497
 498        if (attr->type == PERF_TYPE_RAW)
 499                return 0;
 500
 501        if (attr->type == PERF_TYPE_HW_CACHE)
 502                return set_ext_hw_attr(hwc, attr);
 503
 504        if (attr->config >= x86_pmu.max_events)
 505                return -EINVAL;
 506
 507        /*
 508         * The generic map:
 509         */
 510        config = x86_pmu.event_map(attr->config);
 511
 512        if (config == 0)
 513                return -ENOENT;
 514
 515        if (config == -1LL)
 516                return -EINVAL;
 517
 518        /*
 519         * Branch tracing:
 520         */
 521        if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
 522            (hwc->sample_period == 1)) {
 523                /* BTS is not supported by this architecture. */
 524                if (!x86_pmu.bts_active)
 525                        return -EOPNOTSUPP;
 526
 527                /* BTS is currently only allowed for user-mode. */
 528                if (!attr->exclude_kernel)
 529                        return -EOPNOTSUPP;
 530        }
 531
 532        hwc->config |= config;
 533
 534        return 0;
 535}
 536
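/*
 * Build the base hardware config for an event.  The supported precise_ip
 * level is 1 when PEBS is available (constant skid) and 2 when the LBR can
 * additionally be used to fix up the sampled IP.
 */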
 537static int x86_pmu_hw_config(struct perf_event *event)
 538{
 539        if (event->attr.precise_ip) {
 540                int precise = 0;
 541
 542                /* Support for constant skid */
 543                if (x86_pmu.pebs_active) {
 544                        precise++;
 545
 546                        /* Support for IP fixup */
 547                        if (x86_pmu.lbr_nr)
 548                                precise++;
 549                }
 550
 551                if (event->attr.precise_ip > precise)
 552                        return -EOPNOTSUPP;
 553        }
 554
 555        /*
 556         * Generate PMC IRQs:
 557         * (keep 'enabled' bit clear for now)
 558         */
 559        event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
 560
 561        /*
 562         * Count user and OS events unless requested not to
 563         */
 564        if (!event->attr.exclude_user)
 565                event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
 566        if (!event->attr.exclude_kernel)
 567                event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 568
 569        if (event->attr.type == PERF_TYPE_RAW)
 570                event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 571
 572        return x86_setup_perfctr(event);
 573}
 574
 575/*
 576 * Setup the hardware configuration for a given attr_type
 577 */
 578static int __x86_pmu_event_init(struct perf_event *event)
 579{
 580        int err;
 581
 582        if (!x86_pmu_initialized())
 583                return -ENODEV;
 584
 585        err = 0;
 586        if (!atomic_inc_not_zero(&active_events)) {
 587                mutex_lock(&pmc_reserve_mutex);
 588                if (atomic_read(&active_events) == 0) {
 589                        if (!reserve_pmc_hardware())
 590                                err = -EBUSY;
 591                        else
 592                                reserve_ds_buffers();
 593                }
 594                if (!err)
 595                        atomic_inc(&active_events);
 596                mutex_unlock(&pmc_reserve_mutex);
 597        }
 598        if (err)
 599                return err;
 600
 601        event->destroy = hw_perf_event_destroy;
 602
 603        event->hw.idx = -1;
 604        event->hw.last_cpu = -1;
 605        event->hw.last_tag = ~0ULL;
 606
 607        return x86_pmu.hw_config(event);
 608}
 609
 610static void x86_pmu_disable_all(void)
 611{
 612        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 613        int idx;
 614
 615        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 616                u64 val;
 617
 618                if (!test_bit(idx, cpuc->active_mask))
 619                        continue;
 620                rdmsrl(x86_pmu.eventsel + idx, val);
 621                if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
 622                        continue;
 623                val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
 624                wrmsrl(x86_pmu.eventsel + idx, val);
 625        }
 626}
 627
 628static void x86_pmu_disable(struct pmu *pmu)
 629{
 630        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 631
 632        if (!x86_pmu_initialized())
 633                return;
 634
 635        if (!cpuc->enabled)
 636                return;
 637
 638        cpuc->n_added = 0;
 639        cpuc->enabled = 0;
 640        barrier();
 641
 642        x86_pmu.disable_all();
 643}
 644
 645static void x86_pmu_enable_all(int added)
 646{
 647        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 648        int idx;
 649
 650        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 651                struct perf_event *event = cpuc->events[idx];
 652                u64 val;
 653
 654                if (!test_bit(idx, cpuc->active_mask))
 655                        continue;
 656
 657                val = event->hw.config;
 658                val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 659                wrmsrl(x86_pmu.eventsel + idx, val);
 660        }
 661}
 662
 663static struct pmu pmu;
 664
 665static inline int is_x86_event(struct perf_event *event)
 666{
 667        return event->pmu == &pmu;
 668}
 669
 670static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 671{
 672        struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
 673        unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 674        int i, j, w, wmax, num = 0;
 675        struct hw_perf_event *hwc;
 676
 677        bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 678
 679        for (i = 0; i < n; i++) {
 680                c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
 681                constraints[i] = c;
 682        }
 683
 684        /*
 685         * fastpath, try to reuse previous register
 686         */
 687        for (i = 0; i < n; i++) {
 688                hwc = &cpuc->event_list[i]->hw;
 689                c = constraints[i];
 690
 691                /* never assigned */
 692                if (hwc->idx == -1)
 693                        break;
 694
 695                /* constraint still honored */
 696                if (!test_bit(hwc->idx, c->idxmsk))
 697                        break;
 698
 699                /* not already used */
 700                if (test_bit(hwc->idx, used_mask))
 701                        break;
 702
 703                __set_bit(hwc->idx, used_mask);
 704                if (assign)
 705                        assign[i] = hwc->idx;
 706        }
 707        if (i == n)
 708                goto done;
 709
 710        /*
 711         * begin slow path
 712         */
 713
 714        bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 715
 716        /*
 717         * weight = number of possible counters
 718         *
 719         * 1    = most constrained, only works on one counter
 720         * wmax = least constrained, works on any counter
 721         *
 722         * assign events to counters starting with most
 723         * constrained events.
 724         */
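        /*
         * E.g. an event that is valid on only one counter (weight 1) is
         * placed before a fully unconstrained event (weight == number of
         * generic counters), so the constrained event is not starved of
         * its only possible slot.
         */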
 725        wmax = x86_pmu.num_counters;
 726
 727        /*
 728         * when fixed event counters are present,
 729         * wmax is incremented by 1 to account
 730         * for one more choice
 731         */
 732        if (x86_pmu.num_counters_fixed)
 733                wmax++;
 734
 735        for (w = 1, num = n; num && w <= wmax; w++) {
 736                /* for each event */
 737                for (i = 0; num && i < n; i++) {
 738                        c = constraints[i];
 739                        hwc = &cpuc->event_list[i]->hw;
 740
 741                        if (c->weight != w)
 742                                continue;
 743
 744                        for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
 745                                if (!test_bit(j, used_mask))
 746                                        break;
 747                        }
 748
 749                        if (j == X86_PMC_IDX_MAX)
 750                                break;
 751
 752                        __set_bit(j, used_mask);
 753
 754                        if (assign)
 755                                assign[i] = j;
 756                        num--;
 757                }
 758        }
 759done:
 760        /*
 761         * scheduling failed or is just a simulation,
 762         * free resources if necessary
 763         */
 764        if (!assign || num) {
 765                for (i = 0; i < n; i++) {
 766                        if (x86_pmu.put_event_constraints)
 767                                x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
 768                }
 769        }
 770        return num ? -ENOSPC : 0;
 771}
 772
 773/*
  774 * dogrp: true if we must collect sibling events (group)
  775 * returns the total number of events, or a negative error code
 776 */
 777static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
 778{
 779        struct perf_event *event;
 780        int n, max_count;
 781
 782        max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
 783
 784        /* current number of events already accepted */
 785        n = cpuc->n_events;
 786
 787        if (is_x86_event(leader)) {
 788                if (n >= max_count)
 789                        return -ENOSPC;
 790                cpuc->event_list[n] = leader;
 791                n++;
 792        }
 793        if (!dogrp)
 794                return n;
 795
 796        list_for_each_entry(event, &leader->sibling_list, group_entry) {
 797                if (!is_x86_event(event) ||
 798                    event->state <= PERF_EVENT_STATE_OFF)
 799                        continue;
 800
 801                if (n >= max_count)
 802                        return -ENOSPC;
 803
 804                cpuc->event_list[n] = event;
 805                n++;
 806        }
 807        return n;
 808}
 809
 810static inline void x86_assign_hw_event(struct perf_event *event,
 811                                struct cpu_hw_events *cpuc, int i)
 812{
 813        struct hw_perf_event *hwc = &event->hw;
 814
 815        hwc->idx = cpuc->assign[i];
 816        hwc->last_cpu = smp_processor_id();
 817        hwc->last_tag = ++cpuc->tags[i];
 818
 819        if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
 820                hwc->config_base = 0;
 821                hwc->event_base = 0;
 822        } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
 823                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
 824                /*
 825                 * We set it so that event_base + idx in wrmsr/rdmsr maps to
 826                 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
 827                 */
 828                hwc->event_base =
 829                        MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
 830        } else {
 831                hwc->config_base = x86_pmu.eventsel;
 832                hwc->event_base  = x86_pmu.perfctr;
 833        }
 834}
 835
 836static inline int match_prev_assignment(struct hw_perf_event *hwc,
 837                                        struct cpu_hw_events *cpuc,
 838                                        int i)
 839{
 840        return hwc->idx == cpuc->assign[i] &&
 841                hwc->last_cpu == smp_processor_id() &&
 842                hwc->last_tag == cpuc->tags[i];
 843}
 844
 845static void x86_pmu_start(struct perf_event *event, int flags);
 846static void x86_pmu_stop(struct perf_event *event, int flags);
 847
 848static void x86_pmu_enable(struct pmu *pmu)
 849{
 850        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 851        struct perf_event *event;
 852        struct hw_perf_event *hwc;
 853        int i, added = cpuc->n_added;
 854
 855        if (!x86_pmu_initialized())
 856                return;
 857
 858        if (cpuc->enabled)
 859                return;
 860
 861        if (cpuc->n_added) {
 862                int n_running = cpuc->n_events - cpuc->n_added;
 863                /*
 864                 * apply assignment obtained either from
 865                 * hw_perf_group_sched_in() or x86_pmu_enable()
 866                 *
 867                 * step1: save events moving to new counters
 868                 * step2: reprogram moved events into new counters
 869                 */
 870                for (i = 0; i < n_running; i++) {
 871                        event = cpuc->event_list[i];
 872                        hwc = &event->hw;
 873
 874                        /*
 875                         * we can avoid reprogramming counter if:
 876                         * - assigned same counter as last time
 877                         * - running on same CPU as last time
 878                         * - no other event has used the counter since
 879                         */
 880                        if (hwc->idx == -1 ||
 881                            match_prev_assignment(hwc, cpuc, i))
 882                                continue;
 883
 884                        /*
 885                         * Ensure we don't accidentally enable a stopped
 886                         * counter simply because we rescheduled.
 887                         */
 888                        if (hwc->state & PERF_HES_STOPPED)
 889                                hwc->state |= PERF_HES_ARCH;
 890
 891                        x86_pmu_stop(event, PERF_EF_UPDATE);
 892                }
 893
 894                for (i = 0; i < cpuc->n_events; i++) {
 895                        event = cpuc->event_list[i];
 896                        hwc = &event->hw;
 897
 898                        if (!match_prev_assignment(hwc, cpuc, i))
 899                                x86_assign_hw_event(event, cpuc, i);
 900                        else if (i < n_running)
 901                                continue;
 902
 903                        if (hwc->state & PERF_HES_ARCH)
 904                                continue;
 905
 906                        x86_pmu_start(event, PERF_EF_RELOAD);
 907                }
 908                cpuc->n_added = 0;
 909                perf_events_lapic_init();
 910        }
 911
 912        cpuc->enabled = 1;
 913        barrier();
 914
 915        x86_pmu.enable_all(added);
 916}
 917
 918static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 919                                          u64 enable_mask)
 920{
 921        wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
 922}
 923
 924static inline void x86_pmu_disable_event(struct perf_event *event)
 925{
 926        struct hw_perf_event *hwc = &event->hw;
 927
 928        wrmsrl(hwc->config_base + hwc->idx, hwc->config);
 929}
 930
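/*
 * Remaining sample period most recently programmed into each counter;
 * recorded by x86_perf_event_set_period() and dumped by
 * perf_event_print_debug().
 */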
 931static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
 932
 933/*
 934 * Set the next IRQ period, based on the hwc->period_left value.
 935 * To be called with the event disabled in hw:
 936 */
 937static int
 938x86_perf_event_set_period(struct perf_event *event)
 939{
 940        struct hw_perf_event *hwc = &event->hw;
 941        s64 left = local64_read(&hwc->period_left);
 942        s64 period = hwc->sample_period;
 943        int ret = 0, idx = hwc->idx;
 944
 945        if (idx == X86_PMC_IDX_FIXED_BTS)
 946                return 0;
 947
 948        /*
 949         * If we are way outside a reasonable range then just skip forward:
 950         */
 951        if (unlikely(left <= -period)) {
 952                left = period;
 953                local64_set(&hwc->period_left, left);
 954                hwc->last_period = period;
 955                ret = 1;
 956        }
 957
 958        if (unlikely(left <= 0)) {
 959                left += period;
 960                local64_set(&hwc->period_left, left);
 961                hwc->last_period = period;
 962                ret = 1;
 963        }
 964        /*
  965         * Quirk: certain CPUs don't like it if just 1 hw_event is left:
 966         */
 967        if (unlikely(left < 2))
 968                left = 2;
 969
 970        if (left > x86_pmu.max_period)
 971                left = x86_pmu.max_period;
 972
 973        per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 974
 975        /*
 976         * The hw event starts counting from this event offset,
  977         * mark it to be able to extract future deltas:
 978         */
 979        local64_set(&hwc->prev_count, (u64)-left);
 980
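        /*
         * The counter counts up and raises an interrupt on overflow, so
         * programming -left (masked to the counter width) makes it
         * overflow after exactly 'left' more increments.
         */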
 981        wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
 982
 983        /*
  984         * Due to an erratum on certain CPUs, we need
 985         * a second write to be sure the register
 986         * is updated properly
 987         */
 988        if (x86_pmu.perfctr_second_write) {
 989                wrmsrl(hwc->event_base + idx,
 990                        (u64)(-left) & x86_pmu.cntval_mask);
 991        }
 992
 993        perf_event_update_userpage(event);
 994
 995        return ret;
 996}
 997
 998static void x86_pmu_enable_event(struct perf_event *event)
 999{
1000        if (__this_cpu_read(cpu_hw_events.enabled))
1001                __x86_pmu_enable_event(&event->hw,
1002                                       ARCH_PERFMON_EVENTSEL_ENABLE);
1003}
1004
1005/*
1006 * Add a single event to the PMU.
1007 *
1008 * The event is added to the group of enabled events
 1009 * but only if it can be scheduled with existing events.
1010 */
1011static int x86_pmu_add(struct perf_event *event, int flags)
1012{
1013        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1014        struct hw_perf_event *hwc;
1015        int assign[X86_PMC_IDX_MAX];
1016        int n, n0, ret;
1017
1018        hwc = &event->hw;
1019
1020        perf_pmu_disable(event->pmu);
1021        n0 = cpuc->n_events;
1022        ret = n = collect_events(cpuc, event, false);
1023        if (ret < 0)
1024                goto out;
1025
1026        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1027        if (!(flags & PERF_EF_START))
1028                hwc->state |= PERF_HES_ARCH;
1029
1030        /*
1031         * If group events scheduling transaction was started,
 1032         * skip the schedulability test here; it will be performed
1033         * at commit time (->commit_txn) as a whole
1034         */
1035        if (cpuc->group_flag & PERF_EVENT_TXN)
1036                goto done_collect;
1037
1038        ret = x86_pmu.schedule_events(cpuc, n, assign);
1039        if (ret)
1040                goto out;
1041        /*
 1042         * copy the new assignment now that we know it is possible;
 1043         * it will be used by x86_pmu_enable()
1044         */
1045        memcpy(cpuc->assign, assign, n*sizeof(int));
1046
1047done_collect:
1048        cpuc->n_events = n;
1049        cpuc->n_added += n - n0;
1050        cpuc->n_txn += n - n0;
1051
1052        ret = 0;
1053out:
1054        perf_pmu_enable(event->pmu);
1055        return ret;
1056}
1057
1058static void x86_pmu_start(struct perf_event *event, int flags)
1059{
1060        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1061        int idx = event->hw.idx;
1062
1063        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1064                return;
1065
1066        if (WARN_ON_ONCE(idx == -1))
1067                return;
1068
1069        if (flags & PERF_EF_RELOAD) {
1070                WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1071                x86_perf_event_set_period(event);
1072        }
1073
1074        event->hw.state = 0;
1075
1076        cpuc->events[idx] = event;
1077        __set_bit(idx, cpuc->active_mask);
1078        __set_bit(idx, cpuc->running);
1079        x86_pmu.enable(event);
1080        perf_event_update_userpage(event);
1081}
1082
1083void perf_event_print_debug(void)
1084{
1085        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1086        u64 pebs;
1087        struct cpu_hw_events *cpuc;
1088        unsigned long flags;
1089        int cpu, idx;
1090
1091        if (!x86_pmu.num_counters)
1092                return;
1093
1094        local_irq_save(flags);
1095
1096        cpu = smp_processor_id();
1097        cpuc = &per_cpu(cpu_hw_events, cpu);
1098
1099        if (x86_pmu.version >= 2) {
1100                rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1101                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1102                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1103                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1104                rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1105
1106                pr_info("\n");
1107                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1108                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1109                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1110                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1111                pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1112        }
1113        pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1114
1115        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1116                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1117                rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
1118
1119                prev_left = per_cpu(pmc_prev_left[idx], cpu);
1120
1121                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1122                        cpu, idx, pmc_ctrl);
1123                pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1124                        cpu, idx, pmc_count);
1125                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1126                        cpu, idx, prev_left);
1127        }
1128        for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1129                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1130
1131                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1132                        cpu, idx, pmc_count);
1133        }
1134        local_irq_restore(flags);
1135}
1136
1137static void x86_pmu_stop(struct perf_event *event, int flags)
1138{
1139        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1140        struct hw_perf_event *hwc = &event->hw;
1141
1142        if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1143                x86_pmu.disable(event);
1144                cpuc->events[hwc->idx] = NULL;
1145                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1146                hwc->state |= PERF_HES_STOPPED;
1147        }
1148
1149        if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1150                /*
 1151                 * Drain the remaining delta count out of an event
1152                 * that we are disabling:
1153                 */
1154                x86_perf_event_update(event);
1155                hwc->state |= PERF_HES_UPTODATE;
1156        }
1157}
1158
1159static void x86_pmu_del(struct perf_event *event, int flags)
1160{
1161        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1162        int i;
1163
1164        /*
1165         * If we're called during a txn, we don't need to do anything.
1166         * The events never got scheduled and ->cancel_txn will truncate
1167         * the event_list.
1168         */
1169        if (cpuc->group_flag & PERF_EVENT_TXN)
1170                return;
1171
1172        x86_pmu_stop(event, PERF_EF_UPDATE);
1173
1174        for (i = 0; i < cpuc->n_events; i++) {
1175                if (event == cpuc->event_list[i]) {
1176
1177                        if (x86_pmu.put_event_constraints)
1178                                x86_pmu.put_event_constraints(cpuc, event);
1179
1180                        while (++i < cpuc->n_events)
1181                                cpuc->event_list[i-1] = cpuc->event_list[i];
1182
1183                        --cpuc->n_events;
1184                        break;
1185                }
1186        }
1187        perf_event_update_userpage(event);
1188}
1189
1190static int x86_pmu_handle_irq(struct pt_regs *regs)
1191{
1192        struct perf_sample_data data;
1193        struct cpu_hw_events *cpuc;
1194        struct perf_event *event;
1195        int idx, handled = 0;
1196        u64 val;
1197
1198        perf_sample_data_init(&data, 0);
1199
1200        cpuc = &__get_cpu_var(cpu_hw_events);
1201
1202        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1203                if (!test_bit(idx, cpuc->active_mask)) {
1204                        /*
 1205                         * Though we deactivated the counter, some CPUs
 1206                         * might still deliver spurious interrupts that
 1207                         * are already in flight. Catch them:
1208                         */
1209                        if (__test_and_clear_bit(idx, cpuc->running))
1210                                handled++;
1211                        continue;
1212                }
1213
1214                event = cpuc->events[idx];
1215
1216                val = x86_perf_event_update(event);
1217                if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1218                        continue;
1219
1220                /*
1221                 * event overflow
1222                 */
1223                handled++;
1224                data.period     = event->hw.last_period;
1225
1226                if (!x86_perf_event_set_period(event))
1227                        continue;
1228
1229                if (perf_event_overflow(event, 1, &data, regs))
1230                        x86_pmu_stop(event, 0);
1231        }
1232
1233        if (handled)
1234                inc_irq_stat(apic_perf_irqs);
1235
1236        return handled;
1237}
1238
1239void perf_events_lapic_init(void)
1240{
1241        if (!x86_pmu.apic || !x86_pmu_initialized())
1242                return;
1243
1244        /*
1245         * Always use NMI for PMU
1246         */
1247        apic_write(APIC_LVTPC, APIC_DM_NMI);
1248}
1249
1250struct pmu_nmi_state {
1251        unsigned int    marked;
1252        int             handled;
1253};
1254
1255static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1256
1257static int __kprobes
1258perf_event_nmi_handler(struct notifier_block *self,
1259                         unsigned long cmd, void *__args)
1260{
1261        struct die_args *args = __args;
1262        unsigned int this_nmi;
1263        int handled;
1264
1265        if (!atomic_read(&active_events))
1266                return NOTIFY_DONE;
1267
1268        switch (cmd) {
1269        case DIE_NMI:
1270                break;
1271        case DIE_NMIUNKNOWN:
1272                this_nmi = percpu_read(irq_stat.__nmi_count);
1273                if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1274                        /* let the kernel handle the unknown nmi */
1275                        return NOTIFY_DONE;
1276                /*
1277                 * This one is a PMU back-to-back nmi. Two events
1278                 * trigger 'simultaneously' raising two back-to-back
1279                 * NMIs. If the first NMI handles both, the latter
1280                 * will be empty and daze the CPU. So, we drop it to
1281                 * avoid false-positive 'unknown nmi' messages.
1282                 */
1283                return NOTIFY_STOP;
1284        default:
1285                return NOTIFY_DONE;
1286        }
1287
1288        apic_write(APIC_LVTPC, APIC_DM_NMI);
1289
1290        handled = x86_pmu.handle_irq(args->regs);
1291        if (!handled)
1292                return NOTIFY_DONE;
1293
1294        this_nmi = percpu_read(irq_stat.__nmi_count);
1295        if ((handled > 1) ||
1296                /* the next nmi could be a back-to-back nmi */
1297            ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1298             (__this_cpu_read(pmu_nmi.handled) > 1))) {
1299                /*
1300                 * We could have two subsequent back-to-back nmis: The
1301                 * first handles more than one counter, the 2nd
1302                 * handles only one counter and the 3rd handles no
1303                 * counter.
1304                 *
1305                 * This is the 2nd nmi because the previous was
1306                 * handling more than one counter. We will mark the
1307                 * next (3rd) and then drop it if unhandled.
1308                 */
1309                __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1310                __this_cpu_write(pmu_nmi.handled, handled);
1311        }
1312
1313        return NOTIFY_STOP;
1314}
1315
1316static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1317        .notifier_call          = perf_event_nmi_handler,
1318        .next                   = NULL,
1319        .priority               = NMI_LOCAL_LOW_PRIOR,
1320};
1321
1322static struct event_constraint unconstrained;
1323static struct event_constraint emptyconstraint;
1324
1325static struct event_constraint *
1326x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1327{
1328        struct event_constraint *c;
1329
1330        if (x86_pmu.event_constraints) {
1331                for_each_event_constraint(c, x86_pmu.event_constraints) {
1332                        if ((event->hw.config & c->cmask) == c->code)
1333                                return c;
1334                }
1335        }
1336
1337        return &unconstrained;
1338}
1339
1340#include "perf_event_amd.c"
1341#include "perf_event_p6.c"
1342#include "perf_event_p4.c"
1343#include "perf_event_intel_lbr.c"
1344#include "perf_event_intel_ds.c"
1345#include "perf_event_intel.c"
1346
1347static int __cpuinit
1348x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1349{
1350        unsigned int cpu = (long)hcpu;
1351        int ret = NOTIFY_OK;
1352
1353        switch (action & ~CPU_TASKS_FROZEN) {
1354        case CPU_UP_PREPARE:
1355                if (x86_pmu.cpu_prepare)
1356                        ret = x86_pmu.cpu_prepare(cpu);
1357                break;
1358
1359        case CPU_STARTING:
1360                if (x86_pmu.cpu_starting)
1361                        x86_pmu.cpu_starting(cpu);
1362                break;
1363
1364        case CPU_DYING:
1365                if (x86_pmu.cpu_dying)
1366                        x86_pmu.cpu_dying(cpu);
1367                break;
1368
1369        case CPU_UP_CANCELED:
1370        case CPU_DEAD:
1371                if (x86_pmu.cpu_dead)
1372                        x86_pmu.cpu_dead(cpu);
1373                break;
1374
1375        default:
1376                break;
1377        }
1378
1379        return ret;
1380}
1381
1382static void __init pmu_check_apic(void)
1383{
1384        if (cpu_has_apic)
1385                return;
1386
1387        x86_pmu.apic = 0;
1388        pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1389        pr_info("no hardware sampling interrupt available.\n");
1390}
1391
1392int __init init_hw_perf_events(void)
1393{
1394        struct event_constraint *c;
1395        int err;
1396
1397        pr_info("Performance Events: ");
1398
1399        switch (boot_cpu_data.x86_vendor) {
1400        case X86_VENDOR_INTEL:
1401                err = intel_pmu_init();
1402                break;
1403        case X86_VENDOR_AMD:
1404                err = amd_pmu_init();
1405                break;
1406        default:
1407                return 0;
1408        }
1409        if (err != 0) {
1410                pr_cont("no PMU driver, software events only.\n");
1411                return 0;
1412        }
1413
1414        pmu_check_apic();
1415
1416        /* sanity check that the hardware exists or is emulated */
1417        if (!check_hw_exists())
1418                return 0;
1419
1420        pr_cont("%s PMU driver.\n", x86_pmu.name);
1421
1422        if (x86_pmu.quirks)
1423                x86_pmu.quirks();
1424
1425        if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1426                WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1427                     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1428                x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1429        }
1430        x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1431
1432        if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1433                WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1434                     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1435                x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1436        }
1437
1438        x86_pmu.intel_ctrl |=
1439                ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1440
1441        perf_events_lapic_init();
1442        register_die_notifier(&perf_event_nmi_notifier);
1443
1444        unconstrained = (struct event_constraint)
1445                __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1446                                   0, x86_pmu.num_counters);
1447
1448        if (x86_pmu.event_constraints) {
1449                for_each_event_constraint(c, x86_pmu.event_constraints) {
1450                        if (c->cmask != X86_RAW_EVENT_MASK)
1451                                continue;
1452
1453                        c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1454                        c->weight += x86_pmu.num_counters;
1455                }
1456        }
1457
1458        pr_info("... version:                %d\n",     x86_pmu.version);
1459        pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
1460        pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
1461        pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
1462        pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
1463        pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
1464        pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
1465
1466        perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1467        perf_cpu_notifier(x86_pmu_notifier);
1468
1469        return 0;
1470}
1471early_initcall(init_hw_perf_events);
1472
1473static inline void x86_pmu_read(struct perf_event *event)
1474{
1475        x86_perf_event_update(event);
1476}
1477
1478/*
1479 * Start group events scheduling transaction
 1480 * Set the flag to make pmu::add() not perform the
 1481 * schedulability test; it will be performed at commit time
1482 */
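/*
 * The perf core brackets group scheduling with these hooks: start_txn(),
 * then ->add() for each group member, then commit_txn(); cancel_txn() is
 * called instead when adding a member failed.
 */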
1483static void x86_pmu_start_txn(struct pmu *pmu)
1484{
1485        perf_pmu_disable(pmu);
1486        __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1487        __this_cpu_write(cpu_hw_events.n_txn, 0);
1488}
1489
1490/*
1491 * Stop group events scheduling transaction
 1492 * Clear the flag so that pmu::add() will again perform the
 1493 * schedulability test.
1494 */
1495static void x86_pmu_cancel_txn(struct pmu *pmu)
1496{
1497        __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1498        /*
1499         * Truncate the collected events.
1500         */
1501        __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1502        __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1503        perf_pmu_enable(pmu);
1504}
1505
1506/*
1507 * Commit group events scheduling transaction
1508 * Perform the group schedulability test as a whole
 1509 * Return 0 on success
1510 */
1511static int x86_pmu_commit_txn(struct pmu *pmu)
1512{
1513        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1514        int assign[X86_PMC_IDX_MAX];
1515        int n, ret;
1516
1517        n = cpuc->n_events;
1518
1519        if (!x86_pmu_initialized())
1520                return -EAGAIN;
1521
1522        ret = x86_pmu.schedule_events(cpuc, n, assign);
1523        if (ret)
1524                return ret;
1525
1526        /*
 1527         * copy the new assignment now that we know it is possible;
 1528         * it will be used by x86_pmu_enable()
1529         */
1530        memcpy(cpuc->assign, assign, n*sizeof(int));
1531
1532        cpuc->group_flag &= ~PERF_EVENT_TXN;
1533        perf_pmu_enable(pmu);
1534        return 0;
1535}
1536
1537/*
1538 * validate that we can schedule this event
1539 */
1540static int validate_event(struct perf_event *event)
1541{
1542        struct cpu_hw_events *fake_cpuc;
1543        struct event_constraint *c;
1544        int ret = 0;
1545
1546        fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1547        if (!fake_cpuc)
1548                return -ENOMEM;
1549
1550        c = x86_pmu.get_event_constraints(fake_cpuc, event);
1551
1552        if (!c || !c->weight)
1553                ret = -ENOSPC;
1554
1555        if (x86_pmu.put_event_constraints)
1556                x86_pmu.put_event_constraints(fake_cpuc, event);
1557
1558        kfree(fake_cpuc);
1559
1560        return ret;
1561}
1562
1563/*
1564 * validate a single event group
1565 *
 1566 * validation includes:
 1567 *      - check events are compatible with each other
1568 *      - events do not compete for the same counter
1569 *      - number of events <= number of counters
1570 *
1571 * validation ensures the group can be loaded onto the
1572 * PMU if it was the only group available.
1573 */
1574static int validate_group(struct perf_event *event)
1575{
1576        struct perf_event *leader = event->group_leader;
1577        struct cpu_hw_events *fake_cpuc;
1578        int ret, n;
1579
1580        ret = -ENOMEM;
1581        fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1582        if (!fake_cpuc)
1583                goto out;
1584
1585        /*
 1586         * the event is not yet connected with its
 1587         * siblings; therefore we must first collect
1588         * existing siblings, then add the new event
1589         * before we can simulate the scheduling
1590         */
1591        ret = -ENOSPC;
1592        n = collect_events(fake_cpuc, leader, true);
1593        if (n < 0)
1594                goto out_free;
1595
1596        fake_cpuc->n_events = n;
1597        n = collect_events(fake_cpuc, event, false);
1598        if (n < 0)
1599                goto out_free;
1600
1601        fake_cpuc->n_events = n;
1602
1603        ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1604
1605out_free:
1606        kfree(fake_cpuc);
1607out:
1608        return ret;
1609}
1610
1611int x86_pmu_event_init(struct perf_event *event)
1612{
1613        struct pmu *tmp;
1614        int err;
1615
1616        switch (event->attr.type) {
1617        case PERF_TYPE_RAW:
1618        case PERF_TYPE_HARDWARE:
1619        case PERF_TYPE_HW_CACHE:
1620                break;
1621
1622        default:
1623                return -ENOENT;
1624        }
1625
1626        err = __x86_pmu_event_init(event);
1627        if (!err) {
1628                /*
 1629                 * we temporarily connect the event to its pmu
1630                 * such that validate_group() can classify
1631                 * it as an x86 event using is_x86_event()
1632                 */
1633                tmp = event->pmu;
1634                event->pmu = &pmu;
1635
1636                if (event->group_leader != event)
1637                        err = validate_group(event);
1638                else
1639                        err = validate_event(event);
1640
1641                event->pmu = tmp;
1642        }
1643        if (err) {
1644                if (event->destroy)
1645                        event->destroy(event);
1646        }
1647
1648        return err;
1649}
1650
1651static struct pmu pmu = {
1652        .pmu_enable     = x86_pmu_enable,
1653        .pmu_disable    = x86_pmu_disable,
1654
1655        .event_init     = x86_pmu_event_init,
1656
1657        .add            = x86_pmu_add,
1658        .del            = x86_pmu_del,
1659        .start          = x86_pmu_start,
1660        .stop           = x86_pmu_stop,
1661        .read           = x86_pmu_read,
1662
1663        .start_txn      = x86_pmu_start_txn,
1664        .cancel_txn     = x86_pmu_cancel_txn,
1665        .commit_txn     = x86_pmu_commit_txn,
1666};
1667
1668/*
1669 * callchain support
1670 */
1671
1672static void
1673backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1674{
1675        /* Ignore warnings */
1676}
1677
1678static void backtrace_warning(void *data, char *msg)
1679{
1680        /* Ignore warnings */
1681}
1682
1683static int backtrace_stack(void *data, char *name)
1684{
1685        return 0;
1686}
1687
1688static void backtrace_address(void *data, unsigned long addr, int reliable)
1689{
1690        struct perf_callchain_entry *entry = data;
1691
1692        perf_callchain_store(entry, addr);
1693}
1694
1695static const struct stacktrace_ops backtrace_ops = {
1696        .warning                = backtrace_warning,
1697        .warning_symbol         = backtrace_warning_symbol,
1698        .stack                  = backtrace_stack,
1699        .address                = backtrace_address,
1700        .walk_stack             = print_context_stack_bp,
1701};
1702
1703void
1704perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1705{
1706        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1707                /* TODO: We don't support guest os callchain now */
1708                return;
1709        }
1710
1711        perf_callchain_store(entry, regs->ip);
1712
1713        dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
1714}
1715
1716#ifdef CONFIG_COMPAT
1717static inline int
1718perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1719{
1720        /* 32-bit process in 64-bit kernel. */
1721        struct stack_frame_ia32 frame;
1722        const void __user *fp;
1723
1724        if (!test_thread_flag(TIF_IA32))
1725                return 0;
1726
1727        fp = compat_ptr(regs->bp);
1728        while (entry->nr < PERF_MAX_STACK_DEPTH) {
1729                unsigned long bytes;
1730                frame.next_frame     = 0;
1731                frame.return_address = 0;
1732
1733                bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1734                if (bytes != sizeof(frame))
1735                        break;
1736
1737                if (fp < compat_ptr(regs->sp))
1738                        break;
1739
1740                perf_callchain_store(entry, frame.return_address);
1741                fp = compat_ptr(frame.next_frame);
1742        }
1743        return 1;
1744}
1745#else
1746static inline int
1747perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1748{
 1749        return 0;
1750}
1751#endif
1752
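/*
 * Walk the user stack via the frame-pointer chain: each stack_frame fetched
 * with copy_from_user_nmi() holds the caller's frame pointer and return
 * address.  The walk stops on a failed copy, on a frame below the current
 * stack pointer, or after PERF_MAX_STACK_DEPTH entries.
 */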
1753void
1754perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1755{
1756        struct stack_frame frame;
1757        const void __user *fp;
1758
1759        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1760                /* TODO: We don't support guest os callchain now */
1761                return;
1762        }
1763
1764        fp = (void __user *)regs->bp;
1765
1766        perf_callchain_store(entry, regs->ip);
1767
1768        if (perf_callchain_user32(regs, entry))
1769                return;
1770
1771        while (entry->nr < PERF_MAX_STACK_DEPTH) {
1772                unsigned long bytes;
 1773                frame.next_frame     = NULL;
1774                frame.return_address = 0;
1775
1776                bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1777                if (bytes != sizeof(frame))
1778                        break;
1779
1780                if ((unsigned long)fp < regs->sp)
1781                        break;
1782
1783                perf_callchain_store(entry, frame.return_address);
1784                fp = frame.next_frame;
1785        }
1786}
1787
1788unsigned long perf_instruction_pointer(struct pt_regs *regs)
1789{
1790        unsigned long ip;
1791
1792        if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1793                ip = perf_guest_cbs->get_guest_ip();
1794        else
1795                ip = instruction_pointer(regs);
1796
1797        return ip;
1798}
1799
1800unsigned long perf_misc_flags(struct pt_regs *regs)
1801{
1802        int misc = 0;
1803
1804        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1805                if (perf_guest_cbs->is_user_mode())
1806                        misc |= PERF_RECORD_MISC_GUEST_USER;
1807                else
1808                        misc |= PERF_RECORD_MISC_GUEST_KERNEL;
1809        } else {
1810                if (user_mode(regs))
1811                        misc |= PERF_RECORD_MISC_USER;
1812                else
1813                        misc |= PERF_RECORD_MISC_KERNEL;
1814        }
1815
1816        if (regs->flags & PERF_EFLAGS_EXACT)
1817                misc |= PERF_RECORD_MISC_EXACT_IP;
1818
1819        return misc;
1820}
1821