linux/arch/x86/kernel/cpu/perf_event.c
   1/*
   2 * Performance events x86 architecture code
   3 *
   4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6 *  Copyright (C) 2009 Jaswinder Singh Rajput
   7 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
   8 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   9 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
  10 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
  11 *
  12 *  For licensing details see kernel-base/COPYING
  13 */
  14
  15#include <linux/perf_event.h>
  16#include <linux/capability.h>
  17#include <linux/notifier.h>
  18#include <linux/hardirq.h>
  19#include <linux/kprobes.h>
  20#include <linux/module.h>
  21#include <linux/kdebug.h>
  22#include <linux/sched.h>
  23#include <linux/uaccess.h>
  24#include <linux/slab.h>
  25#include <linux/cpu.h>
  26#include <linux/bitops.h>
  27#include <linux/device.h>
  28
  29#include <asm/apic.h>
  30#include <asm/stacktrace.h>
  31#include <asm/nmi.h>
  32#include <asm/smp.h>
  33#include <asm/alternative.h>
  34#include <asm/timer.h>
  35#include <asm/desc.h>
  36#include <asm/ldt.h>
  37
  38#include "perf_event.h"
  39
  40struct x86_pmu x86_pmu __read_mostly;
  41
  42DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
  43        .enabled = 1,
  44};
  45
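    /*
     * Generic cache event tables, indexed by [type][op][result] and
     * filled in by the vendor specific PMU init code: entries hold raw
     * event encodings, 0 if undefined, or -1 if not supported; the
     * second table holds the matching extra-register config values.
     */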
  46u64 __read_mostly hw_cache_event_ids
  47                                [PERF_COUNT_HW_CACHE_MAX]
  48                                [PERF_COUNT_HW_CACHE_OP_MAX]
  49                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
  50u64 __read_mostly hw_cache_extra_regs
  51                                [PERF_COUNT_HW_CACHE_MAX]
  52                                [PERF_COUNT_HW_CACHE_OP_MAX]
  53                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
  54
  55/*
  56 * Propagate event elapsed time into the generic event.
  57 * Can only be executed on the CPU where the event is active.
  58 * Returns the new raw count.
  59 */
  60u64 x86_perf_event_update(struct perf_event *event)
  61{
  62        struct hw_perf_event *hwc = &event->hw;
  63        int shift = 64 - x86_pmu.cntval_bits;
  64        u64 prev_raw_count, new_raw_count;
  65        int idx = hwc->idx;
  66        s64 delta;
  67
  68        if (idx == INTEL_PMC_IDX_FIXED_BTS)
  69                return 0;
  70
  71        /*
  72         * Careful: an NMI might modify the previous event value.
  73         *
  74         * Our tactic to handle this is to first atomically read and
  75         * exchange a new raw count - then add that new-prev delta
  76         * count to the generic event atomically:
  77         */
  78again:
  79        prev_raw_count = local64_read(&hwc->prev_count);
  80        rdpmcl(hwc->event_base_rdpmc, new_raw_count);
  81
  82        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
  83                                        new_raw_count) != prev_raw_count)
  84                goto again;
  85
  86        /*
  87         * Now we have the new raw value and have updated the prev
  88         * timestamp already. We can now calculate the elapsed delta
  89         * (event-)time and add that to the generic event.
  90         *
  91         * Careful, not all hw sign-extends above the physical width
  92         * of the count.
  93         */
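            /*
             * The shift up/down by (64 - cntval_bits) makes the
             * subtraction happen in counter-width arithmetic, regardless
             * of whether the hardware sign-extended the upper bits.
             */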
  94        delta = (new_raw_count << shift) - (prev_raw_count << shift);
  95        delta >>= shift;
  96
  97        local64_add(delta, &event->count);
  98        local64_sub(delta, &hwc->period_left);
  99
 100        return new_raw_count;
 101}
 102
 103/*
 104 * Find and validate any extra registers to set up.
 105 */
 106static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 107{
 108        struct hw_perf_event_extra *reg;
 109        struct extra_reg *er;
 110
 111        reg = &event->hw.extra_reg;
 112
 113        if (!x86_pmu.extra_regs)
 114                return 0;
 115
 116        for (er = x86_pmu.extra_regs; er->msr; er++) {
 117                if (er->event != (config & er->config_mask))
 118                        continue;
 119                if (event->attr.config1 & ~er->valid_mask)
 120                        return -EINVAL;
 121
 122                reg->idx = er->idx;
 123                reg->config = event->attr.config1;
 124                reg->reg = er->msr;
 125                break;
 126        }
 127        return 0;
 128}
 129
 130static atomic_t active_events;
 131static DEFINE_MUTEX(pmc_reserve_mutex);
 132
 133#ifdef CONFIG_X86_LOCAL_APIC
 134
 135static bool reserve_pmc_hardware(void)
 136{
 137        int i;
 138
 139        for (i = 0; i < x86_pmu.num_counters; i++) {
 140                if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
 141                        goto perfctr_fail;
 142        }
 143
 144        for (i = 0; i < x86_pmu.num_counters; i++) {
 145                if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
 146                        goto eventsel_fail;
 147        }
 148
 149        return true;
 150
 151eventsel_fail:
 152        for (i--; i >= 0; i--)
 153                release_evntsel_nmi(x86_pmu_config_addr(i));
 154
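            /*
             * All perfctr MSRs were reserved above; reset 'i' so the
             * perfctr_fail loop below releases every one of them.
             */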
 155        i = x86_pmu.num_counters;
 156
 157perfctr_fail:
 158        for (i--; i >= 0; i--)
 159                release_perfctr_nmi(x86_pmu_event_addr(i));
 160
 161        return false;
 162}
 163
 164static void release_pmc_hardware(void)
 165{
 166        int i;
 167
 168        for (i = 0; i < x86_pmu.num_counters; i++) {
 169                release_perfctr_nmi(x86_pmu_event_addr(i));
 170                release_evntsel_nmi(x86_pmu_config_addr(i));
 171        }
 172}
 173
 174#else
 175
 176static bool reserve_pmc_hardware(void) { return true; }
 177static void release_pmc_hardware(void) {}
 178
 179#endif
 180
 181static bool check_hw_exists(void)
 182{
 183        u64 val, val_fail, val_new = ~0;
 184        int i, reg, reg_fail, ret = 0;
 185        int bios_fail = 0;
 186
 187        /*
 188         * Check to see if the BIOS enabled any of the counters; if so,
 189         * complain (but keep the PMU usable, see bios_fail below).
 190         */
 191        for (i = 0; i < x86_pmu.num_counters; i++) {
 192                reg = x86_pmu_config_addr(i);
 193                ret = rdmsrl_safe(reg, &val);
 194                if (ret)
 195                        goto msr_fail;
 196                if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
 197                        bios_fail = 1;
 198                        val_fail = val;
 199                        reg_fail = reg;
 200                }
 201        }
 202
 203        if (x86_pmu.num_counters_fixed) {
 204                reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
 205                ret = rdmsrl_safe(reg, &val);
 206                if (ret)
 207                        goto msr_fail;
 208                for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
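                            /*
                             * Each fixed counter owns a 4-bit field in this
                             * MSR; bits 0-1 enable counting in kernel/user mode.
                             */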
 209                        if (val & (0x03 << i*4)) {
 210                                bios_fail = 1;
 211                                val_fail = val;
 212                                reg_fail = reg;
 213                        }
 214                }
 215        }
 216
 217        /*
 218         * Read the current value, change it and read it back to see if it
 219         * matches, this is needed to detect certain hardware emulators
 220         * (qemu/kvm) that don't trap on the MSR access and always return 0s.
 221         */
 222        reg = x86_pmu_event_addr(0);
 223        if (rdmsrl_safe(reg, &val))
 224                goto msr_fail;
 225        val ^= 0xffffUL;
 226        ret = wrmsrl_safe(reg, val);
 227        ret |= rdmsrl_safe(reg, &val_new);
 228        if (ret || val != val_new)
 229                goto msr_fail;
 230
 231        /*
 232         * We still allow the PMU driver to operate:
 233         */
 234        if (bios_fail) {
 235                printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
 236                printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
 237        }
 238
 239        return true;
 240
 241msr_fail:
 242        printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
 243        printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
 244
 245        return false;
 246}
 247
 248static void hw_perf_event_destroy(struct perf_event *event)
 249{
 250        if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
 251                release_pmc_hardware();
 252                release_ds_buffers();
 253                mutex_unlock(&pmc_reserve_mutex);
 254        }
 255}
 256
 257static inline int x86_pmu_initialized(void)
 258{
 259        return x86_pmu.handle_irq != NULL;
 260}
 261
 262static inline int
 263set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 264{
 265        struct perf_event_attr *attr = &event->attr;
 266        unsigned int cache_type, cache_op, cache_result;
 267        u64 config, val;
 268
 269        config = attr->config;
 270
 271        cache_type = (config >>  0) & 0xff;
 272        if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
 273                return -EINVAL;
 274
 275        cache_op = (config >>  8) & 0xff;
 276        if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
 277                return -EINVAL;
 278
 279        cache_result = (config >> 16) & 0xff;
 280        if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
 281                return -EINVAL;
 282
 283        val = hw_cache_event_ids[cache_type][cache_op][cache_result];
 284
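            /* 0: event not in the table, -1: listed but not supported */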
 285        if (val == 0)
 286                return -ENOENT;
 287
 288        if (val == -1)
 289                return -EINVAL;
 290
 291        hwc->config |= val;
 292        attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
 293        return x86_pmu_extra_regs(val, event);
 294}
 295
 296int x86_setup_perfctr(struct perf_event *event)
 297{
 298        struct perf_event_attr *attr = &event->attr;
 299        struct hw_perf_event *hwc = &event->hw;
 300        u64 config;
 301
 302        if (!is_sampling_event(event)) {
 303                hwc->sample_period = x86_pmu.max_period;
 304                hwc->last_period = hwc->sample_period;
 305                local64_set(&hwc->period_left, hwc->sample_period);
 306        } else {
 307                /*
 308                 * If we have a PMU initialized but no APIC
 309                 * interrupts, we cannot sample hardware
 310                 * events (user-space has to fall back and
 311                 * sample via a hrtimer based software event):
 312                 */
 313                if (!x86_pmu.apic)
 314                        return -EOPNOTSUPP;
 315        }
 316
 317        if (attr->type == PERF_TYPE_RAW)
 318                return x86_pmu_extra_regs(event->attr.config, event);
 319
 320        if (attr->type == PERF_TYPE_HW_CACHE)
 321                return set_ext_hw_attr(hwc, event);
 322
 323        if (attr->config >= x86_pmu.max_events)
 324                return -EINVAL;
 325
 326        /*
 327         * The generic map:
 328         */
 329        config = x86_pmu.event_map(attr->config);
 330
 331        if (config == 0)
 332                return -ENOENT;
 333
 334        if (config == -1LL)
 335                return -EINVAL;
 336
 337        /*
 338         * Branch tracing:
 339         */
 340        if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
 341            !attr->freq && hwc->sample_period == 1) {
 342                /* BTS is not supported by this architecture. */
 343                if (!x86_pmu.bts_active)
 344                        return -EOPNOTSUPP;
 345
 346                /* BTS is currently only allowed for user-mode. */
 347                if (!attr->exclude_kernel)
 348                        return -EOPNOTSUPP;
 349        }
 350
 351        hwc->config |= config;
 352
 353        return 0;
 354}
 355
 356/*
 357 * check that branch_sample_type is compatible with
 358 * settings needed for precise_ip > 1 which implies
 359 * using the LBR to capture ALL taken branches at the
 360 * priv levels of the measurement
 361 */
 362static inline int precise_br_compat(struct perf_event *event)
 363{
 364        u64 m = event->attr.branch_sample_type;
 365        u64 b = 0;
 366
 367        /* must capture all branches */
 368        if (!(m & PERF_SAMPLE_BRANCH_ANY))
 369                return 0;
 370
 371        m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
 372
 373        if (!event->attr.exclude_user)
 374                b |= PERF_SAMPLE_BRANCH_USER;
 375
 376        if (!event->attr.exclude_kernel)
 377                b |= PERF_SAMPLE_BRANCH_KERNEL;
 378
 379        /*
 380         * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
 381         */
 382
 383        return m == b;
 384}
 385
 386int x86_pmu_hw_config(struct perf_event *event)
 387{
 388        if (event->attr.precise_ip) {
 389                int precise = 0;
 390
 391                /* Support for constant skid */
 392                if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
 393                        precise++;
 394
 395                        /* Support for IP fixup */
 396                        if (x86_pmu.lbr_nr)
 397                                precise++;
 398                }
 399
 400                if (event->attr.precise_ip > precise)
 401                        return -EOPNOTSUPP;
 402                /*
 403                 * check that PEBS LBR correction does not conflict with
 404                 * whatever the user is asking with attr->branch_sample_type
 405                 */
 406                if (event->attr.precise_ip > 1) {
 407                        u64 *br_type = &event->attr.branch_sample_type;
 408
 409                        if (has_branch_stack(event)) {
 410                                if (!precise_br_compat(event))
 411                                        return -EOPNOTSUPP;
 412
 413                                /* branch_sample_type is compatible */
 414
 415                        } else {
 416                                /*
 417                 * user did not specify branch_sample_type
 418                                 *
 419                                 * For PEBS fixups, we capture all
 420                                 * the branches at the priv level of the
 421                                 * event.
 422                                 */
 423                                *br_type = PERF_SAMPLE_BRANCH_ANY;
 424
 425                                if (!event->attr.exclude_user)
 426                                        *br_type |= PERF_SAMPLE_BRANCH_USER;
 427
 428                                if (!event->attr.exclude_kernel)
 429                                        *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
 430                        }
 431                }
 432        }
 433
 434        /*
 435         * Generate PMC IRQs:
 436         * (keep 'enabled' bit clear for now)
 437         */
 438        event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
 439
 440        /*
 441         * Count user and OS events unless requested not to
 442         */
 443        if (!event->attr.exclude_user)
 444                event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
 445        if (!event->attr.exclude_kernel)
 446                event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 447
 448        if (event->attr.type == PERF_TYPE_RAW)
 449                event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 450
 451        return x86_setup_perfctr(event);
 452}
 453
 454/*
 455 * Setup the hardware configuration for a given attr_type
 456 */
 457static int __x86_pmu_event_init(struct perf_event *event)
 458{
 459        int err;
 460
 461        if (!x86_pmu_initialized())
 462                return -ENODEV;
 463
 464        err = 0;
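            /*
             * The first event to come up reserves the PMC hardware and
             * the DS buffers; later events only bump the refcount.
             */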
 465        if (!atomic_inc_not_zero(&active_events)) {
 466                mutex_lock(&pmc_reserve_mutex);
 467                if (atomic_read(&active_events) == 0) {
 468                        if (!reserve_pmc_hardware())
 469                                err = -EBUSY;
 470                        else
 471                                reserve_ds_buffers();
 472                }
 473                if (!err)
 474                        atomic_inc(&active_events);
 475                mutex_unlock(&pmc_reserve_mutex);
 476        }
 477        if (err)
 478                return err;
 479
 480        event->destroy = hw_perf_event_destroy;
 481
 482        event->hw.idx = -1;
 483        event->hw.last_cpu = -1;
 484        event->hw.last_tag = ~0ULL;
 485
 486        /* mark unused */
 487        event->hw.extra_reg.idx = EXTRA_REG_NONE;
 488        event->hw.branch_reg.idx = EXTRA_REG_NONE;
 489
 490        return x86_pmu.hw_config(event);
 491}
 492
 493void x86_pmu_disable_all(void)
 494{
 495        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 496        int idx;
 497
 498        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 499                u64 val;
 500
 501                if (!test_bit(idx, cpuc->active_mask))
 502                        continue;
 503                rdmsrl(x86_pmu_config_addr(idx), val);
 504                if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
 505                        continue;
 506                val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
 507                wrmsrl(x86_pmu_config_addr(idx), val);
 508        }
 509}
 510
 511static void x86_pmu_disable(struct pmu *pmu)
 512{
 513        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 514
 515        if (!x86_pmu_initialized())
 516                return;
 517
 518        if (!cpuc->enabled)
 519                return;
 520
 521        cpuc->n_added = 0;
 522        cpuc->enabled = 0;
 523        barrier();
 524
 525        x86_pmu.disable_all();
 526}
 527
 528void x86_pmu_enable_all(int added)
 529{
 530        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 531        int idx;
 532
 533        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 534                struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
 535
 536                if (!test_bit(idx, cpuc->active_mask))
 537                        continue;
 538
 539                __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 540        }
 541}
 542
 543static struct pmu pmu;
 544
 545static inline int is_x86_event(struct perf_event *event)
 546{
 547        return event->pmu == &pmu;
 548}
 549
 550/*
 551 * Event scheduler state:
 552 *
 553 * Assign events iterating over all events and counters, beginning
 554 * with the events of lowest weight first. Keep the current iterator
 555 * state in struct sched_state.
 556 */
 557struct sched_state {
 558        int     weight;
 559        int     event;          /* event index */
 560        int     counter;        /* counter index */
 561        int     unassigned;     /* number of events to be assigned left */
 562        unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 563};
 564
 565/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
 566#define SCHED_STATES_MAX        2
 567
 568struct perf_sched {
 569        int                     max_weight;
 570        int                     max_events;
 571        struct event_constraint **constraints;
 572        struct sched_state      state;
 573        int                     saved_states;
 574        struct sched_state      saved[SCHED_STATES_MAX];
 575};
 576
 577/*
 578 * Initialize the iterator that runs through all events and counters.
 579 */
 580static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
 581                            int num, int wmin, int wmax)
 582{
 583        int idx;
 584
 585        memset(sched, 0, sizeof(*sched));
 586        sched->max_events       = num;
 587        sched->max_weight       = wmax;
 588        sched->constraints      = c;
 589
 590        for (idx = 0; idx < num; idx++) {
 591                if (c[idx]->weight == wmin)
 592                        break;
 593        }
 594
 595        sched->state.event      = idx;          /* start with min weight */
 596        sched->state.weight     = wmin;
 597        sched->state.unassigned = num;
 598}
 599
 600static void perf_sched_save_state(struct perf_sched *sched)
 601{
 602        if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
 603                return;
 604
 605        sched->saved[sched->saved_states] = sched->state;
 606        sched->saved_states++;
 607}
 608
 609static bool perf_sched_restore_state(struct perf_sched *sched)
 610{
 611        if (!sched->saved_states)
 612                return false;
 613
 614        sched->saved_states--;
 615        sched->state = sched->saved[sched->saved_states];
 616
 617        /* continue with next counter: */
 618        clear_bit(sched->state.counter++, sched->state.used);
 619
 620        return true;
 621}
 622
 623/*
 624 * Select a counter for the current event to schedule. Return true on
 625 * success.
 626 */
 627static bool __perf_sched_find_counter(struct perf_sched *sched)
 628{
 629        struct event_constraint *c;
 630        int idx;
 631
 632        if (!sched->state.unassigned)
 633                return false;
 634
 635        if (sched->state.event >= sched->max_events)
 636                return false;
 637
 638        c = sched->constraints[sched->state.event];
 639
 640        /* Prefer fixed purpose counters */
 641        if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
 642                idx = INTEL_PMC_IDX_FIXED;
 643                for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
 644                        if (!__test_and_set_bit(idx, sched->state.used))
 645                                goto done;
 646                }
 647        }
 648        /* Grab the first unused counter starting with idx */
 649        idx = sched->state.counter;
 650        for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
 651                if (!__test_and_set_bit(idx, sched->state.used))
 652                        goto done;
 653        }
 654
 655        return false;
 656
 657done:
 658        sched->state.counter = idx;
 659
 660        if (c->overlap)
 661                perf_sched_save_state(sched);
 662
 663        return true;
 664}
 665
 666static bool perf_sched_find_counter(struct perf_sched *sched)
 667{
 668        while (!__perf_sched_find_counter(sched)) {
 669                if (!perf_sched_restore_state(sched))
 670                        return false;
 671        }
 672
 673        return true;
 674}
 675
 676/*
 677 * Go through all unassigned events and find the next one to schedule.
 678 * Take events with the least weight first. Return true on success.
 679 */
 680static bool perf_sched_next_event(struct perf_sched *sched)
 681{
 682        struct event_constraint *c;
 683
 684        if (!sched->state.unassigned || !--sched->state.unassigned)
 685                return false;
 686
 687        do {
 688                /* next event */
 689                sched->state.event++;
 690                if (sched->state.event >= sched->max_events) {
 691                        /* next weight */
 692                        sched->state.event = 0;
 693                        sched->state.weight++;
 694                        if (sched->state.weight > sched->max_weight)
 695                                return false;
 696                }
 697                c = sched->constraints[sched->state.event];
 698        } while (c->weight != sched->state.weight);
 699
 700        sched->state.counter = 0;       /* start with first counter */
 701
 702        return true;
 703}
 704
 705/*
 706 * Assign a counter for each event.
 707 */
 708int perf_assign_events(struct event_constraint **constraints, int n,
 709                        int wmin, int wmax, int *assign)
 710{
 711        struct perf_sched sched;
 712
 713        perf_sched_init(&sched, constraints, n, wmin, wmax);
 714
 715        do {
 716                if (!perf_sched_find_counter(&sched))
 717                        break;  /* failed */
 718                if (assign)
 719                        assign[sched.state.event] = sched.state.counter;
 720        } while (perf_sched_next_event(&sched));
 721
 722        return sched.state.unassigned;
 723}
 724
 725int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 726{
 727        struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
 728        unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 729        int i, wmin, wmax, num = 0;
 730        struct hw_perf_event *hwc;
 731
 732        bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 733
 734        for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 735                c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
 736                constraints[i] = c;
 737                wmin = min(wmin, c->weight);
 738                wmax = max(wmax, c->weight);
 739        }
 740
 741        /*
 742         * fastpath, try to reuse previous register
 743         */
 744        for (i = 0; i < n; i++) {
 745                hwc = &cpuc->event_list[i]->hw;
 746                c = constraints[i];
 747
 748                /* never assigned */
 749                if (hwc->idx == -1)
 750                        break;
 751
 752                /* constraint still honored */
 753                if (!test_bit(hwc->idx, c->idxmsk))
 754                        break;
 755
 756                /* not already used */
 757                if (test_bit(hwc->idx, used_mask))
 758                        break;
 759
 760                __set_bit(hwc->idx, used_mask);
 761                if (assign)
 762                        assign[i] = hwc->idx;
 763        }
 764
 765        /* slow path */
 766        if (i != n)
 767                num = perf_assign_events(constraints, n, wmin, wmax, assign);
 768
 769        /*
 770         * scheduling failed or is just a simulation,
 771         * free resources if necessary
 772         */
 773        if (!assign || num) {
 774                for (i = 0; i < n; i++) {
 775                        if (x86_pmu.put_event_constraints)
 776                                x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
 777                }
 778        }
 779        return num ? -EINVAL : 0;
 780}
 781
 782/*
 783 * dogrp: true if we must also collect the sibling events (group)
 784 * returns the total number of events, or a negative error code
 785 */
 786static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
 787{
 788        struct perf_event *event;
 789        int n, max_count;
 790
 791        max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
 792
 793        /* current number of events already accepted */
 794        n = cpuc->n_events;
 795
 796        if (is_x86_event(leader)) {
 797                if (n >= max_count)
 798                        return -EINVAL;
 799                cpuc->event_list[n] = leader;
 800                n++;
 801        }
 802        if (!dogrp)
 803                return n;
 804
 805        list_for_each_entry(event, &leader->sibling_list, group_entry) {
 806                if (!is_x86_event(event) ||
 807                    event->state <= PERF_EVENT_STATE_OFF)
 808                        continue;
 809
 810                if (n >= max_count)
 811                        return -EINVAL;
 812
 813                cpuc->event_list[n] = event;
 814                n++;
 815        }
 816        return n;
 817}
 818
 819static inline void x86_assign_hw_event(struct perf_event *event,
 820                                struct cpu_hw_events *cpuc, int i)
 821{
 822        struct hw_perf_event *hwc = &event->hw;
 823
 824        hwc->idx = cpuc->assign[i];
 825        hwc->last_cpu = smp_processor_id();
 826        hwc->last_tag = ++cpuc->tags[i];
 827
 828        if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
 829                hwc->config_base = 0;
 830                hwc->event_base = 0;
 831        } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
 832                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
 833                hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
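                    /* RDPMC addresses fixed counters via bit 30 of the index */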
 834                hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
 835        } else {
 836                hwc->config_base = x86_pmu_config_addr(hwc->idx);
 837                hwc->event_base  = x86_pmu_event_addr(hwc->idx);
 838                hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
 839        }
 840}
 841
 842static inline int match_prev_assignment(struct hw_perf_event *hwc,
 843                                        struct cpu_hw_events *cpuc,
 844                                        int i)
 845{
 846        return hwc->idx == cpuc->assign[i] &&
 847                hwc->last_cpu == smp_processor_id() &&
 848                hwc->last_tag == cpuc->tags[i];
 849}
 850
 851static void x86_pmu_start(struct perf_event *event, int flags);
 852
 853static void x86_pmu_enable(struct pmu *pmu)
 854{
 855        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 856        struct perf_event *event;
 857        struct hw_perf_event *hwc;
 858        int i, added = cpuc->n_added;
 859
 860        if (!x86_pmu_initialized())
 861                return;
 862
 863        if (cpuc->enabled)
 864                return;
 865
 866        if (cpuc->n_added) {
 867                int n_running = cpuc->n_events - cpuc->n_added;
 868                /*
 869                 * apply assignment obtained either from
 870                 * hw_perf_group_sched_in() or x86_pmu_enable()
 871                 *
 872                 * step1: save events moving to new counters
 873                 * step2: reprogram moved events into new counters
 874                 */
 875                for (i = 0; i < n_running; i++) {
 876                        event = cpuc->event_list[i];
 877                        hwc = &event->hw;
 878
 879                        /*
 880                         * we can avoid reprogramming counter if:
 881                         * - assigned same counter as last time
 882                         * - running on same CPU as last time
 883                         * - no other event has used the counter since
 884                         */
 885                        if (hwc->idx == -1 ||
 886                            match_prev_assignment(hwc, cpuc, i))
 887                                continue;
 888
 889                        /*
 890                         * Ensure we don't accidentally enable a stopped
 891                         * counter simply because we rescheduled.
 892                         */
 893                        if (hwc->state & PERF_HES_STOPPED)
 894                                hwc->state |= PERF_HES_ARCH;
 895
 896                        x86_pmu_stop(event, PERF_EF_UPDATE);
 897                }
 898
 899                for (i = 0; i < cpuc->n_events; i++) {
 900                        event = cpuc->event_list[i];
 901                        hwc = &event->hw;
 902
 903                        if (!match_prev_assignment(hwc, cpuc, i))
 904                                x86_assign_hw_event(event, cpuc, i);
 905                        else if (i < n_running)
 906                                continue;
 907
 908                        if (hwc->state & PERF_HES_ARCH)
 909                                continue;
 910
 911                        x86_pmu_start(event, PERF_EF_RELOAD);
 912                }
 913                cpuc->n_added = 0;
 914                perf_events_lapic_init();
 915        }
 916
 917        cpuc->enabled = 1;
 918        barrier();
 919
 920        x86_pmu.enable_all(added);
 921}
 922
 923static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
 924
 925/*
 926 * Set the next IRQ period, based on the hwc->period_left value.
 927 * To be called with the event disabled in hw:
 928 */
 929int x86_perf_event_set_period(struct perf_event *event)
 930{
 931        struct hw_perf_event *hwc = &event->hw;
 932        s64 left = local64_read(&hwc->period_left);
 933        s64 period = hwc->sample_period;
 934        int ret = 0, idx = hwc->idx;
 935
 936        if (idx == INTEL_PMC_IDX_FIXED_BTS)
 937                return 0;
 938
 939        /*
 940         * If we are way outside a reasonable range then just skip forward:
 941         */
 942        if (unlikely(left <= -period)) {
 943                left = period;
 944                local64_set(&hwc->period_left, left);
 945                hwc->last_period = period;
 946                ret = 1;
 947        }
 948
 949        if (unlikely(left <= 0)) {
 950                left += period;
 951                local64_set(&hwc->period_left, left);
 952                hwc->last_period = period;
 953                ret = 1;
 954        }
 955        /*
 956         * Quirk: certain CPUs don't like it if just 1 hw_event is left:
 957         */
 958        if (unlikely(left < 2))
 959                left = 2;
 960
 961        if (left > x86_pmu.max_period)
 962                left = x86_pmu.max_period;
 963
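            /* remembered only for perf_event_print_debug() */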
 964        per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 965
 966        /*
 967         * The hw event starts counting from this event offset,
 968         * mark it to be able to extract future deltas:
 969         */
 970        local64_set(&hwc->prev_count, (u64)-left);
 971
 972        wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
 973
 974        /*
 975         * Due to an erratum on certain CPUs we need
 976         * a second write to be sure the register
 977         * is updated properly
 978         */
 979        if (x86_pmu.perfctr_second_write) {
 980                wrmsrl(hwc->event_base,
 981                        (u64)(-left) & x86_pmu.cntval_mask);
 982        }
 983
 984        perf_event_update_userpage(event);
 985
 986        return ret;
 987}
 988
 989void x86_pmu_enable_event(struct perf_event *event)
 990{
 991        if (__this_cpu_read(cpu_hw_events.enabled))
 992                __x86_pmu_enable_event(&event->hw,
 993                                       ARCH_PERFMON_EVENTSEL_ENABLE);
 994}
 995
 996/*
 997 * Add a single event to the PMU.
 998 *
 999 * The event is added to the group of enabled events
 999 * but only if it can be scheduled with existing events.
1001 */
1002static int x86_pmu_add(struct perf_event *event, int flags)
1003{
1004        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1005        struct hw_perf_event *hwc;
1006        int assign[X86_PMC_IDX_MAX];
1007        int n, n0, ret;
1008
1009        hwc = &event->hw;
1010
1011        perf_pmu_disable(event->pmu);
1012        n0 = cpuc->n_events;
1013        ret = n = collect_events(cpuc, event, false);
1014        if (ret < 0)
1015                goto out;
1016
1017        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1018        if (!(flags & PERF_EF_START))
1019                hwc->state |= PERF_HES_ARCH;
1020
1021        /*
1022         * If group events scheduling transaction was started,
1023         * skip the schedulability test here, it will be performed
1024         * at commit time (->commit_txn) as a whole
1025         */
1026        if (cpuc->group_flag & PERF_EVENT_TXN)
1027                goto done_collect;
1028
1029        ret = x86_pmu.schedule_events(cpuc, n, assign);
1030        if (ret)
1031                goto out;
1032        /*
1033         * copy the new assignment now that we know it is possible;
1034         * it will be used by hw_perf_enable()
1035         */
1036        memcpy(cpuc->assign, assign, n*sizeof(int));
1037
1038done_collect:
1039        cpuc->n_events = n;
1040        cpuc->n_added += n - n0;
1041        cpuc->n_txn += n - n0;
1042
1043        ret = 0;
1044out:
1045        perf_pmu_enable(event->pmu);
1046        return ret;
1047}
1048
1049static void x86_pmu_start(struct perf_event *event, int flags)
1050{
1051        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1052        int idx = event->hw.idx;
1053
1054        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1055                return;
1056
1057        if (WARN_ON_ONCE(idx == -1))
1058                return;
1059
1060        if (flags & PERF_EF_RELOAD) {
1061                WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1062                x86_perf_event_set_period(event);
1063        }
1064
1065        event->hw.state = 0;
1066
1067        cpuc->events[idx] = event;
1068        __set_bit(idx, cpuc->active_mask);
1069        __set_bit(idx, cpuc->running);
1070        x86_pmu.enable(event);
1071        perf_event_update_userpage(event);
1072}
1073
1074void perf_event_print_debug(void)
1075{
1076        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1077        u64 pebs;
1078        struct cpu_hw_events *cpuc;
1079        unsigned long flags;
1080        int cpu, idx;
1081
1082        if (!x86_pmu.num_counters)
1083                return;
1084
1085        local_irq_save(flags);
1086
1087        cpu = smp_processor_id();
1088        cpuc = &per_cpu(cpu_hw_events, cpu);
1089
1090        if (x86_pmu.version >= 2) {
1091                rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1092                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1093                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1094                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1095                rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1096
1097                pr_info("\n");
1098                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1099                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1100                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1101                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1102                pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1103        }
1104        pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1105
1106        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1107                rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1108                rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1109
1110                prev_left = per_cpu(pmc_prev_left[idx], cpu);
1111
1112                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1113                        cpu, idx, pmc_ctrl);
1114                pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1115                        cpu, idx, pmc_count);
1116                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1117                        cpu, idx, prev_left);
1118        }
1119        for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1120                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1121
1122                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1123                        cpu, idx, pmc_count);
1124        }
1125        local_irq_restore(flags);
1126}
1127
1128void x86_pmu_stop(struct perf_event *event, int flags)
1129{
1130        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1131        struct hw_perf_event *hwc = &event->hw;
1132
1133        if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1134                x86_pmu.disable(event);
1135                cpuc->events[hwc->idx] = NULL;
1136                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1137                hwc->state |= PERF_HES_STOPPED;
1138        }
1139
1140        if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1141                /*
1142                 * Drain the remaining delta count out of an event
1143                 * that we are disabling:
1144                 */
1145                x86_perf_event_update(event);
1146                hwc->state |= PERF_HES_UPTODATE;
1147        }
1148}
1149
1150static void x86_pmu_del(struct perf_event *event, int flags)
1151{
1152        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1153        int i;
1154
1155        /*
1156         * If we're called during a txn, we don't need to do anything.
1157         * The events never got scheduled and ->cancel_txn will truncate
1158         * the event_list.
1159         */
1160        if (cpuc->group_flag & PERF_EVENT_TXN)
1161                return;
1162
1163        x86_pmu_stop(event, PERF_EF_UPDATE);
1164
1165        for (i = 0; i < cpuc->n_events; i++) {
1166                if (event == cpuc->event_list[i]) {
1167
1168                        if (x86_pmu.put_event_constraints)
1169                                x86_pmu.put_event_constraints(cpuc, event);
1170
1171                        while (++i < cpuc->n_events)
1172                                cpuc->event_list[i-1] = cpuc->event_list[i];
1173
1174                        --cpuc->n_events;
1175                        break;
1176                }
1177        }
1178        perf_event_update_userpage(event);
1179}
1180
1181int x86_pmu_handle_irq(struct pt_regs *regs)
1182{
1183        struct perf_sample_data data;
1184        struct cpu_hw_events *cpuc;
1185        struct perf_event *event;
1186        int idx, handled = 0;
1187        u64 val;
1188
1189        cpuc = &__get_cpu_var(cpu_hw_events);
1190
1191        /*
1192         * Some chipsets need to unmask the LVTPC in a particular spot
1193         * inside the nmi handler.  As a result, the unmasking was pushed
1194         * into all the nmi handlers.
1195         *
1196         * This generic handler doesn't seem to have any issues where the
1197         * unmasking occurs so it was left at the top.
1198         */
1199        apic_write(APIC_LVTPC, APIC_DM_NMI);
1200
1201        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1202                if (!test_bit(idx, cpuc->active_mask)) {
1203                        /*
1204                         * Though we deactivated the counter, some CPUs
1205                         * might still deliver spurious interrupts that
1206                         * were already in flight. Catch them:
1207                         */
1208                        if (__test_and_clear_bit(idx, cpuc->running))
1209                                handled++;
1210                        continue;
1211                }
1212
1213                event = cpuc->events[idx];
1214
1215                val = x86_perf_event_update(event);
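                    /*
                     * Counters are programmed with -left; as long as the
                     * top bit is still set the counter has not overflowed.
                     */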
1216                if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1217                        continue;
1218
1219                /*
1220                 * event overflow
1221                 */
1222                handled++;
1223                perf_sample_data_init(&data, 0, event->hw.last_period);
1224
1225                if (!x86_perf_event_set_period(event))
1226                        continue;
1227
1228                if (perf_event_overflow(event, &data, regs))
1229                        x86_pmu_stop(event, 0);
1230        }
1231
1232        if (handled)
1233                inc_irq_stat(apic_perf_irqs);
1234
1235        return handled;
1236}
1237
1238void perf_events_lapic_init(void)
1239{
1240        if (!x86_pmu.apic || !x86_pmu_initialized())
1241                return;
1242
1243        /*
1244         * Always use NMI for PMU
1245         */
1246        apic_write(APIC_LVTPC, APIC_DM_NMI);
1247}
1248
1249static int __kprobes
1250perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1251{
1252        if (!atomic_read(&active_events))
1253                return NMI_DONE;
1254
1255        return x86_pmu.handle_irq(regs);
1256}
1257
1258struct event_constraint emptyconstraint;
1259struct event_constraint unconstrained;
1260
1261static int __cpuinit
1262x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1263{
1264        unsigned int cpu = (long)hcpu;
1265        struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1266        int ret = NOTIFY_OK;
1267
1268        switch (action & ~CPU_TASKS_FROZEN) {
1269        case CPU_UP_PREPARE:
1270                cpuc->kfree_on_online = NULL;
1271                if (x86_pmu.cpu_prepare)
1272                        ret = x86_pmu.cpu_prepare(cpu);
1273                break;
1274
1275        case CPU_STARTING:
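                    /* CR4.PCE allows user-space RDPMC on this CPU */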
1276                if (x86_pmu.attr_rdpmc)
1277                        set_in_cr4(X86_CR4_PCE);
1278                if (x86_pmu.cpu_starting)
1279                        x86_pmu.cpu_starting(cpu);
1280                break;
1281
1282        case CPU_ONLINE:
1283                kfree(cpuc->kfree_on_online);
1284                break;
1285
1286        case CPU_DYING:
1287                if (x86_pmu.cpu_dying)
1288                        x86_pmu.cpu_dying(cpu);
1289                break;
1290
1291        case CPU_UP_CANCELED:
1292        case CPU_DEAD:
1293                if (x86_pmu.cpu_dead)
1294                        x86_pmu.cpu_dead(cpu);
1295                break;
1296
1297        default:
1298                break;
1299        }
1300
1301        return ret;
1302}
1303
1304static void __init pmu_check_apic(void)
1305{
1306        if (cpu_has_apic)
1307                return;
1308
1309        x86_pmu.apic = 0;
1310        pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1311        pr_info("no hardware sampling interrupt available.\n");
1312}
1313
1314static struct attribute_group x86_pmu_format_group = {
1315        .name = "format",
1316        .attrs = NULL,
1317};
1318
1319/*
1320 * Remove all undefined events (x86_pmu.event_map(id) == 0)
1321 * from the events_attr attributes.
1322 */
1323static void __init filter_events(struct attribute **attrs)
1324{
1325        struct device_attribute *d;
1326        struct perf_pmu_events_attr *pmu_attr;
1327        int i, j;
1328
1329        for (i = 0; attrs[i]; i++) {
1330                d = (struct device_attribute *)attrs[i];
1331                pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
1332                /* str trumps id */
1333                if (pmu_attr->event_str)
1334                        continue;
1335                if (x86_pmu.event_map(i))
1336                        continue;
1337
1338                for (j = i; attrs[j]; j++)
1339                        attrs[j] = attrs[j + 1];
1340
1341                /* Check the shifted attr. */
1342                i--;
1343        }
1344}
1345
1346/* Merge two pointer arrays */
1347static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
1348{
1349        struct attribute **new;
1350        int j, i;
1351
1352        for (j = 0; a[j]; j++)
1353                ;
1354        for (i = 0; b[i]; i++)
1355                j++;
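            /* one extra slot for the terminating NULL */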
1356        j++;
1357
1358        new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
1359        if (!new)
1360                return NULL;
1361
1362        j = 0;
1363        for (i = 0; a[i]; i++)
1364                new[j++] = a[i];
1365        for (i = 0; b[i]; i++)
1366                new[j++] = b[i];
1367        new[j] = NULL;
1368
1369        return new;
1370}
1371
1372ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
1373                          char *page)
1374{
1375        struct perf_pmu_events_attr *pmu_attr =
1376                container_of(attr, struct perf_pmu_events_attr, attr);
1377        u64 config = x86_pmu.event_map(pmu_attr->id);
1378
1379        /* string trumps id */
1380        if (pmu_attr->event_str)
1381                return sprintf(page, "%s", pmu_attr->event_str);
1382
1383        return x86_pmu.events_sysfs_show(page, config);
1384}
1385
1386EVENT_ATTR(cpu-cycles,                  CPU_CYCLES              );
1387EVENT_ATTR(instructions,                INSTRUCTIONS            );
1388EVENT_ATTR(cache-references,            CACHE_REFERENCES        );
1389EVENT_ATTR(cache-misses,                CACHE_MISSES            );
1390EVENT_ATTR(branch-instructions,         BRANCH_INSTRUCTIONS     );
1391EVENT_ATTR(branch-misses,               BRANCH_MISSES           );
1392EVENT_ATTR(bus-cycles,                  BUS_CYCLES              );
1393EVENT_ATTR(stalled-cycles-frontend,     STALLED_CYCLES_FRONTEND );
1394EVENT_ATTR(stalled-cycles-backend,      STALLED_CYCLES_BACKEND  );
1395EVENT_ATTR(ref-cycles,                  REF_CPU_CYCLES          );
1396
1397static struct attribute *empty_attrs;
1398
1399static struct attribute *events_attr[] = {
1400        EVENT_PTR(CPU_CYCLES),
1401        EVENT_PTR(INSTRUCTIONS),
1402        EVENT_PTR(CACHE_REFERENCES),
1403        EVENT_PTR(CACHE_MISSES),
1404        EVENT_PTR(BRANCH_INSTRUCTIONS),
1405        EVENT_PTR(BRANCH_MISSES),
1406        EVENT_PTR(BUS_CYCLES),
1407        EVENT_PTR(STALLED_CYCLES_FRONTEND),
1408        EVENT_PTR(STALLED_CYCLES_BACKEND),
1409        EVENT_PTR(REF_CPU_CYCLES),
1410        NULL,
1411};
1412
1413static struct attribute_group x86_pmu_events_group = {
1414        .name = "events",
1415        .attrs = events_attr,
1416};
1417
1418ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1419{
1420        u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1421        u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1422        bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1423        bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1424        bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
1425        bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
1426        ssize_t ret;
1427
1428        /*
1429         * We have a whole page to spend and only a little data
1430         * to write, so we can safely use sprintf.
1431        */
1432        ret = sprintf(page, "event=0x%02llx", event);
1433
1434        if (umask)
1435                ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1436
1437        if (edge)
1438                ret += sprintf(page + ret, ",edge");
1439
1440        if (pc)
1441                ret += sprintf(page + ret, ",pc");
1442
1443        if (any)
1444                ret += sprintf(page + ret, ",any");
1445
1446        if (inv)
1447                ret += sprintf(page + ret, ",inv");
1448
1449        if (cmask)
1450                ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
1451
1452        ret += sprintf(page + ret, "\n");
1453
1454        return ret;
1455}
1456
1457static int __init init_hw_perf_events(void)
1458{
1459        struct x86_pmu_quirk *quirk;
1460        int err;
1461
1462        pr_info("Performance Events: ");
1463
1464        switch (boot_cpu_data.x86_vendor) {
1465        case X86_VENDOR_INTEL:
1466                err = intel_pmu_init();
1467                break;
1468        case X86_VENDOR_AMD:
1469                err = amd_pmu_init();
1470                break;
1471        default:
1472                return 0;
1473        }
1474        if (err != 0) {
1475                pr_cont("no PMU driver, software events only.\n");
1476                return 0;
1477        }
1478
1479        pmu_check_apic();
1480
1481        /* sanity check that the hardware exists or is emulated */
1482        if (!check_hw_exists())
1483                return 0;
1484
1485        pr_cont("%s PMU driver.\n", x86_pmu.name);
1486
1487        for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1488                quirk->func();
1489
1490        if (!x86_pmu.intel_ctrl)
1491                x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1492
1493        perf_events_lapic_init();
1494        register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
1495
1496        unconstrained = (struct event_constraint)
1497                __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1498                                   0, x86_pmu.num_counters, 0, 0);
1499
1500        x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1501        x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1502
1503        if (x86_pmu.event_attrs)
1504                x86_pmu_events_group.attrs = x86_pmu.event_attrs;
1505
1506        if (!x86_pmu.events_sysfs_show)
1507                x86_pmu_events_group.attrs = &empty_attrs;
1508        else
1509                filter_events(x86_pmu_events_group.attrs);
1510
1511        if (x86_pmu.cpu_events) {
1512                struct attribute **tmp;
1513
1514                tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
1515                if (!WARN_ON(!tmp))
1516                        x86_pmu_events_group.attrs = tmp;
1517        }
1518
1519        pr_info("... version:                %d\n",     x86_pmu.version);
1520        pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
1521        pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
1522        pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
1523        pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
1524        pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
1525        pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
1526
1527        perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1528        perf_cpu_notifier(x86_pmu_notifier);
1529
1530        return 0;
1531}
1532early_initcall(init_hw_perf_events);
1533
1534static inline void x86_pmu_read(struct perf_event *event)
1535{
1536        x86_perf_event_update(event);
1537}
1538
1539/*
1540 * Start group events scheduling transaction
1541 * Set the flag to make pmu::enable() not perform the
1542 * schedulability test, it will be performed at commit time
1543 */
1544static void x86_pmu_start_txn(struct pmu *pmu)
1545{
1546        perf_pmu_disable(pmu);
1547        __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1548        __this_cpu_write(cpu_hw_events.n_txn, 0);
1549}
1550
1551/*
1552 * Stop group events scheduling transaction
1553 * Clear the flag and pmu::enable() will perform the
1554 * schedulability test.
1555 */
1556static void x86_pmu_cancel_txn(struct pmu *pmu)
1557{
1558        __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1559        /*
1560         * Truncate the collected events.
1561         */
1562        __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1563        __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1564        perf_pmu_enable(pmu);
1565}
1566
1567/*
1568 * Commit group events scheduling transaction
1569 * Perform the group schedulability test as a whole
1570 * Return 0 if success
1571 */
1572static int x86_pmu_commit_txn(struct pmu *pmu)
1573{
1574        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1575        int assign[X86_PMC_IDX_MAX];
1576        int n, ret;
1577
1578        n = cpuc->n_events;
1579
1580        if (!x86_pmu_initialized())
1581                return -EAGAIN;
1582
1583        ret = x86_pmu.schedule_events(cpuc, n, assign);
1584        if (ret)
1585                return ret;
1586
1587        /*
1588         * copy the new assignment now that we know it is possible;
1589         * it will be used by hw_perf_enable()
1590         */
1591        memcpy(cpuc->assign, assign, n*sizeof(int));
1592
1593        cpuc->group_flag &= ~PERF_EVENT_TXN;
1594        perf_pmu_enable(pmu);
1595        return 0;
1596}
1597/*
1598 * a fake_cpuc is used to validate event groups. Due to
1599 * the extra reg logic, we need to also allocate a fake
1600 * per_core and per_cpu structure. Otherwise, group events
1601 * using extra reg may conflict without the kernel being
1602 * able to catch this when the last event gets added to
1603 * the group.
1604 */
1605static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1606{
1607        kfree(cpuc->shared_regs);
1608        kfree(cpuc);
1609}
1610
1611static struct cpu_hw_events *allocate_fake_cpuc(void)
1612{
1613        struct cpu_hw_events *cpuc;
1614        int cpu = raw_smp_processor_id();
1615
1616        cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1617        if (!cpuc)
1618                return ERR_PTR(-ENOMEM);
1619
1620        /* only needed if we have extra_regs */
1621        if (x86_pmu.extra_regs) {
1622                cpuc->shared_regs = allocate_shared_regs(cpu);
1623                if (!cpuc->shared_regs)
1624                        goto error;
1625        }
1626        cpuc->is_fake = 1;
1627        return cpuc;
1628error:
1629        free_fake_cpuc(cpuc);
1630        return ERR_PTR(-ENOMEM);
1631}
1632
1633/*
1634 * validate that we can schedule this event
1635 */
1636static int validate_event(struct perf_event *event)
1637{
1638        struct cpu_hw_events *fake_cpuc;
1639        struct event_constraint *c;
1640        int ret = 0;
1641
1642        fake_cpuc = allocate_fake_cpuc();
1643        if (IS_ERR(fake_cpuc))
1644                return PTR_ERR(fake_cpuc);
1645
1646        c = x86_pmu.get_event_constraints(fake_cpuc, event);
1647
1648        if (!c || !c->weight)
1649                ret = -EINVAL;
1650
1651        if (x86_pmu.put_event_constraints)
1652                x86_pmu.put_event_constraints(fake_cpuc, event);
1653
1654        free_fake_cpuc(fake_cpuc);
1655
1656        return ret;
1657}
1658
1659/*
1660 * validate a single event group
1661 *
1662 * validation includes:
1663 *      - check events are compatible with each other
1664 *      - events do not compete for the same counter
1665 *      - number of events <= number of counters
1666 *
1667 * validation ensures the group can be loaded onto the
1668 * PMU if it was the only group available.
1669 */
1670static int validate_group(struct perf_event *event)
1671{
1672        struct perf_event *leader = event->group_leader;
1673        struct cpu_hw_events *fake_cpuc;
1674        int ret = -EINVAL, n;
1675
1676        fake_cpuc = allocate_fake_cpuc();
1677        if (IS_ERR(fake_cpuc))
1678                return PTR_ERR(fake_cpuc);
1679        /*
1680         * the event is not yet connected with its
1681         * siblings, therefore we must first collect
1682         * existing siblings, then add the new event
1683         * before we can simulate the scheduling
1684         */
1685        n = collect_events(fake_cpuc, leader, true);
1686        if (n < 0)
1687                goto out;
1688
1689        fake_cpuc->n_events = n;
1690        n = collect_events(fake_cpuc, event, false);
1691        if (n < 0)
1692                goto out;
1693
1694        fake_cpuc->n_events = n;
1695
1696        ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1697
1698out:
1699        free_fake_cpuc(fake_cpuc);
1700        return ret;
1701}
1702
1703static int x86_pmu_event_init(struct perf_event *event)
1704{
1705        struct pmu *tmp;
1706        int err;
1707
1708        switch (event->attr.type) {
1709        case PERF_TYPE_RAW:
1710        case PERF_TYPE_HARDWARE:
1711        case PERF_TYPE_HW_CACHE:
1712                break;
1713
1714        default:
1715                return -ENOENT;
1716        }
1717
1718        err = __x86_pmu_event_init(event);
1719        if (!err) {
1720                /*
1721                 * we temporarily connect the event to its pmu
1722                 * such that validate_group() can classify
1723                 * it as an x86 event using is_x86_event()
1724                 */
1725                tmp = event->pmu;
1726                event->pmu = &pmu;
1727
1728                if (event->group_leader != event)
1729                        err = validate_group(event);
1730                else
1731                        err = validate_event(event);
1732
1733                event->pmu = tmp;
1734        }
1735        if (err) {
1736                if (event->destroy)
1737                        event->destroy(event);
1738        }
1739
1740        return err;
1741}
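
/*
 * Illustrative user-space sketch (not part of this file; open_cycles_sketch()
 * is a made-up helper, intended to be built as a separate user program):
 * opening a hardware event that the core will hand to x86_pmu_event_init()
 * through the PERF_TYPE_HARDWARE case above.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_cycles_sketch(void)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size   = sizeof(attr);
        attr.type   = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;

        /* current task, any CPU, no group leader, no flags */
        return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
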
1742
1743static int x86_pmu_event_idx(struct perf_event *event)
1744{
1745        int idx = event->hw.idx;
1746
1747        if (!x86_pmu.attr_rdpmc)
1748                return 0;
1749
1750        if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
1751                idx -= INTEL_PMC_IDX_FIXED;
1752                idx |= 1 << 30;
1753        }
1754
1755        return idx + 1;
1756}
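
/*
 * Illustrative user-space sketch (not part of this file; the helper names
 * are made up): the value returned above is published to user space as
 * perf_event_mmap_page::index (<linux/perf_event.h>).  An index of 0 means
 * the event cannot currently be read with RDPMC; otherwise RDPMC expects
 * index - 1, with bit 30 already set for fixed-function counters.  The
 * full protocol also rechecks the ->lock seqcount and adds ->offset;
 * both are omitted here for brevity.
 */
static inline unsigned long long rdpmc_sketch(unsigned int counter)
{
        unsigned int low, high;

        asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
        return low | ((unsigned long long)high << 32);
}

static unsigned long long read_self_monitored_sketch(struct perf_event_mmap_page *pc)
{
        unsigned int idx = pc->index;

        if (!idx)               /* RDPMC not usable for this event */
                return 0;

        /* mask to the counter width published by arch_perf_update_userpage() */
        return rdpmc_sketch(idx - 1) & ((1ULL << pc->pmc_width) - 1);
}
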
1757
1758static ssize_t get_attr_rdpmc(struct device *cdev,
1759                              struct device_attribute *attr,
1760                              char *buf)
1761{
1762        return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
1763}
1764
1765static void change_rdpmc(void *info)
1766{
1767        bool enable = !!(unsigned long)info;
1768
1769        if (enable)
1770                set_in_cr4(X86_CR4_PCE);
1771        else
1772                clear_in_cr4(X86_CR4_PCE);
1773}
1774
1775static ssize_t set_attr_rdpmc(struct device *cdev,
1776                              struct device_attribute *attr,
1777                              const char *buf, size_t count)
1778{
1779        unsigned long val;
1780        ssize_t ret;
1781
1782        ret = kstrtoul(buf, 0, &val);
1783        if (ret)
1784                return ret;
1785
1786        if (!!val != !!x86_pmu.attr_rdpmc) {
1787                x86_pmu.attr_rdpmc = !!val;
1788                smp_call_function(change_rdpmc, (void *)val, 1);
1789        }
1790
1791        return count;
1792}
1793
1794static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
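
/*
 * With x86_pmu_attr_groups (below) attached to the "cpu" PMU device, this
 * attribute is expected to appear as /sys/bus/event_source/devices/cpu/rdpmc:
 * writing 0 clears CR4.PCE on every online CPU so user-space RDPMC faults,
 * writing a non-zero value sets it again.
 */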
1795
1796static struct attribute *x86_pmu_attrs[] = {
1797        &dev_attr_rdpmc.attr,
1798        NULL,
1799};
1800
1801static struct attribute_group x86_pmu_attr_group = {
1802        .attrs = x86_pmu_attrs,
1803};
1804
1805static const struct attribute_group *x86_pmu_attr_groups[] = {
1806        &x86_pmu_attr_group,
1807        &x86_pmu_format_group,
1808        &x86_pmu_events_group,
1809        NULL,
1810};
1811
1812static void x86_pmu_flush_branch_stack(void)
1813{
1814        if (x86_pmu.flush_branch_stack)
1815                x86_pmu.flush_branch_stack();
1816}
1817
1818void perf_check_microcode(void)
1819{
1820        if (x86_pmu.check_microcode)
1821                x86_pmu.check_microcode();
1822}
1823EXPORT_SYMBOL_GPL(perf_check_microcode);
1824
1825static struct pmu pmu = {
1826        .pmu_enable             = x86_pmu_enable,
1827        .pmu_disable            = x86_pmu_disable,
1828
1829        .attr_groups            = x86_pmu_attr_groups,
1830
1831        .event_init             = x86_pmu_event_init,
1832
1833        .add                    = x86_pmu_add,
1834        .del                    = x86_pmu_del,
1835        .start                  = x86_pmu_start,
1836        .stop                   = x86_pmu_stop,
1837        .read                   = x86_pmu_read,
1838
1839        .start_txn              = x86_pmu_start_txn,
1840        .cancel_txn             = x86_pmu_cancel_txn,
1841        .commit_txn             = x86_pmu_commit_txn,
1842
1843        .event_idx              = x86_pmu_event_idx,
1844        .flush_branch_stack     = x86_pmu_flush_branch_stack,
1845};
1846
1847void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1848{
1849        userpg->cap_usr_time = 0;
1850        userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
1851        userpg->pmc_width = x86_pmu.cntval_bits;
1852
1853        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
1854                return;
1855
1856        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
1857                return;
1858
1859        userpg->cap_usr_time = 1;
1860        userpg->time_mult = this_cpu_read(cyc2ns);
1861        userpg->time_shift = CYC2NS_SCALE_FACTOR;
1862        userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
1863}
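
/*
 * Illustrative user-space sketch (not part of this file; cyc_to_ns_sketch()
 * is a made-up helper): converting a raw TSC value to nanoseconds with the
 * fields filled in above.  Only meaningful when cap_usr_time is set; the
 * ->lock seqcount handling is omitted.  The quotient/remainder split keeps
 * the 64-bit multiplication from overflowing.
 */
static unsigned long long cyc_to_ns_sketch(const struct perf_event_mmap_page *pc,
                                           unsigned long long cyc)
{
        unsigned long long quot = cyc >> pc->time_shift;
        unsigned long long rem  = cyc & ((1ULL << pc->time_shift) - 1);

        return pc->time_offset + quot * pc->time_mult +
               ((rem * pc->time_mult) >> pc->time_shift);
}
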
1864
1865/*
1866 * callchain support
1867 */
1868
1869static int backtrace_stack(void *data, char *name)
1870{
1871        return 0;
1872}
1873
1874static void backtrace_address(void *data, unsigned long addr, int reliable)
1875{
1876        struct perf_callchain_entry *entry = data;
1877
1878        perf_callchain_store(entry, addr);
1879}
1880
1881static const struct stacktrace_ops backtrace_ops = {
1882        .stack                  = backtrace_stack,
1883        .address                = backtrace_address,
1884        .walk_stack             = print_context_stack_bp,
1885};
1886
1887void
1888perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1889{
1890        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1891                /* TODO: We don't support guest OS callchains yet */
1892                return;
1893        }
1894
1895        perf_callchain_store(entry, regs->ip);
1896
1897        dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1898}
1899
1900static inline int
1901valid_user_frame(const void __user *fp, unsigned long size)
1902{
1903        return (__range_not_ok(fp, size, TASK_SIZE) == 0);
1904}
1905
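/*
 * A segment selector encodes the descriptor table index in bits 15:3, the
 * table indicator (0 = GDT, 1 = LDT) in bit 2 and the requested privilege
 * level in bits 1:0, hence the >> 3 and the SEGMENT_TI_MASK test below.
 */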
1906static unsigned long get_segment_base(unsigned int segment)
1907{
1908        struct desc_struct *desc;
1909        int idx = segment >> 3;
1910
1911        if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1912                if (idx > LDT_ENTRIES)
1913                        return 0;
1914
1915                if (idx > current->active_mm->context.size)
1916                        return 0;
1917
1918                desc = current->active_mm->context.ldt;
1919        } else {
1920                if (idx > GDT_ENTRIES)
1921                        return 0;
1922
1923                desc = __this_cpu_ptr(&gdt_page.gdt[0]);
1924        }
1925
1926        return get_desc_base(desc + idx);
1927}
1928
1929#ifdef CONFIG_COMPAT
1930
1931#include <asm/compat.h>
1932
1933static inline int
1934perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1935{
1936        /* 32-bit process in 64-bit kernel. */
1937        unsigned long ss_base, cs_base;
1938        struct stack_frame_ia32 frame;
1939        const void __user *fp;
1940
1941        if (!test_thread_flag(TIF_IA32))
1942                return 0;
1943
1944        cs_base = get_segment_base(regs->cs);
1945        ss_base = get_segment_base(regs->ss);
1946
1947        fp = compat_ptr(ss_base + regs->bp);
1948        while (entry->nr < PERF_MAX_STACK_DEPTH) {
1949                unsigned long bytes;
1950                frame.next_frame     = 0;
1951                frame.return_address = 0;
1952
1953                bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1954                if (bytes != sizeof(frame))
1955                        break;
1956
1957                if (!valid_user_frame(fp, sizeof(frame)))
1958                        break;
1959
1960                perf_callchain_store(entry, cs_base + frame.return_address);
1961                fp = compat_ptr(ss_base + frame.next_frame);
1962        }
1963        return 1;
1964}
1965#else
1966static inline int
1967perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1968{
1969        return 0;
1970}
1971#endif
1972
1973void
1974perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1975{
1976        struct stack_frame frame;
1977        const void __user *fp;
1978
1979        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1980                /* TODO: We don't support guest OS callchains yet */
1981                return;
1982        }
1983
1984        /*
1985         * We don't know what to do with VM86 stacks; ignore them for now.
1986         */
1987        if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
1988                return;
1989
1990        fp = (void __user *)regs->bp;
1991
1992        perf_callchain_store(entry, regs->ip);
1993
1994        if (!current->mm)
1995                return;
1996
1997        if (perf_callchain_user32(regs, entry))
1998                return;
1999
2000        while (entry->nr < PERF_MAX_STACK_DEPTH) {
2001                unsigned long bytes;
2002                frame.next_frame             = NULL;
2003                frame.return_address = 0;
2004
2005                bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
2006                if (bytes != sizeof(frame))
2007                        break;
2008
2009                if (!valid_user_frame(fp, sizeof(frame)))
2010                        break;
2011
2012                perf_callchain_store(entry, frame.return_address);
2013                fp = frame.next_frame;
2014        }
2015}
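
/*
 * Both user callchain walkers above depend on the compiler maintaining
 * frame pointers: each frame is assumed to start with the saved frame
 * pointer followed by the return address (struct stack_frame and
 * stack_frame_ia32).  Binaries built with -fomit-frame-pointer will
 * typically produce truncated user callchains.
 */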
2016
2017/*
2018 * Deal with code segment offsets for the various execution modes:
2019 *
2020 *   VM86 - the good olde 16 bit days, where the linear address is
2021 *          20 bits and we use regs->ip + 0x10 * regs->cs.
2022 *
2023 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
2024 *          to figure out what the 32bit base address is.
2025 *
2026 *    X32 - has TIF_X32 set, but is running in x86_64 mode
2027 *
2028 * X86_64 - CS,DS,SS,ES are all zero based.
2029 */
2030static unsigned long code_segment_base(struct pt_regs *regs)
2031{
2032        /*
2033         * If we are in VM86 mode, add the segment offset to convert to a
2034         * linear address.
2035         */
2036        if (regs->flags & X86_VM_MASK)
2037                return 0x10 * regs->cs;
2038
2039        /*
2040         * For IA32 we look at the GDT/LDT segment base to convert the
2041         * effective IP to a linear address.
2042         */
2043#ifdef CONFIG_X86_32
2044        if (user_mode(regs) && regs->cs != __USER_CS)
2045                return get_segment_base(regs->cs);
2046#else
2047        if (test_thread_flag(TIF_IA32)) {
2048                if (user_mode(regs) && regs->cs != __USER32_CS)
2049                        return get_segment_base(regs->cs);
2050        }
2051#endif
2052        return 0;
2053}
2054
2055unsigned long perf_instruction_pointer(struct pt_regs *regs)
2056{
2057        if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
2058                return perf_guest_cbs->get_guest_ip();
2059
2060        return regs->ip + code_segment_base(regs);
2061}
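
/*
 * Worked example for the VM86 case handled above: with regs->cs == 0x1234
 * and regs->ip == 0x0100 the reported instruction pointer is
 * 0x1234 * 0x10 + 0x0100 = 0x12440, a 20-bit real-mode style linear address.
 */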
2062
2063unsigned long perf_misc_flags(struct pt_regs *regs)
2064{
2065        int misc = 0;
2066
2067        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
2068                if (perf_guest_cbs->is_user_mode())
2069                        misc |= PERF_RECORD_MISC_GUEST_USER;
2070                else
2071                        misc |= PERF_RECORD_MISC_GUEST_KERNEL;
2072        } else {
2073                if (user_mode(regs))
2074                        misc |= PERF_RECORD_MISC_USER;
2075                else
2076                        misc |= PERF_RECORD_MISC_KERNEL;
2077        }
2078
2079        if (regs->flags & PERF_EFLAGS_EXACT)
2080                misc |= PERF_RECORD_MISC_EXACT_IP;
2081
2082        return misc;
2083}
2084
2085void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
2086{
2087        cap->version            = x86_pmu.version;
2088        cap->num_counters_gp    = x86_pmu.num_counters;
2089        cap->num_counters_fixed = x86_pmu.num_counters_fixed;
2090        cap->bit_width_gp       = x86_pmu.cntval_bits;
2091        cap->bit_width_fixed    = x86_pmu.cntval_bits;
2092        cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
2093        cap->events_mask_len    = x86_pmu.events_mask_len;
2094}
2095EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
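
/*
 * Illustrative sketch (not part of this file; report_pmu_caps_sketch() is a
 * made-up helper): an in-kernel consumer, e.g. a virtual PMU implementation,
 * sizing itself from the capabilities exported above.
 */
static void report_pmu_caps_sketch(void)
{
        struct x86_pmu_capability cap;

        perf_get_x86_pmu_capability(&cap);

        pr_info("PMU v%d: %d GP counters (%d bits wide), %d fixed counters\n",
                cap.version, cap.num_counters_gp, cap.bit_width_gp,
                cap.num_counters_fixed);
}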
2096