linux/arch/x86/events/core.c
   1/*
   2 * Performance events x86 architecture code
   3 *
   4 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6 *  Copyright (C) 2009 Jaswinder Singh Rajput
   7 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
   8 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
   9 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
  10 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
  11 *
  12 *  For licencing details see kernel-base/COPYING
  13 */
  14
  15#include <linux/perf_event.h>
  16#include <linux/capability.h>
  17#include <linux/notifier.h>
  18#include <linux/hardirq.h>
  19#include <linux/kprobes.h>
  20#include <linux/export.h>
  21#include <linux/init.h>
  22#include <linux/kdebug.h>
  23#include <linux/sched.h>
  24#include <linux/uaccess.h>
  25#include <linux/slab.h>
  26#include <linux/cpu.h>
  27#include <linux/bitops.h>
  28#include <linux/device.h>
  29
  30#include <asm/apic.h>
  31#include <asm/stacktrace.h>
  32#include <asm/nmi.h>
  33#include <asm/smp.h>
  34#include <asm/alternative.h>
  35#include <asm/mmu_context.h>
  36#include <asm/tlbflush.h>
  37#include <asm/timer.h>
  38#include <asm/desc.h>
  39#include <asm/ldt.h>
  40
  41#include "perf_event.h"
  42
  43struct x86_pmu x86_pmu __read_mostly;
  44
  45DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
  46        .enabled = 1,
  47};
  48
  49struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
  50
  51u64 __read_mostly hw_cache_event_ids
  52                                [PERF_COUNT_HW_CACHE_MAX]
  53                                [PERF_COUNT_HW_CACHE_OP_MAX]
  54                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
  55u64 __read_mostly hw_cache_extra_regs
  56                                [PERF_COUNT_HW_CACHE_MAX]
  57                                [PERF_COUNT_HW_CACHE_OP_MAX]
  58                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
  59
  60/*
  61 * Propagate event elapsed time into the generic event.
  62 * Can only be executed on the CPU where the event is active.
   63 * Returns the new raw count; the delta is folded into event->count.
  64 */
  65u64 x86_perf_event_update(struct perf_event *event)
  66{
  67        struct hw_perf_event *hwc = &event->hw;
  68        int shift = 64 - x86_pmu.cntval_bits;
  69        u64 prev_raw_count, new_raw_count;
  70        int idx = hwc->idx;
  71        s64 delta;
  72
  73        if (idx == INTEL_PMC_IDX_FIXED_BTS)
  74                return 0;
  75
  76        /*
  77         * Careful: an NMI might modify the previous event value.
  78         *
  79         * Our tactic to handle this is to first atomically read and
  80         * exchange a new raw count - then add that new-prev delta
  81         * count to the generic event atomically:
  82         */
  83again:
  84        prev_raw_count = local64_read(&hwc->prev_count);
  85        rdpmcl(hwc->event_base_rdpmc, new_raw_count);
  86
  87        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
  88                                        new_raw_count) != prev_raw_count)
  89                goto again;
  90
  91        /*
  92         * Now we have the new raw value and have updated the prev
  93         * timestamp already. We can now calculate the elapsed delta
  94         * (event-)time and add that to the generic event.
  95         *
  96         * Careful, not all hw sign-extends above the physical width
  97         * of the count.
  98         */
  99        delta = (new_raw_count << shift) - (prev_raw_count << shift);
 100        delta >>= shift;
 101
 102        local64_add(delta, &event->count);
 103        local64_sub(delta, &hwc->period_left);
 104
 105        return new_raw_count;
 106}
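
/*
 * Worked example, assuming a PMU with 48-bit counters (cntval_bits = 48,
 * so shift = 16): if the counter wrapped from prev_raw_count = 0xffffffffffff
 * to new_raw_count = 0x5, then
 *
 *   delta = ((0x5 << 16) - (0xffffffffffff << 16)) >> 16 = 6
 *
 * The left shifts discard any bits above the physical counter width, and the
 * arithmetic right shift of the signed difference keeps the delta correct
 * across the wrap.
 */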
 107
 108/*
 109 * Find and validate any extra registers to set up.
 110 */
 111static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 112{
 113        struct hw_perf_event_extra *reg;
 114        struct extra_reg *er;
 115
 116        reg = &event->hw.extra_reg;
 117
 118        if (!x86_pmu.extra_regs)
 119                return 0;
 120
 121        for (er = x86_pmu.extra_regs; er->msr; er++) {
 122                if (er->event != (config & er->config_mask))
 123                        continue;
 124                if (event->attr.config1 & ~er->valid_mask)
 125                        return -EINVAL;
  126                /* Check if the extra MSRs can be safely accessed */
 127                if (!er->extra_msr_access)
 128                        return -ENXIO;
 129
 130                reg->idx = er->idx;
 131                reg->config = event->attr.config1;
 132                reg->reg = er->msr;
 133                break;
 134        }
 135        return 0;
 136}
 137
 138static atomic_t active_events;
 139static atomic_t pmc_refcount;
 140static DEFINE_MUTEX(pmc_reserve_mutex);
 141
 142#ifdef CONFIG_X86_LOCAL_APIC
 143
 144static bool reserve_pmc_hardware(void)
 145{
 146        int i;
 147
 148        for (i = 0; i < x86_pmu.num_counters; i++) {
 149                if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
 150                        goto perfctr_fail;
 151        }
 152
 153        for (i = 0; i < x86_pmu.num_counters; i++) {
 154                if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
 155                        goto eventsel_fail;
 156        }
 157
 158        return true;
 159
 160eventsel_fail:
 161        for (i--; i >= 0; i--)
 162                release_evntsel_nmi(x86_pmu_config_addr(i));
 163
 164        i = x86_pmu.num_counters;
 165
 166perfctr_fail:
 167        for (i--; i >= 0; i--)
 168                release_perfctr_nmi(x86_pmu_event_addr(i));
 169
 170        return false;
 171}
 172
 173static void release_pmc_hardware(void)
 174{
 175        int i;
 176
 177        for (i = 0; i < x86_pmu.num_counters; i++) {
 178                release_perfctr_nmi(x86_pmu_event_addr(i));
 179                release_evntsel_nmi(x86_pmu_config_addr(i));
 180        }
 181}
 182
 183#else
 184
 185static bool reserve_pmc_hardware(void) { return true; }
 186static void release_pmc_hardware(void) {}
 187
 188#endif
 189
 190static bool check_hw_exists(void)
 191{
  192        u64 val, val_fail, val_new = ~0;
 193        int i, reg, reg_fail, ret = 0;
 194        int bios_fail = 0;
 195        int reg_safe = -1;
 196
 197        /*
  198         * Check to see if the BIOS enabled any of the counters; if so,
  199         * complain and bail.
 200         */
 201        for (i = 0; i < x86_pmu.num_counters; i++) {
 202                reg = x86_pmu_config_addr(i);
 203                ret = rdmsrl_safe(reg, &val);
 204                if (ret)
 205                        goto msr_fail;
 206                if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
 207                        bios_fail = 1;
 208                        val_fail = val;
 209                        reg_fail = reg;
 210                } else {
 211                        reg_safe = i;
 212                }
 213        }
 214
 215        if (x86_pmu.num_counters_fixed) {
 216                reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
 217                ret = rdmsrl_safe(reg, &val);
 218                if (ret)
 219                        goto msr_fail;
 220                for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
 221                        if (val & (0x03 << i*4)) {
 222                                bios_fail = 1;
 223                                val_fail = val;
 224                                reg_fail = reg;
 225                        }
 226                }
 227        }
 228
 229        /*
 230         * If all the counters are enabled, the below test will always
 231         * fail.  The tools will also become useless in this scenario.
 232         * Just fail and disable the hardware counters.
 233         */
 234
 235        if (reg_safe == -1) {
 236                reg = reg_safe;
 237                goto msr_fail;
 238        }
 239
 240        /*
 241         * Read the current value, change it and read it back to see if it
  242         * matches; this is needed to detect certain hardware emulators
 243         * (qemu/kvm) that don't trap on the MSR access and always return 0s.
 244         */
 245        reg = x86_pmu_event_addr(reg_safe);
 246        if (rdmsrl_safe(reg, &val))
 247                goto msr_fail;
 248        val ^= 0xffffUL;
 249        ret = wrmsrl_safe(reg, val);
 250        ret |= rdmsrl_safe(reg, &val_new);
 251        if (ret || val != val_new)
 252                goto msr_fail;
 253
 254        /*
 255         * We still allow the PMU driver to operate:
 256         */
 257        if (bios_fail) {
 258                pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
 259                pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
 260                              reg_fail, val_fail);
 261        }
 262
 263        return true;
 264
 265msr_fail:
 266        if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
 267                pr_cont("PMU not available due to virtualization, using software events only.\n");
 268        } else {
 269                pr_cont("Broken PMU hardware detected, using software events only.\n");
 270                pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
 271                       reg, val_new);
 272        }
 273
 274        return false;
 275}
 276
 277static void hw_perf_event_destroy(struct perf_event *event)
 278{
 279        x86_release_hardware();
 280        atomic_dec(&active_events);
 281}
 282
 283void hw_perf_lbr_event_destroy(struct perf_event *event)
 284{
 285        hw_perf_event_destroy(event);
 286
 287        /* undo the lbr/bts event accounting */
 288        x86_del_exclusive(x86_lbr_exclusive_lbr);
 289}
 290
 291static inline int x86_pmu_initialized(void)
 292{
 293        return x86_pmu.handle_irq != NULL;
 294}
 295
 296static inline int
 297set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 298{
 299        struct perf_event_attr *attr = &event->attr;
 300        unsigned int cache_type, cache_op, cache_result;
 301        u64 config, val;
 302
 303        config = attr->config;
 304
 305        cache_type = (config >>  0) & 0xff;
 306        if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
 307                return -EINVAL;
 308
 309        cache_op = (config >>  8) & 0xff;
 310        if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
 311                return -EINVAL;
 312
 313        cache_result = (config >> 16) & 0xff;
 314        if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
 315                return -EINVAL;
 316
 317        val = hw_cache_event_ids[cache_type][cache_op][cache_result];
 318
 319        if (val == 0)
 320                return -ENOENT;
 321
 322        if (val == -1)
 323                return -EINVAL;
 324
 325        hwc->config |= val;
 326        attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
 327        return x86_pmu_extra_regs(val, event);
 328}
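
/*
 * Sketch of how a PERF_TYPE_HW_CACHE config is packed by user space before
 * it reaches set_ext_hw_attr() (byte 0: cache, byte 1: op, byte 2: result):
 *
 *   attr.config = PERF_COUNT_HW_CACHE_L1D |
 *                 (PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *                 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
 *
 * The lookup in hw_cache_event_ids[][][] then yields the model-specific
 * encoding: 0 means the combination is not supported (-ENOENT), -1 means it
 * is invalid (-EINVAL).
 */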
 329
 330int x86_reserve_hardware(void)
 331{
 332        int err = 0;
 333
 334        if (!atomic_inc_not_zero(&pmc_refcount)) {
 335                mutex_lock(&pmc_reserve_mutex);
 336                if (atomic_read(&pmc_refcount) == 0) {
 337                        if (!reserve_pmc_hardware())
 338                                err = -EBUSY;
 339                        else
 340                                reserve_ds_buffers();
 341                }
 342                if (!err)
 343                        atomic_inc(&pmc_refcount);
 344                mutex_unlock(&pmc_reserve_mutex);
 345        }
 346
 347        return err;
 348}
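
/*
 * Note on the refcounting pattern above: atomic_inc_not_zero() is the
 * lock-free fast path for the common case where the PMC hardware is already
 * reserved; only the first user (refcount 0 -> 1) takes pmc_reserve_mutex
 * and actually reserves the counters and DS buffers.
 */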
 349
 350void x86_release_hardware(void)
 351{
 352        if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
 353                release_pmc_hardware();
 354                release_ds_buffers();
 355                mutex_unlock(&pmc_reserve_mutex);
 356        }
 357}
 358
 359/*
  360 * Check if we can create an event of a certain type (i.e. that no
  361 * conflicting events are present).
 362 */
 363int x86_add_exclusive(unsigned int what)
 364{
 365        int i;
 366
 367        if (x86_pmu.lbr_pt_coexist)
 368                return 0;
 369
 370        if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
 371                mutex_lock(&pmc_reserve_mutex);
 372                for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
 373                        if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
 374                                goto fail_unlock;
 375                }
 376                atomic_inc(&x86_pmu.lbr_exclusive[what]);
 377                mutex_unlock(&pmc_reserve_mutex);
 378        }
 379
 380        atomic_inc(&active_events);
 381        return 0;
 382
 383fail_unlock:
 384        mutex_unlock(&pmc_reserve_mutex);
 385        return -EBUSY;
 386}
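
/*
 * Example, assuming the usual lbr_exclusive slots (lbr, bts, pt): creating an
 * LBR or BTS user while a PT event holds x86_lbr_exclusive_pt fails with
 * -EBUSY, unless the PMU sets lbr_pt_coexist, in which case the check is
 * skipped entirely.
 */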
 387
 388void x86_del_exclusive(unsigned int what)
 389{
 390        if (x86_pmu.lbr_pt_coexist)
 391                return;
 392
 393        atomic_dec(&x86_pmu.lbr_exclusive[what]);
 394        atomic_dec(&active_events);
 395}
 396
 397int x86_setup_perfctr(struct perf_event *event)
 398{
 399        struct perf_event_attr *attr = &event->attr;
 400        struct hw_perf_event *hwc = &event->hw;
 401        u64 config;
 402
 403        if (!is_sampling_event(event)) {
 404                hwc->sample_period = x86_pmu.max_period;
 405                hwc->last_period = hwc->sample_period;
 406                local64_set(&hwc->period_left, hwc->sample_period);
 407        }
 408
 409        if (attr->type == PERF_TYPE_RAW)
 410                return x86_pmu_extra_regs(event->attr.config, event);
 411
 412        if (attr->type == PERF_TYPE_HW_CACHE)
 413                return set_ext_hw_attr(hwc, event);
 414
 415        if (attr->config >= x86_pmu.max_events)
 416                return -EINVAL;
 417
 418        /*
 419         * The generic map:
 420         */
 421        config = x86_pmu.event_map(attr->config);
 422
 423        if (config == 0)
 424                return -ENOENT;
 425
 426        if (config == -1LL)
 427                return -EINVAL;
 428
 429        /*
 430         * Branch tracing:
 431         */
 432        if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
 433            !attr->freq && hwc->sample_period == 1) {
 434                /* BTS is not supported by this architecture. */
 435                if (!x86_pmu.bts_active)
 436                        return -EOPNOTSUPP;
 437
 438                /* BTS is currently only allowed for user-mode. */
 439                if (!attr->exclude_kernel)
 440                        return -EOPNOTSUPP;
 441
 442                /* disallow bts if conflicting events are present */
 443                if (x86_add_exclusive(x86_lbr_exclusive_lbr))
 444                        return -EBUSY;
 445
 446                event->destroy = hw_perf_lbr_event_destroy;
 447        }
 448
 449        hwc->config |= config;
 450
 451        return 0;
 452}
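
/*
 * The branch-tracing special case above means a request such as
 *
 *   perf record -e branches:u -c 1
 *
 * (a branch-instructions event with exclude_kernel set and a fixed sample
 * period of 1) is backed by BTS rather than a generic counter, provided BTS
 * is active and no conflicting LBR/PT user exists.
 */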
 453
 454/*
  455 * Check that branch_sample_type is compatible with the
  456 * settings needed for precise_ip > 1, which implies
  457 * using the LBR to capture ALL taken branches at the
  458 * privilege levels of the measurement.
 459 */
 460static inline int precise_br_compat(struct perf_event *event)
 461{
 462        u64 m = event->attr.branch_sample_type;
 463        u64 b = 0;
 464
 465        /* must capture all branches */
 466        if (!(m & PERF_SAMPLE_BRANCH_ANY))
 467                return 0;
 468
 469        m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
 470
 471        if (!event->attr.exclude_user)
 472                b |= PERF_SAMPLE_BRANCH_USER;
 473
 474        if (!event->attr.exclude_kernel)
 475                b |= PERF_SAMPLE_BRANCH_KERNEL;
 476
 477        /*
 478         * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
 479         */
 480
 481        return m == b;
 482}
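
/*
 * Example: an event with exclude_kernel=1 and exclude_user=0 needs
 * b == PERF_SAMPLE_BRANCH_USER, so branch_sample_type = ANY|USER is
 * compatible, while ANY alone or ANY|KERNEL|USER is not: the LBR filtering
 * would no longer match the privilege level of the PEBS fixup.
 */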
 483
 484int x86_pmu_hw_config(struct perf_event *event)
 485{
 486        if (event->attr.precise_ip) {
 487                int precise = 0;
 488
 489                /* Support for constant skid */
 490                if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
 491                        precise++;
 492
 493                        /* Support for IP fixup */
 494                        if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
 495                                precise++;
 496
 497                        if (x86_pmu.pebs_prec_dist)
 498                                precise++;
 499                }
 500
 501                if (event->attr.precise_ip > precise)
 502                        return -EOPNOTSUPP;
 503        }
 504        /*
 505         * check that PEBS LBR correction does not conflict with
 506         * whatever the user is asking with attr->branch_sample_type
 507         */
 508        if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
 509                u64 *br_type = &event->attr.branch_sample_type;
 510
 511                if (has_branch_stack(event)) {
 512                        if (!precise_br_compat(event))
 513                                return -EOPNOTSUPP;
 514
 515                        /* branch_sample_type is compatible */
 516
 517                } else {
 518                        /*
  519                         * user did not specify branch_sample_type
 520                         *
 521                         * For PEBS fixups, we capture all
 522                         * the branches at the priv level of the
 523                         * event.
 524                         */
 525                        *br_type = PERF_SAMPLE_BRANCH_ANY;
 526
 527                        if (!event->attr.exclude_user)
 528                                *br_type |= PERF_SAMPLE_BRANCH_USER;
 529
 530                        if (!event->attr.exclude_kernel)
 531                                *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
 532                }
 533        }
 534
 535        if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
 536                event->attach_state |= PERF_ATTACH_TASK_DATA;
 537
 538        /*
 539         * Generate PMC IRQs:
 540         * (keep 'enabled' bit clear for now)
 541         */
 542        event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
 543
 544        /*
 545         * Count user and OS events unless requested not to
 546         */
 547        if (!event->attr.exclude_user)
 548                event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
 549        if (!event->attr.exclude_kernel)
 550                event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 551
 552        if (event->attr.type == PERF_TYPE_RAW)
 553                event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 554
 555        if (event->attr.sample_period && x86_pmu.limit_period) {
 556                if (x86_pmu.limit_period(event, event->attr.sample_period) >
 557                                event->attr.sample_period)
 558                        return -EINVAL;
 559        }
 560
 561        return x86_setup_perfctr(event);
 562}
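
/*
 * For reference, the perf tool's event modifiers map onto the precise_ip
 * levels validated above: "cycles:p" requests precise_ip = 1, ":pp" = 2 and
 * ":ppp" = 3; the request is rejected with -EOPNOTSUPP when it exceeds what
 * PEBS (plus LBR/PEBS-v2 IP fixup and precise distribution) can provide.
 */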
 563
 564/*
 565 * Setup the hardware configuration for a given attr_type
 566 */
 567static int __x86_pmu_event_init(struct perf_event *event)
 568{
 569        int err;
 570
 571        if (!x86_pmu_initialized())
 572                return -ENODEV;
 573
 574        err = x86_reserve_hardware();
 575        if (err)
 576                return err;
 577
 578        atomic_inc(&active_events);
 579        event->destroy = hw_perf_event_destroy;
 580
 581        event->hw.idx = -1;
 582        event->hw.last_cpu = -1;
 583        event->hw.last_tag = ~0ULL;
 584
 585        /* mark unused */
 586        event->hw.extra_reg.idx = EXTRA_REG_NONE;
 587        event->hw.branch_reg.idx = EXTRA_REG_NONE;
 588
 589        return x86_pmu.hw_config(event);
 590}
 591
 592void x86_pmu_disable_all(void)
 593{
 594        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 595        int idx;
 596
 597        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 598                u64 val;
 599
 600                if (!test_bit(idx, cpuc->active_mask))
 601                        continue;
 602                rdmsrl(x86_pmu_config_addr(idx), val);
 603                if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
 604                        continue;
 605                val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
 606                wrmsrl(x86_pmu_config_addr(idx), val);
 607        }
 608}
 609
 610/*
  611 * A PMI may land after enabled=0; it can hit either before or after
  612 * disable_all.
  613 *
  614 * If the PMI hits before disable_all, the PMU will be disabled in the NMI
  615 * handler. It will not be re-enabled in the NMI handler, because enabled=0.
  616 * After handling the NMI, disable_all will be called, which does not change
  617 * the state either. If the PMI hits after disable_all, the PMU is already
  618 * disabled before entering the NMI handler, and the NMI handler will not
  619 * change the state either.
 620 *
 621 * So either situation is harmless.
 622 */
 623static void x86_pmu_disable(struct pmu *pmu)
 624{
 625        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 626
 627        if (!x86_pmu_initialized())
 628                return;
 629
 630        if (!cpuc->enabled)
 631                return;
 632
 633        cpuc->n_added = 0;
 634        cpuc->enabled = 0;
 635        barrier();
 636
 637        x86_pmu.disable_all();
 638}
 639
 640void x86_pmu_enable_all(int added)
 641{
 642        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 643        int idx;
 644
 645        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 646                struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
 647
 648                if (!test_bit(idx, cpuc->active_mask))
 649                        continue;
 650
 651                __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 652        }
 653}
 654
 655static struct pmu pmu;
 656
 657static inline int is_x86_event(struct perf_event *event)
 658{
 659        return event->pmu == &pmu;
 660}
 661
 662/*
 663 * Event scheduler state:
 664 *
 665 * Assign events iterating over all events and counters, beginning
 666 * with events with least weights first. Keep the current iterator
 667 * state in struct sched_state.
 668 */
 669struct sched_state {
 670        int     weight;
 671        int     event;          /* event index */
 672        int     counter;        /* counter index */
 673        int     unassigned;     /* number of events to be assigned left */
 674        int     nr_gp;          /* number of GP counters used */
 675        unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 676};
 677
 678/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
 679#define SCHED_STATES_MAX        2
 680
 681struct perf_sched {
 682        int                     max_weight;
 683        int                     max_events;
 684        int                     max_gp;
 685        int                     saved_states;
 686        struct event_constraint **constraints;
 687        struct sched_state      state;
 688        struct sched_state      saved[SCHED_STATES_MAX];
 689};
 690
 691/*
  692 * Initialize the iterator that runs through all events and counters.
 693 */
 694static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
 695                            int num, int wmin, int wmax, int gpmax)
 696{
 697        int idx;
 698
 699        memset(sched, 0, sizeof(*sched));
 700        sched->max_events       = num;
 701        sched->max_weight       = wmax;
 702        sched->max_gp           = gpmax;
 703        sched->constraints      = constraints;
 704
 705        for (idx = 0; idx < num; idx++) {
 706                if (constraints[idx]->weight == wmin)
 707                        break;
 708        }
 709
 710        sched->state.event      = idx;          /* start with min weight */
 711        sched->state.weight     = wmin;
 712        sched->state.unassigned = num;
 713}
 714
 715static void perf_sched_save_state(struct perf_sched *sched)
 716{
 717        if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
 718                return;
 719
 720        sched->saved[sched->saved_states] = sched->state;
 721        sched->saved_states++;
 722}
 723
 724static bool perf_sched_restore_state(struct perf_sched *sched)
 725{
 726        if (!sched->saved_states)
 727                return false;
 728
 729        sched->saved_states--;
 730        sched->state = sched->saved[sched->saved_states];
 731
 732        /* continue with next counter: */
 733        clear_bit(sched->state.counter++, sched->state.used);
 734
 735        return true;
 736}
 737
 738/*
 739 * Select a counter for the current event to schedule. Return true on
 740 * success.
 741 */
 742static bool __perf_sched_find_counter(struct perf_sched *sched)
 743{
 744        struct event_constraint *c;
 745        int idx;
 746
 747        if (!sched->state.unassigned)
 748                return false;
 749
 750        if (sched->state.event >= sched->max_events)
 751                return false;
 752
 753        c = sched->constraints[sched->state.event];
 754        /* Prefer fixed purpose counters */
 755        if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
 756                idx = INTEL_PMC_IDX_FIXED;
 757                for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
 758                        if (!__test_and_set_bit(idx, sched->state.used))
 759                                goto done;
 760                }
 761        }
 762
 763        /* Grab the first unused counter starting with idx */
 764        idx = sched->state.counter;
 765        for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
 766                if (!__test_and_set_bit(idx, sched->state.used)) {
 767                        if (sched->state.nr_gp++ >= sched->max_gp)
 768                                return false;
 769
 770                        goto done;
 771                }
 772        }
 773
 774        return false;
 775
 776done:
 777        sched->state.counter = idx;
 778
 779        if (c->overlap)
 780                perf_sched_save_state(sched);
 781
 782        return true;
 783}
 784
 785static bool perf_sched_find_counter(struct perf_sched *sched)
 786{
 787        while (!__perf_sched_find_counter(sched)) {
 788                if (!perf_sched_restore_state(sched))
 789                        return false;
 790        }
 791
 792        return true;
 793}
 794
 795/*
 796 * Go through all unassigned events and find the next one to schedule.
 797 * Take events with the least weight first. Return true on success.
 798 */
 799static bool perf_sched_next_event(struct perf_sched *sched)
 800{
 801        struct event_constraint *c;
 802
 803        if (!sched->state.unassigned || !--sched->state.unassigned)
 804                return false;
 805
 806        do {
 807                /* next event */
 808                sched->state.event++;
 809                if (sched->state.event >= sched->max_events) {
 810                        /* next weight */
 811                        sched->state.event = 0;
 812                        sched->state.weight++;
 813                        if (sched->state.weight > sched->max_weight)
 814                                return false;
 815                }
 816                c = sched->constraints[sched->state.event];
 817        } while (c->weight != sched->state.weight);
 818
 819        sched->state.counter = 0;       /* start with first counter */
 820
 821        return true;
 822}
 823
 824/*
 825 * Assign a counter for each event.
 826 */
 827int perf_assign_events(struct event_constraint **constraints, int n,
 828                        int wmin, int wmax, int gpmax, int *assign)
 829{
 830        struct perf_sched sched;
 831
 832        perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
 833
 834        do {
 835                if (!perf_sched_find_counter(&sched))
 836                        break;  /* failed */
 837                if (assign)
 838                        assign[sched.state.event] = sched.state.counter;
 839        } while (perf_sched_next_event(&sched));
 840
 841        return sched.state.unassigned;
 842}
 843EXPORT_SYMBOL_GPL(perf_assign_events);
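
/*
 * Worked example of the least-weight-first order: with event A constrained
 * to counter 0 only (weight 1) and event B allowed on counters 0-3
 * (weight 4), the scheduler places A first and gives it counter 0, then B
 * lands on counter 1. Scheduling B first could grab counter 0 and leave A
 * unschedulable, which is exactly what iterating by increasing weight (plus
 * the save/restore path for overlapping constraints) avoids.
 */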
 844
 845int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 846{
 847        struct event_constraint *c;
 848        unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 849        struct perf_event *e;
 850        int i, wmin, wmax, unsched = 0;
 851        struct hw_perf_event *hwc;
 852
 853        bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 854
 855        if (x86_pmu.start_scheduling)
 856                x86_pmu.start_scheduling(cpuc);
 857
 858        for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 859                cpuc->event_constraint[i] = NULL;
 860                c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
 861                cpuc->event_constraint[i] = c;
 862
 863                wmin = min(wmin, c->weight);
 864                wmax = max(wmax, c->weight);
 865        }
 866
 867        /*
 868         * fastpath, try to reuse previous register
 869         */
 870        for (i = 0; i < n; i++) {
 871                hwc = &cpuc->event_list[i]->hw;
 872                c = cpuc->event_constraint[i];
 873
 874                /* never assigned */
 875                if (hwc->idx == -1)
 876                        break;
 877
 878                /* constraint still honored */
 879                if (!test_bit(hwc->idx, c->idxmsk))
 880                        break;
 881
 882                /* not already used */
 883                if (test_bit(hwc->idx, used_mask))
 884                        break;
 885
 886                __set_bit(hwc->idx, used_mask);
 887                if (assign)
 888                        assign[i] = hwc->idx;
 889        }
 890
 891        /* slow path */
 892        if (i != n) {
 893                int gpmax = x86_pmu.num_counters;
 894
 895                /*
 896                 * Do not allow scheduling of more than half the available
 897                 * generic counters.
 898                 *
  899                 * This helps avoid counter starvation of the sibling thread
  900                 * by ensuring at most half the counters cannot be in exclusive
  901                 * mode. There are no designated counters for the limits; any
  902                 * N/2 counters can be used. This helps with events that have
  903                 * specific counter constraints.
 904                 */
 905                if (is_ht_workaround_enabled() && !cpuc->is_fake &&
 906                    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
 907                        gpmax /= 2;
 908
 909                unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
 910                                             wmax, gpmax, assign);
 911        }
 912
 913        /*
 914         * In case of success (unsched = 0), mark events as committed,
 915         * so we do not put_constraint() in case new events are added
 916         * and fail to be scheduled
 917         *
 918         * We invoke the lower level commit callback to lock the resource
 919         *
 920         * We do not need to do all of this in case we are called to
 921         * validate an event group (assign == NULL)
 922         */
 923        if (!unsched && assign) {
 924                for (i = 0; i < n; i++) {
 925                        e = cpuc->event_list[i];
 926                        e->hw.flags |= PERF_X86_EVENT_COMMITTED;
 927                        if (x86_pmu.commit_scheduling)
 928                                x86_pmu.commit_scheduling(cpuc, i, assign[i]);
 929                }
 930        } else {
 931                for (i = 0; i < n; i++) {
 932                        e = cpuc->event_list[i];
 933                        /*
  934                         * do not put_constraint() on committed events,
 935                         * because they are good to go
 936                         */
 937                        if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
 938                                continue;
 939
 940                        /*
 941                         * release events that failed scheduling
 942                         */
 943                        if (x86_pmu.put_event_constraints)
 944                                x86_pmu.put_event_constraints(cpuc, e);
 945                }
 946        }
 947
 948        if (x86_pmu.stop_scheduling)
 949                x86_pmu.stop_scheduling(cpuc);
 950
 951        return unsched ? -EINVAL : 0;
 952}
 953
 954/*
  955 * dogrp: true if we must collect sibling events (i.e. the whole group)
  956 * returns the total number of events, or an error code
 957 */
 958static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
 959{
 960        struct perf_event *event;
 961        int n, max_count;
 962
 963        max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
 964
 965        /* current number of events already accepted */
 966        n = cpuc->n_events;
 967
 968        if (is_x86_event(leader)) {
 969                if (n >= max_count)
 970                        return -EINVAL;
 971                cpuc->event_list[n] = leader;
 972                n++;
 973        }
 974        if (!dogrp)
 975                return n;
 976
 977        list_for_each_entry(event, &leader->sibling_list, group_entry) {
 978                if (!is_x86_event(event) ||
 979                    event->state <= PERF_EVENT_STATE_OFF)
 980                        continue;
 981
 982                if (n >= max_count)
 983                        return -EINVAL;
 984
 985                cpuc->event_list[n] = event;
 986                n++;
 987        }
 988        return n;
 989}
 990
 991static inline void x86_assign_hw_event(struct perf_event *event,
 992                                struct cpu_hw_events *cpuc, int i)
 993{
 994        struct hw_perf_event *hwc = &event->hw;
 995
 996        hwc->idx = cpuc->assign[i];
 997        hwc->last_cpu = smp_processor_id();
 998        hwc->last_tag = ++cpuc->tags[i];
 999
1000        if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
1001                hwc->config_base = 0;
1002                hwc->event_base = 0;
1003        } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
1004                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1005                hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
1006                hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
1007        } else {
1008                hwc->config_base = x86_pmu_config_addr(hwc->idx);
1009                hwc->event_base  = x86_pmu_event_addr(hwc->idx);
1010                hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
1011        }
1012}
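
/*
 * Example of the rdpmc encoding used above: fixed counter 1 gets
 * event_base_rdpmc = 1 | (1 << 30) = 0x40000001, i.e. the ECX value that
 * selects fixed-function counter 1 for the RDPMC instruction, while generic
 * counter N simply uses index N.
 */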
1013
1014static inline int match_prev_assignment(struct hw_perf_event *hwc,
1015                                        struct cpu_hw_events *cpuc,
1016                                        int i)
1017{
1018        return hwc->idx == cpuc->assign[i] &&
1019                hwc->last_cpu == smp_processor_id() &&
1020                hwc->last_tag == cpuc->tags[i];
1021}
1022
1023static void x86_pmu_start(struct perf_event *event, int flags);
1024
1025static void x86_pmu_enable(struct pmu *pmu)
1026{
1027        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1028        struct perf_event *event;
1029        struct hw_perf_event *hwc;
1030        int i, added = cpuc->n_added;
1031
1032        if (!x86_pmu_initialized())
1033                return;
1034
1035        if (cpuc->enabled)
1036                return;
1037
1038        if (cpuc->n_added) {
1039                int n_running = cpuc->n_events - cpuc->n_added;
1040                /*
1041                 * apply assignment obtained either from
1042                 * hw_perf_group_sched_in() or x86_pmu_enable()
1043                 *
1044                 * step1: save events moving to new counters
1045                 */
1046                for (i = 0; i < n_running; i++) {
1047                        event = cpuc->event_list[i];
1048                        hwc = &event->hw;
1049
1050                        /*
1051                         * we can avoid reprogramming counter if:
1052                         * - assigned same counter as last time
1053                         * - running on same CPU as last time
1054                         * - no other event has used the counter since
1055                         */
1056                        if (hwc->idx == -1 ||
1057                            match_prev_assignment(hwc, cpuc, i))
1058                                continue;
1059
1060                        /*
1061                         * Ensure we don't accidentally enable a stopped
1062                         * counter simply because we rescheduled.
1063                         */
1064                        if (hwc->state & PERF_HES_STOPPED)
1065                                hwc->state |= PERF_HES_ARCH;
1066
1067                        x86_pmu_stop(event, PERF_EF_UPDATE);
1068                }
1069
1070                /*
1071                 * step2: reprogram moved events into new counters
1072                 */
1073                for (i = 0; i < cpuc->n_events; i++) {
1074                        event = cpuc->event_list[i];
1075                        hwc = &event->hw;
1076
1077                        if (!match_prev_assignment(hwc, cpuc, i))
1078                                x86_assign_hw_event(event, cpuc, i);
1079                        else if (i < n_running)
1080                                continue;
1081
1082                        if (hwc->state & PERF_HES_ARCH)
1083                                continue;
1084
1085                        x86_pmu_start(event, PERF_EF_RELOAD);
1086                }
1087                cpuc->n_added = 0;
1088                perf_events_lapic_init();
1089        }
1090
1091        cpuc->enabled = 1;
1092        barrier();
1093
1094        x86_pmu.enable_all(added);
1095}
1096
1097static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1098
1099/*
1100 * Set the next IRQ period, based on the hwc->period_left value.
1101 * To be called with the event disabled in hw:
1102 */
1103int x86_perf_event_set_period(struct perf_event *event)
1104{
1105        struct hw_perf_event *hwc = &event->hw;
1106        s64 left = local64_read(&hwc->period_left);
1107        s64 period = hwc->sample_period;
1108        int ret = 0, idx = hwc->idx;
1109
1110        if (idx == INTEL_PMC_IDX_FIXED_BTS)
1111                return 0;
1112
1113        /*
1114         * If we are way outside a reasonable range then just skip forward:
1115         */
1116        if (unlikely(left <= -period)) {
1117                left = period;
1118                local64_set(&hwc->period_left, left);
1119                hwc->last_period = period;
1120                ret = 1;
1121        }
1122
1123        if (unlikely(left <= 0)) {
1124                left += period;
1125                local64_set(&hwc->period_left, left);
1126                hwc->last_period = period;
1127                ret = 1;
1128        }
1129        /*
 1130         * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1131         */
1132        if (unlikely(left < 2))
1133                left = 2;
1134
1135        if (left > x86_pmu.max_period)
1136                left = x86_pmu.max_period;
1137
1138        if (x86_pmu.limit_period)
1139                left = x86_pmu.limit_period(event, left);
1140
1141        per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1142
1143        if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
1144            local64_read(&hwc->prev_count) != (u64)-left) {
1145                /*
1146                 * The hw event starts counting from this event offset,
 1147                 * mark it to be able to extract future deltas:
1148                 */
1149                local64_set(&hwc->prev_count, (u64)-left);
1150
1151                wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
1152        }
1153
1154        /*
 1155         * Due to an erratum on certain CPUs we need
 1156         * a second write to be sure the register
 1157         * is updated properly.
1158         */
1159        if (x86_pmu.perfctr_second_write) {
1160                wrmsrl(hwc->event_base,
1161                        (u64)(-left) & x86_pmu.cntval_mask);
1162        }
1163
1164        perf_event_update_userpage(event);
1165
1166        return ret;
1167}
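
/*
 * Example, assuming a 48-bit counter and no limit_period quirk: for a
 * sample_period of 100000, the counter is programmed to
 * (u64)(-100000) & cntval_mask = 0xfffffffe7960, so it overflows and raises
 * a PMI after exactly 100000 increments.
 */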
1168
1169void x86_pmu_enable_event(struct perf_event *event)
1170{
1171        if (__this_cpu_read(cpu_hw_events.enabled))
1172                __x86_pmu_enable_event(&event->hw,
1173                                       ARCH_PERFMON_EVENTSEL_ENABLE);
1174}
1175
1176/*
1177 * Add a single event to the PMU.
1178 *
1179 * The event is added to the group of enabled events
 1180 * but only if it can be scheduled with existing events.
1181 */
1182static int x86_pmu_add(struct perf_event *event, int flags)
1183{
1184        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1185        struct hw_perf_event *hwc;
1186        int assign[X86_PMC_IDX_MAX];
1187        int n, n0, ret;
1188
1189        hwc = &event->hw;
1190
1191        n0 = cpuc->n_events;
1192        ret = n = collect_events(cpuc, event, false);
1193        if (ret < 0)
1194                goto out;
1195
1196        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1197        if (!(flags & PERF_EF_START))
1198                hwc->state |= PERF_HES_ARCH;
1199
1200        /*
1201         * If group events scheduling transaction was started,
1202         * skip the schedulability test here, it will be performed
1203         * at commit time (->commit_txn) as a whole.
1204         */
1205        if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1206                goto done_collect;
1207
1208        ret = x86_pmu.schedule_events(cpuc, n, assign);
1209        if (ret)
1210                goto out;
1211        /*
 1212         * Copy the new assignment now that we know it is possible;
 1213         * it will be used by hw_perf_enable().
1214         */
1215        memcpy(cpuc->assign, assign, n*sizeof(int));
1216
1217done_collect:
1218        /*
1219         * Commit the collect_events() state. See x86_pmu_del() and
1220         * x86_pmu_*_txn().
1221         */
1222        cpuc->n_events = n;
1223        cpuc->n_added += n - n0;
1224        cpuc->n_txn += n - n0;
1225
1226        ret = 0;
1227out:
1228        return ret;
1229}
1230
1231static void x86_pmu_start(struct perf_event *event, int flags)
1232{
1233        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1234        int idx = event->hw.idx;
1235
1236        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1237                return;
1238
1239        if (WARN_ON_ONCE(idx == -1))
1240                return;
1241
1242        if (flags & PERF_EF_RELOAD) {
1243                WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1244                x86_perf_event_set_period(event);
1245        }
1246
1247        event->hw.state = 0;
1248
1249        cpuc->events[idx] = event;
1250        __set_bit(idx, cpuc->active_mask);
1251        __set_bit(idx, cpuc->running);
1252        x86_pmu.enable(event);
1253        perf_event_update_userpage(event);
1254}
1255
1256void perf_event_print_debug(void)
1257{
1258        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1259        u64 pebs, debugctl;
1260        struct cpu_hw_events *cpuc;
1261        unsigned long flags;
1262        int cpu, idx;
1263
1264        if (!x86_pmu.num_counters)
1265                return;
1266
1267        local_irq_save(flags);
1268
1269        cpu = smp_processor_id();
1270        cpuc = &per_cpu(cpu_hw_events, cpu);
1271
1272        if (x86_pmu.version >= 2) {
1273                rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
1274                rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1275                rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1276                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1277
1278                pr_info("\n");
1279                pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
1280                pr_info("CPU#%d: status:     %016llx\n", cpu, status);
1281                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
1282                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
1283                if (x86_pmu.pebs_constraints) {
1284                        rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1285                        pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
1286                }
1287                if (x86_pmu.lbr_nr) {
1288                        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1289                        pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
1290                }
1291        }
1292        pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1293
1294        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1295                rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1296                rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1297
1298                prev_left = per_cpu(pmc_prev_left[idx], cpu);
1299
1300                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
1301                        cpu, idx, pmc_ctrl);
1302                pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
1303                        cpu, idx, pmc_count);
1304                pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
1305                        cpu, idx, prev_left);
1306        }
1307        for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1308                rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1309
1310                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
1311                        cpu, idx, pmc_count);
1312        }
1313        local_irq_restore(flags);
1314}
1315
1316void x86_pmu_stop(struct perf_event *event, int flags)
1317{
1318        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1319        struct hw_perf_event *hwc = &event->hw;
1320
1321        if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1322                x86_pmu.disable(event);
1323                cpuc->events[hwc->idx] = NULL;
1324                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1325                hwc->state |= PERF_HES_STOPPED;
1326        }
1327
1328        if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1329                /*
 1330                 * Drain the remaining delta count out of an event
1331                 * that we are disabling:
1332                 */
1333                x86_perf_event_update(event);
1334                hwc->state |= PERF_HES_UPTODATE;
1335        }
1336}
1337
1338static void x86_pmu_del(struct perf_event *event, int flags)
1339{
1340        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1341        int i;
1342
1343        /*
1344         * event is descheduled
1345         */
1346        event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
1347
1348        /*
1349         * If we're called during a txn, we don't need to do anything.
1350         * The events never got scheduled and ->cancel_txn will truncate
1351         * the event_list.
1352         *
1353         * XXX assumes any ->del() called during a TXN will only be on
1354         * an event added during that same TXN.
1355         */
1356        if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
1357                return;
1358
1359        /*
1360         * Not a TXN, therefore cleanup properly.
1361         */
1362        x86_pmu_stop(event, PERF_EF_UPDATE);
1363
1364        for (i = 0; i < cpuc->n_events; i++) {
1365                if (event == cpuc->event_list[i])
1366                        break;
1367        }
1368
1369        if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
1370                return;
1371
 1372        /* If we have a newly added event, make sure to decrease n_added. */
1373        if (i >= cpuc->n_events - cpuc->n_added)
1374                --cpuc->n_added;
1375
1376        if (x86_pmu.put_event_constraints)
1377                x86_pmu.put_event_constraints(cpuc, event);
1378
1379        /* Delete the array entry. */
1380        while (++i < cpuc->n_events) {
1381                cpuc->event_list[i-1] = cpuc->event_list[i];
1382                cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
1383        }
1384        --cpuc->n_events;
1385
1386        perf_event_update_userpage(event);
1387}
1388
1389int x86_pmu_handle_irq(struct pt_regs *regs)
1390{
1391        struct perf_sample_data data;
1392        struct cpu_hw_events *cpuc;
1393        struct perf_event *event;
1394        int idx, handled = 0;
1395        u64 val;
1396
1397        cpuc = this_cpu_ptr(&cpu_hw_events);
1398
1399        /*
1400         * Some chipsets need to unmask the LVTPC in a particular spot
1401         * inside the nmi handler.  As a result, the unmasking was pushed
1402         * into all the nmi handlers.
1403         *
1404         * This generic handler doesn't seem to have any issues where the
1405         * unmasking occurs so it was left at the top.
1406         */
1407        apic_write(APIC_LVTPC, APIC_DM_NMI);
1408
1409        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1410                if (!test_bit(idx, cpuc->active_mask)) {
1411                        /*
 1412                        * Though we deactivated the counter, some CPUs
 1413                         * might still deliver spurious interrupts that are
 1414                         * still in flight. Catch them:
1415                         */
1416                        if (__test_and_clear_bit(idx, cpuc->running))
1417                                handled++;
1418                        continue;
1419                }
1420
1421                event = cpuc->events[idx];
1422
1423                val = x86_perf_event_update(event);
1424                if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1425                        continue;
1426
1427                /*
1428                 * event overflow
1429                 */
1430                handled++;
1431                perf_sample_data_init(&data, 0, event->hw.last_period);
1432
1433                if (!x86_perf_event_set_period(event))
1434                        continue;
1435
1436                if (perf_event_overflow(event, &data, regs))
1437                        x86_pmu_stop(event, 0);
1438        }
1439
1440        if (handled)
1441                inc_irq_stat(apic_perf_irqs);
1442
1443        return handled;
1444}
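
/*
 * The overflow test above relies on the counters being programmed to the
 * negated period: as long as the top counter bit (bit 47 on a 48-bit PMU) is
 * still set, the counter has not yet crossed zero, so the PMI did not come
 * from this event; once it wraps, the bit clears and the event is handled.
 */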
1445
1446void perf_events_lapic_init(void)
1447{
1448        if (!x86_pmu.apic || !x86_pmu_initialized())
1449                return;
1450
1451        /*
1452         * Always use NMI for PMU
1453         */
1454        apic_write(APIC_LVTPC, APIC_DM_NMI);
1455}
1456
1457static int
1458perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
1459{
1460        u64 start_clock;
1461        u64 finish_clock;
1462        int ret;
1463
1464        /*
1465         * All PMUs/events that share this PMI handler should make sure to
1466         * increment active_events for their events.
1467         */
1468        if (!atomic_read(&active_events))
1469                return NMI_DONE;
1470
1471        start_clock = sched_clock();
1472        ret = x86_pmu.handle_irq(regs);
1473        finish_clock = sched_clock();
1474
1475        perf_sample_event_took(finish_clock - start_clock);
1476
1477        return ret;
1478}
1479NOKPROBE_SYMBOL(perf_event_nmi_handler);
1480
1481struct event_constraint emptyconstraint;
1482struct event_constraint unconstrained;
1483
1484static int x86_pmu_prepare_cpu(unsigned int cpu)
1485{
1486        struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1487        int i;
1488
1489        for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
1490                cpuc->kfree_on_online[i] = NULL;
1491        if (x86_pmu.cpu_prepare)
1492                return x86_pmu.cpu_prepare(cpu);
1493        return 0;
1494}
1495
1496static int x86_pmu_dead_cpu(unsigned int cpu)
1497{
1498        if (x86_pmu.cpu_dead)
1499                x86_pmu.cpu_dead(cpu);
1500        return 0;
1501}
1502
1503static int x86_pmu_online_cpu(unsigned int cpu)
1504{
1505        struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1506        int i;
1507
1508        for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
1509                kfree(cpuc->kfree_on_online[i]);
1510                cpuc->kfree_on_online[i] = NULL;
1511        }
1512        return 0;
1513}
1514
1515static int x86_pmu_starting_cpu(unsigned int cpu)
1516{
1517        if (x86_pmu.cpu_starting)
1518                x86_pmu.cpu_starting(cpu);
1519        return 0;
1520}
1521
1522static int x86_pmu_dying_cpu(unsigned int cpu)
1523{
1524        if (x86_pmu.cpu_dying)
1525                x86_pmu.cpu_dying(cpu);
1526        return 0;
1527}
1528
1529static void __init pmu_check_apic(void)
1530{
1531        if (boot_cpu_has(X86_FEATURE_APIC))
1532                return;
1533
1534        x86_pmu.apic = 0;
1535        pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1536        pr_info("no hardware sampling interrupt available.\n");
1537
1538        /*
1539         * If we have a PMU initialized but no APIC
1540         * interrupts, we cannot sample hardware
1541         * events (user-space has to fall back and
1542         * sample via a hrtimer based software event):
1543         */
1544        pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
1545
1546}
1547
1548static struct attribute_group x86_pmu_format_group = {
1549        .name = "format",
1550        .attrs = NULL,
1551};
1552
1553/*
1554 * Remove all undefined events (x86_pmu.event_map(id) == 0)
1555 * out of events_attr attributes.
1556 */
1557static void __init filter_events(struct attribute **attrs)
1558{
1559        struct device_attribute *d;
1560        struct perf_pmu_events_attr *pmu_attr;
1561        int offset = 0;
1562        int i, j;
1563
1564        for (i = 0; attrs[i]; i++) {
1565                d = (struct device_attribute *)attrs[i];
1566                pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
1567                /* str trumps id */
1568                if (pmu_attr->event_str)
1569                        continue;
1570                if (x86_pmu.event_map(i + offset))
1571                        continue;
1572
1573                for (j = i; attrs[j]; j++)
1574                        attrs[j] = attrs[j + 1];
1575
1576                /* Check the shifted attr. */
1577                i--;
1578
1579                /*
1580                 * event_map() is index based, the attrs array is organized
1581                 * by increasing event index. If we shift the events, then
1582                 * we need to compensate for the event_map(), otherwise
1583                 * we are looking up the wrong event in the map
1584                 */
1585                offset++;
1586        }
1587}
1588
1589/* Merge two pointer arrays */
1590__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
1591{
1592        struct attribute **new;
1593        int j, i;
1594
1595        for (j = 0; a[j]; j++)
1596                ;
1597        for (i = 0; b[i]; i++)
1598                j++;
1599        j++;
1600
1601        new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
1602        if (!new)
1603                return NULL;
1604
1605        j = 0;
1606        for (i = 0; a[i]; i++)
1607                new[j++] = a[i];
1608        for (i = 0; b[i]; i++)
1609                new[j++] = b[i];
1610        new[j] = NULL;
1611
1612        return new;
1613}
1614
1615ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
1616{
1617        struct perf_pmu_events_attr *pmu_attr = \
1618                container_of(attr, struct perf_pmu_events_attr, attr);
1619        u64 config = x86_pmu.event_map(pmu_attr->id);
1620
1621        /* string trumps id */
1622        if (pmu_attr->event_str)
1623                return sprintf(page, "%s", pmu_attr->event_str);
1624
1625        return x86_pmu.events_sysfs_show(page, config);
1626}
1627EXPORT_SYMBOL_GPL(events_sysfs_show);
1628
1629ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
1630                          char *page)
1631{
1632        struct perf_pmu_events_ht_attr *pmu_attr =
1633                container_of(attr, struct perf_pmu_events_ht_attr, attr);
1634
1635        /*
1636         * Report conditional events depending on Hyper-Threading.
1637         *
1638         * This is overly conservative as usually the HT special
1639         * handling is not needed if the other CPU thread is idle.
1640         *
1641         * Note this does not (and cannot) handle the case when thread
1642         * siblings are invisible, for example with virtualization
1643         * if they are owned by some other guest.  The user tool
1644         * has to re-read when a thread sibling gets onlined later.
1645         */
1646        return sprintf(page, "%s",
1647                        topology_max_smt_threads() > 1 ?
1648                        pmu_attr->event_str_ht :
1649                        pmu_attr->event_str_noht);
1650}
1651
1652EVENT_ATTR(cpu-cycles,                  CPU_CYCLES              );
1653EVENT_ATTR(instructions,                INSTRUCTIONS            );
1654EVENT_ATTR(cache-references,            CACHE_REFERENCES        );
1655EVENT_ATTR(cache-misses,                CACHE_MISSES            );
1656EVENT_ATTR(branch-instructions,         BRANCH_INSTRUCTIONS     );
1657EVENT_ATTR(branch-misses,               BRANCH_MISSES           );
1658EVENT_ATTR(bus-cycles,                  BUS_CYCLES              );
1659EVENT_ATTR(stalled-cycles-frontend,     STALLED_CYCLES_FRONTEND );
1660EVENT_ATTR(stalled-cycles-backend,      STALLED_CYCLES_BACKEND  );
1661EVENT_ATTR(ref-cycles,                  REF_CPU_CYCLES          );
1662
1663static struct attribute *empty_attrs;
1664
1665static struct attribute *events_attr[] = {
1666        EVENT_PTR(CPU_CYCLES),
1667        EVENT_PTR(INSTRUCTIONS),
1668        EVENT_PTR(CACHE_REFERENCES),
1669        EVENT_PTR(CACHE_MISSES),
1670        EVENT_PTR(BRANCH_INSTRUCTIONS),
1671        EVENT_PTR(BRANCH_MISSES),
1672        EVENT_PTR(BUS_CYCLES),
1673        EVENT_PTR(STALLED_CYCLES_FRONTEND),
1674        EVENT_PTR(STALLED_CYCLES_BACKEND),
1675        EVENT_PTR(REF_CPU_CYCLES),
1676        NULL,
1677};
1678
1679static struct attribute_group x86_pmu_events_group = {
1680        .name = "events",
1681        .attrs = events_attr,
1682};
1683
1684ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1685{
1686        u64 umask  = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1687        u64 cmask  = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1688        bool edge  = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1689        bool pc    = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1690        bool any   = (config & ARCH_PERFMON_EVENTSEL_ANY);
1691        bool inv   = (config & ARCH_PERFMON_EVENTSEL_INV);
1692        ssize_t ret;
1693
1694        /*
1695         * We have a whole page to spend and only a little data to
1696         * write, so we can safely use sprintf().
1697         */
1698        ret = sprintf(page, "event=0x%02llx", event);
1699
1700        if (umask)
1701                ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1702
1703        if (edge)
1704                ret += sprintf(page + ret, ",edge");
1705
1706        if (pc)
1707                ret += sprintf(page + ret, ",pc");
1708
1709        if (any)
1710                ret += sprintf(page + ret, ",any");
1711
1712        if (inv)
1713                ret += sprintf(page + ret, ",inv");
1714
1715        if (cmask)
1716                ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
1717
1718        ret += sprintf(page + ret, "\n");
1719
1720        return ret;
1721}
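
/*
 * Illustrative output of the helper above (not taken from real
 * hardware): for event == 0x3c with a config carrying umask 0x01 and
 * cmask 0x02, it would emit
 *
 *     event=0x3c,umask=0x01,cmask=0x02
 *
 * which is the raw-event syntax that perf tooling parses back.
 */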
1722
1723static int __init init_hw_perf_events(void)
1724{
1725        struct x86_pmu_quirk *quirk;
1726        int err;
1727
1728        pr_info("Performance Events: ");
1729
1730        switch (boot_cpu_data.x86_vendor) {
1731        case X86_VENDOR_INTEL:
1732                err = intel_pmu_init();
1733                break;
1734        case X86_VENDOR_AMD:
1735                err = amd_pmu_init();
1736                break;
1737        default:
1738                err = -ENOTSUPP;
1739        }
1740        if (err != 0) {
1741                pr_cont("no PMU driver, software events only.\n");
1742                return 0;
1743        }
1744
1745        pmu_check_apic();
1746
1747        /* sanity check that the hardware exists or is emulated */
1748        if (!check_hw_exists())
1749                return 0;
1750
1751        pr_cont("%s PMU driver.\n", x86_pmu.name);
1752
1753        x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
1754
1755        for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1756                quirk->func();
1757
1758        if (!x86_pmu.intel_ctrl)
1759                x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1760
1761        perf_events_lapic_init();
1762        register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
1763
1764        unconstrained = (struct event_constraint)
1765                __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1766                                   0, x86_pmu.num_counters, 0, 0);
1767
1768        x86_pmu_format_group.attrs = x86_pmu.format_attrs;
1769
1770        if (x86_pmu.event_attrs)
1771                x86_pmu_events_group.attrs = x86_pmu.event_attrs;
1772
1773        if (!x86_pmu.events_sysfs_show)
1774                x86_pmu_events_group.attrs = &empty_attrs;
1775        else
1776                filter_events(x86_pmu_events_group.attrs);
1777
1778        if (x86_pmu.cpu_events) {
1779                struct attribute **tmp;
1780
1781                tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
1782                if (!WARN_ON(!tmp))
1783                        x86_pmu_events_group.attrs = tmp;
1784        }
1785
1786        pr_info("... version:                %d\n",     x86_pmu.version);
1787        pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
1788        pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
1789        pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
1790        pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
1791        pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
1792        pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
1793
1794        /*
1795         * Install callbacks. Core will call them for each online
1796         * cpu.
1797         */
1798        err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "PERF_X86_PREPARE",
1799                                x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
1800        if (err)
1801                return err;
1802
1803        err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
1804                                "AP_PERF_X86_STARTING", x86_pmu_starting_cpu,
1805                                x86_pmu_dying_cpu);
1806        if (err)
1807                goto out;
1808
1809        err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "AP_PERF_X86_ONLINE",
1810                                x86_pmu_online_cpu, NULL);
1811        if (err)
1812                goto out1;
1813
1814        err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1815        if (err)
1816                goto out2;
1817
1818        return 0;
1819
1820out2:
1821        cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
1822out1:
1823        cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
1824out:
1825        cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
1826        return err;
1827}
1828early_initcall(init_hw_perf_events);
1829
1830static inline void x86_pmu_read(struct perf_event *event)
1831{
1832        x86_perf_event_update(event);
1833}
1834
1835/*
1836 * Start group events scheduling transaction
1837 * Set the flag to make pmu::enable() not perform the
1838 * schedulability test; it will be performed at commit time.
1839 *
1840 * We only support PERF_PMU_TXN_ADD transactions. Save the
1841 * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
1842 * transactions.
1843 */
1844static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
1845{
1846        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1847
1848        WARN_ON_ONCE(cpuc->txn_flags);          /* txn already in flight */
1849
1850        cpuc->txn_flags = txn_flags;
1851        if (txn_flags & ~PERF_PMU_TXN_ADD)
1852                return;
1853
1854        perf_pmu_disable(pmu);
1855        __this_cpu_write(cpu_hw_events.n_txn, 0);
1856}
1857
1858/*
1859 * Stop group events scheduling transaction
1860 * Clear the flag and pmu::enable() will perform the
1861 * schedulability test.
1862 */
1863static void x86_pmu_cancel_txn(struct pmu *pmu)
1864{
1865        unsigned int txn_flags;
1866        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1867
1868        WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
1869
1870        txn_flags = cpuc->txn_flags;
1871        cpuc->txn_flags = 0;
1872        if (txn_flags & ~PERF_PMU_TXN_ADD)
1873                return;
1874
1875        /*
1876         * Truncate collected array by the number of events added in this
1877         * transaction. See x86_pmu_add() and x86_pmu_*_txn().
1878         */
1879        __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1880        __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1881        perf_pmu_enable(pmu);
1882}
1883
1884/*
1885 * Commit group events scheduling transaction
1886 * Perform the group schedulability test as a whole
1887 * Return 0 on success.
1888 *
1889 * Does not cancel the transaction on failure; expects the caller to do this.
1890 */
1891static int x86_pmu_commit_txn(struct pmu *pmu)
1892{
1893        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
1894        int assign[X86_PMC_IDX_MAX];
1895        int n, ret;
1896
1897        WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
1898
1899        if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
1900                cpuc->txn_flags = 0;
1901                return 0;
1902        }
1903
1904        n = cpuc->n_events;
1905
1906        if (!x86_pmu_initialized())
1907                return -EAGAIN;
1908
1909        ret = x86_pmu.schedule_events(cpuc, n, assign);
1910        if (ret)
1911                return ret;
1912
1913        /*
1914         * Copy the new assignment now that we know it is possible;
1915         * it will be used by hw_perf_enable().
1916         */
1917        memcpy(cpuc->assign, assign, n*sizeof(int));
1918
1919        cpuc->txn_flags = 0;
1920        perf_pmu_enable(pmu);
1921        return 0;
1922}
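
/*
 * Rough sketch of how the perf core drives the three transaction hooks
 * above when scheduling an event group (the real call sites live in
 * kernel/events/core.c, e.g. group_sched_in(); the pseudo-loop below is
 * a stand-in, not an actual helper):
 *
 *     pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 *     for each event in the group:
 *             if (pmu->add(event, PERF_EF_START) fails)
 *                     goto cancel;
 *     if (pmu->commit_txn(pmu) == 0)
 *             return 0;               (whole group accepted)
 * cancel:
 *     pmu->cancel_txn(pmu);           (roll back the partial add)
 */
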
1923/*
1924 * a fake_cpuc is used to validate event groups. Due to
1925 * the extra reg logic, we need to also allocate a fake
1926 * per_core and per_cpu structure. Otherwise, group events
1927 * using extra reg may conflict without the kernel being
1928 * able to catch this when the last event gets added to
1929 * the group.
1930 */
1931static void free_fake_cpuc(struct cpu_hw_events *cpuc)
1932{
1933        kfree(cpuc->shared_regs);
1934        kfree(cpuc);
1935}
1936
1937static struct cpu_hw_events *allocate_fake_cpuc(void)
1938{
1939        struct cpu_hw_events *cpuc;
1940        int cpu = raw_smp_processor_id();
1941
1942        cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
1943        if (!cpuc)
1944                return ERR_PTR(-ENOMEM);
1945
1946        /* only needed if we have extra_regs */
1947        if (x86_pmu.extra_regs) {
1948                cpuc->shared_regs = allocate_shared_regs(cpu);
1949                if (!cpuc->shared_regs)
1950                        goto error;
1951        }
1952        cpuc->is_fake = 1;
1953        return cpuc;
1954error:
1955        free_fake_cpuc(cpuc);
1956        return ERR_PTR(-ENOMEM);
1957}
1958
1959/*
1960 * validate that we can schedule this event
1961 */
1962static int validate_event(struct perf_event *event)
1963{
1964        struct cpu_hw_events *fake_cpuc;
1965        struct event_constraint *c;
1966        int ret = 0;
1967
1968        fake_cpuc = allocate_fake_cpuc();
1969        if (IS_ERR(fake_cpuc))
1970                return PTR_ERR(fake_cpuc);
1971
1972        c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
1973
1974        if (!c || !c->weight)
1975                ret = -EINVAL;
1976
1977        if (x86_pmu.put_event_constraints)
1978                x86_pmu.put_event_constraints(fake_cpuc, event);
1979
1980        free_fake_cpuc(fake_cpuc);
1981
1982        return ret;
1983}
1984
1985/*
1986 * validate a single event group
1987 *
1988 * validation includes:
1989 *      - check events are compatible with each other
1990 *      - events do not compete for the same counter
1991 *      - number of events <= number of counters
1992 *
1993 * validation ensures the group can be loaded onto the
1994 * PMU if it was the only group available.
1995 */
1996static int validate_group(struct perf_event *event)
1997{
1998        struct perf_event *leader = event->group_leader;
1999        struct cpu_hw_events *fake_cpuc;
2000        int ret = -EINVAL, n;
2001
2002        fake_cpuc = allocate_fake_cpuc();
2003        if (IS_ERR(fake_cpuc))
2004                return PTR_ERR(fake_cpuc);
2005        /*
2006         * The event is not yet connected with its
2007         * siblings, therefore we must first collect
2008         * existing siblings, then add the new event
2009         * before we can simulate the scheduling.
2010         */
2011        n = collect_events(fake_cpuc, leader, true);
2012        if (n < 0)
2013                goto out;
2014
2015        fake_cpuc->n_events = n;
2016        n = collect_events(fake_cpuc, event, false);
2017        if (n < 0)
2018                goto out;
2019
2020        fake_cpuc->n_events = n;
2021
2022        ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
2023
2024out:
2025        free_fake_cpuc(fake_cpuc);
2026        return ret;
2027}
2028
2029static int x86_pmu_event_init(struct perf_event *event)
2030{
2031        struct pmu *tmp;
2032        int err;
2033
2034        switch (event->attr.type) {
2035        case PERF_TYPE_RAW:
2036        case PERF_TYPE_HARDWARE:
2037        case PERF_TYPE_HW_CACHE:
2038                break;
2039
2040        default:
2041                return -ENOENT;
2042        }
2043
2044        err = __x86_pmu_event_init(event);
2045        if (!err) {
2046                /*
2047                 * we temporarily connect event to its pmu
2048                 * such that validate_group() can classify
2049                 * it as an x86 event using is_x86_event()
2050                 */
2051                tmp = event->pmu;
2052                event->pmu = &pmu;
2053
2054                if (event->group_leader != event)
2055                        err = validate_group(event);
2056                else
2057                        err = validate_event(event);
2058
2059                event->pmu = tmp;
2060        }
2061        if (err) {
2062                if (event->destroy)
2063                        event->destroy(event);
2064        }
2065
2066        if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
2067                event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
2068
2069        return err;
2070}
2071
2072static void refresh_pce(void *ignored)
2073{
2074        if (current->mm)
2075                load_mm_cr4(current->mm);
2076}
2077
2078static void x86_pmu_event_mapped(struct perf_event *event)
2079{
2080        if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2081                return;
2082
2083        if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
2084                on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
2085}
2086
2087static void x86_pmu_event_unmapped(struct perf_event *event)
2088{
2089        if (!current->mm)
2090                return;
2091
2092        if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2093                return;
2094
2095        if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
2096                on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
2097}
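
/*
 * The mapped/unmapped hooks above keep a per-mm count of tasks with an
 * RDPMC-capable event mmap()ed; refresh_pce() then reloads CR4 on every
 * CPU running that mm, so the CR4.PCE bit gating userspace RDPMC tracks
 * perf_rdpmc_allowed (see load_mm_cr4()).
 */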
2098
2099static int x86_pmu_event_idx(struct perf_event *event)
2100{
2101        int idx = event->hw.idx;
2102
2103        if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
2104                return 0;
2105
2106        if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
2107                idx -= INTEL_PMC_IDX_FIXED;
2108                idx |= 1 << 30;
2109        }
2110
2111        return idx + 1;
2112}
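
/*
 * Userspace side, sketched after the self-monitoring example documented
 * in include/uapi/linux/perf_event.h: a non-zero index read from the
 * mmap()ed perf_event_mmap_page is decremented by one before being fed
 * to RDPMC, and the 1 << 30 bit set above selects the fixed-counter
 * range in the RDPMC ECX encoding:
 *
 *     idx = pc->index;
 *     if (pc->cap_user_rdpmc && idx)
 *             count = pc->offset + rdpmc(idx - 1);
 */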
2113
2114static ssize_t get_attr_rdpmc(struct device *cdev,
2115                              struct device_attribute *attr,
2116                              char *buf)
2117{
2118        return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
2119}
2120
2121static ssize_t set_attr_rdpmc(struct device *cdev,
2122                              struct device_attribute *attr,
2123                              const char *buf, size_t count)
2124{
2125        unsigned long val;
2126        ssize_t ret;
2127
2128        ret = kstrtoul(buf, 0, &val);
2129        if (ret)
2130                return ret;
2131
2132        if (val > 2)
2133                return -EINVAL;
2134
2135        if (x86_pmu.attr_rdpmc_broken)
2136                return -ENOTSUPP;
2137
2138        if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
2139                /*
2140                 * Changing into or out of always available, aka
2141                 * perf-event-bypassing mode.  This path is extremely slow,
2142                 * but only root can trigger it, so it's okay.
2143                 */
2144                if (val == 2)
2145                        static_key_slow_inc(&rdpmc_always_available);
2146                else
2147                        static_key_slow_dec(&rdpmc_always_available);
2148                on_each_cpu(refresh_pce, NULL, 1);
2149        }
2150
2151        x86_pmu.attr_rdpmc = val;
2152
2153        return count;
2154}
2155
2156static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
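
/*
 * This attribute shows up as /sys/devices/cpu/rdpmc.  Roughly: 0
 * forbids userspace RDPMC, 1 (the default set in init_hw_perf_events())
 * allows it only for tasks with an RDPMC-capable event mmap()ed, and 2
 * enables it unconditionally via the rdpmc_always_available key, e.g.:
 *
 *     # echo 2 > /sys/devices/cpu/rdpmc
 */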
2157
2158static struct attribute *x86_pmu_attrs[] = {
2159        &dev_attr_rdpmc.attr,
2160        NULL,
2161};
2162
2163static struct attribute_group x86_pmu_attr_group = {
2164        .attrs = x86_pmu_attrs,
2165};
2166
2167static const struct attribute_group *x86_pmu_attr_groups[] = {
2168        &x86_pmu_attr_group,
2169        &x86_pmu_format_group,
2170        &x86_pmu_events_group,
2171        NULL,
2172};
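
/*
 * Together these groups populate the PMU's sysfs directory (typically
 * /sys/devices/cpu/) with the rdpmc knob plus the format/ and events/
 * subdirectories that perf tooling enumerates.
 */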
2173
2174static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
2175{
2176        if (x86_pmu.sched_task)
2177                x86_pmu.sched_task(ctx, sched_in);
2178}
2179
2180void perf_check_microcode(void)
2181{
2182        if (x86_pmu.check_microcode)
2183                x86_pmu.check_microcode();
2184}
2185EXPORT_SYMBOL_GPL(perf_check_microcode);
2186
2187static struct pmu pmu = {
2188        .pmu_enable             = x86_pmu_enable,
2189        .pmu_disable            = x86_pmu_disable,
2190
2191        .attr_groups            = x86_pmu_attr_groups,
2192
2193        .event_init             = x86_pmu_event_init,
2194
2195        .event_mapped           = x86_pmu_event_mapped,
2196        .event_unmapped         = x86_pmu_event_unmapped,
2197
2198        .add                    = x86_pmu_add,
2199        .del                    = x86_pmu_del,
2200        .start                  = x86_pmu_start,
2201        .stop                   = x86_pmu_stop,
2202        .read                   = x86_pmu_read,
2203
2204        .start_txn              = x86_pmu_start_txn,
2205        .cancel_txn             = x86_pmu_cancel_txn,
2206        .commit_txn             = x86_pmu_commit_txn,
2207
2208        .event_idx              = x86_pmu_event_idx,
2209        .sched_task             = x86_pmu_sched_task,
2210        .task_ctx_size          = sizeof(struct x86_perf_task_context),
2211};
2212
2213void arch_perf_update_userpage(struct perf_event *event,
2214                               struct perf_event_mmap_page *userpg, u64 now)
2215{
2216        struct cyc2ns_data *data;
2217
2218        userpg->cap_user_time = 0;
2219        userpg->cap_user_time_zero = 0;
2220        userpg->cap_user_rdpmc =
2221                !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
2222        userpg->pmc_width = x86_pmu.cntval_bits;
2223
2224        if (!sched_clock_stable())
2225                return;
2226
2227        data = cyc2ns_read_begin();
2228
2229        /*
2230         * Internal timekeeping for enabled/running/stopped times
2231         * is always in the local_clock domain.
2232         */
2233        userpg->cap_user_time = 1;
2234        userpg->time_mult = data->cyc2ns_mul;
2235        userpg->time_shift = data->cyc2ns_shift;
2236        userpg->time_offset = data->cyc2ns_offset - now;
2237
2238        /*
2239         * cap_user_time_zero doesn't make sense when we're using a different
2240         * time base for the records.
2241         */
2242        if (!event->attr.use_clockid) {
2243                userpg->cap_user_time_zero = 1;
2244                userpg->time_zero = data->cyc2ns_offset;
2245        }
2246
2247        cyc2ns_read_end(data);
2248}
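
/*
 * Sketch of how userspace consumes the time fields filled in above,
 * following the documentation next to struct perf_event_mmap_page (the
 * real sequence also has to retry around pc->lock):
 *
 *     cyc   = rdtsc();
 *     quot  = cyc >> pc->time_shift;
 *     rem   = cyc & (((u64)1 << pc->time_shift) - 1);
 *     delta = pc->time_offset + quot * pc->time_mult +
 *             ((rem * pc->time_mult) >> pc->time_shift);
 */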
2249
2250/*
2251 * callchain support
2252 */
2253
2254static int backtrace_stack(void *data, char *name)
2255{
2256        return 0;
2257}
2258
2259static int backtrace_address(void *data, unsigned long addr, int reliable)
2260{
2261        struct perf_callchain_entry_ctx *entry = data;
2262
2263        return perf_callchain_store(entry, addr);
2264}
2265
2266static const struct stacktrace_ops backtrace_ops = {
2267        .stack                  = backtrace_stack,
2268        .address                = backtrace_address,
2269        .walk_stack             = print_context_stack_bp,
2270};
2271
2272void
2273perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2274{
2275        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
2276                /* TODO: We don't support guest OS callchains yet */
2277                return;
2278        }
2279
2280        perf_callchain_store(entry, regs->ip);
2281
2282        dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
2283}
2284
2285static inline int
2286valid_user_frame(const void __user *fp, unsigned long size)
2287{
2288        return (__range_not_ok(fp, size, TASK_SIZE) == 0);
2289}
2290
2291static unsigned long get_segment_base(unsigned int segment)
2292{
2293        struct desc_struct *desc;
2294        int idx = segment >> 3;
2295
2296        if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2297#ifdef CONFIG_MODIFY_LDT_SYSCALL
2298                struct ldt_struct *ldt;
2299
2300                if (idx >= LDT_ENTRIES)
2301                        return 0;
2302
2303                /* IRQs are off, so this synchronizes with smp_store_release */
2304                ldt = lockless_dereference(current->active_mm->context.ldt);
2305                if (!ldt || idx >= ldt->size)
2306                        return 0;
2307
2308                desc = &ldt->entries[idx];
2309#else
2310                return 0;
2311#endif
2312        } else {
2313                if (idx >= GDT_ENTRIES)
2314                        return 0;
2315
2316                desc = raw_cpu_ptr(gdt_page.gdt) + idx;
2317        }
2318
2319        return get_desc_base(desc);
2320}
2321
2322#ifdef CONFIG_IA32_EMULATION
2323
2324#include <asm/compat.h>
2325
2326static inline int
2327perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2328{
2329        /* 32-bit process in 64-bit kernel. */
2330        unsigned long ss_base, cs_base;
2331        struct stack_frame_ia32 frame;
2332        const void __user *fp;
2333
2334        if (!test_thread_flag(TIF_IA32))
2335                return 0;
2336
2337        cs_base = get_segment_base(regs->cs);
2338        ss_base = get_segment_base(regs->ss);
2339
2340        fp = compat_ptr(ss_base + regs->bp);
2341        pagefault_disable();
2342        while (entry->nr < entry->max_stack) {
2343                unsigned long bytes;
2344                frame.next_frame     = 0;
2345                frame.return_address = 0;
2346
2347                if (!access_ok(VERIFY_READ, fp, 8))
2348                        break;
2349
2350                bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
2351                if (bytes != 0)
2352                        break;
2353                bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
2354                if (bytes != 0)
2355                        break;
2356
2357                if (!valid_user_frame(fp, sizeof(frame)))
2358                        break;
2359
2360                perf_callchain_store(entry, cs_base + frame.return_address);
2361                fp = compat_ptr(ss_base + frame.next_frame);
2362        }
2363        pagefault_enable();
2364        return 1;
2365}
2366#else
2367static inline int
2368perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
2369{
2370        return 0;
2371}
2372#endif
2373
2374void
2375perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
2376{
2377        struct stack_frame frame;
2378        const unsigned long __user *fp;
2379
2380        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
2381                /* TODO: We don't support guest OS callchains yet */
2382                return;
2383        }
2384
2385        /*
2386         * We don't know what to do with VM86 stacks; ignore them for now.
2387         */
2388        if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
2389                return;
2390
2391        fp = (unsigned long __user *)regs->bp;
2392
2393        perf_callchain_store(entry, regs->ip);
2394
2395        if (!current->mm)
2396                return;
2397
2398        if (perf_callchain_user32(regs, entry))
2399                return;
2400
2401        pagefault_disable();
2402        while (entry->nr < entry->max_stack) {
2403                unsigned long bytes;
2404
2405                frame.next_frame     = NULL;
2406                frame.return_address = 0;
2407
2408                if (!access_ok(VERIFY_READ, fp, sizeof(*fp) * 2))
2409                        break;
2410
2411                bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp));
2412                if (bytes != 0)
2413                        break;
2414                bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp));
2415                if (bytes != 0)
2416                        break;
2417
2418                if (!valid_user_frame(fp, sizeof(frame)))
2419                        break;
2420
2421                perf_callchain_store(entry, frame.return_address);
2422                fp = (void __user *)frame.next_frame;
2423        }
2424        pagefault_enable();
2425}
2426
2427/*
2428 * Deal with code segment offsets for the various execution modes:
2429 *
2430 *   VM86 - the good olde 16 bit days, where the linear address is
2431 *          20 bits and we use regs->ip + 0x10 * regs->cs.
2432 *
2433 *   IA32 - Where we need to look at GDT/LDT segment descriptor tables
2434 *          to figure out what the 32bit base address is.
2435 *
2436 *    X32 - has TIF_X32 set, but is running in x86_64
2437 *
2438 * X86_64 - CS,DS,SS,ES are all zero based.
2439 */
2440static unsigned long code_segment_base(struct pt_regs *regs)
2441{
2442        /*
2443         * For IA32 we look at the GDT/LDT segment base to convert the
2444         * effective IP to a linear address.
2445         */
2446
2447#ifdef CONFIG_X86_32
2448        /*
2449         * If we are in VM86 mode, add the segment offset to convert to a
2450         * linear address.
2451         */
2452        if (regs->flags & X86_VM_MASK)
2453                return 0x10 * regs->cs;
2454
2455        if (user_mode(regs) && regs->cs != __USER_CS)
2456                return get_segment_base(regs->cs);
2457#else
2458        if (user_mode(regs) && !user_64bit_mode(regs) &&
2459            regs->cs != __USER32_CS)
2460                return get_segment_base(regs->cs);
2461#endif
2462        return 0;
2463}
2464
2465unsigned long perf_instruction_pointer(struct pt_regs *regs)
2466{
2467        if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
2468                return perf_guest_cbs->get_guest_ip();
2469
2470        return regs->ip + code_segment_base(regs);
2471}
2472
2473unsigned long perf_misc_flags(struct pt_regs *regs)
2474{
2475        int misc = 0;
2476
2477        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
2478                if (perf_guest_cbs->is_user_mode())
2479                        misc |= PERF_RECORD_MISC_GUEST_USER;
2480                else
2481                        misc |= PERF_RECORD_MISC_GUEST_KERNEL;
2482        } else {
2483                if (user_mode(regs))
2484                        misc |= PERF_RECORD_MISC_USER;
2485                else
2486                        misc |= PERF_RECORD_MISC_KERNEL;
2487        }
2488
2489        if (regs->flags & PERF_EFLAGS_EXACT)
2490                misc |= PERF_RECORD_MISC_EXACT_IP;
2491
2492        return misc;
2493}
2494
2495void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
2496{
2497        cap->version            = x86_pmu.version;
2498        cap->num_counters_gp    = x86_pmu.num_counters;
2499        cap->num_counters_fixed = x86_pmu.num_counters_fixed;
2500        cap->bit_width_gp       = x86_pmu.cntval_bits;
2501        cap->bit_width_fixed    = x86_pmu.cntval_bits;
2502        cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
2503        cap->events_mask_len    = x86_pmu.events_mask_len;
2504}
2505EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
2506