linux/arch/x86/kvm/pmu.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However, AMD doesn't support fixed counters;
 * - There are three kinds of index used to access perf counters (PMC):
 *     1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *        has MSR_K7_PERFCTRn.
 *     2. MSR Index (named idx): This is normally used by the RDPMC
 *        instruction. For instance, the AMD RDPMC instruction uses 0000_0003h
 *        in ECX to access C001_0007h (MSR_K7_PERFCTR3). Intel has a similar
 *        mechanism, except that it also supports fixed counters. idx can be
 *        used as an index into both the gp and the fixed counters.
 *     3. Global PMC Index (named pmc): pmc is an index specific to the PMU
 *        code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *        all perf counters (both gp and fixed). The mapping between pmc and
 *        the perf counters is as follows (a simplified decoding is sketched
 *        right after this comment):
 *        * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *                 [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
 *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
 */

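/*
 * Illustrative sketch (not part of this file): how the global PMC index
 * described above is typically decoded into a gp or fixed counter.  The
 * real decoding is vendor specific and lives behind
 * kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(); the Intel-style layout assumed
 * here is the one documented in the NOTE above, and the function name is
 * hypothetical.
 */
static inline struct kvm_pmc *example_pmc_idx_to_pmc(struct kvm_pmu *pmu,
                                                     int pmc_idx)
{
        if (pmc_idx < INTEL_PMC_IDX_FIXED) {
                /* [0 .. nr_arch_gp_counters-1] <=> gp counters */
                if (pmc_idx >= pmu->nr_arch_gp_counters)
                        return NULL;
                return &pmu->gp_counters[pmc_idx];
        }

        /* [INTEL_PMC_IDX_FIXED ..] <=> fixed counters */
        if (pmc_idx - INTEL_PMC_IDX_FIXED >= pmu->nr_arch_fixed_counters)
                return NULL;
        return &pmu->fixed_counters[pmc_idx - INTEL_PMC_IDX_FIXED];
}
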
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
        struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
        struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

        kvm_pmu_deliver_pmi(vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
                              struct perf_sample_data *data,
                              struct pt_regs *regs)
{
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);

        if (!test_and_set_bit(pmc->idx,
                              (unsigned long *)&pmu->reprogram_pmi)) {
                __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
                kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
        }
}

static void kvm_perf_overflow_intr(struct perf_event *perf_event,
                                   struct perf_sample_data *data,
                                   struct pt_regs *regs)
{
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;
        struct kvm_pmu *pmu = pmc_to_pmu(pmc);

        if (!test_and_set_bit(pmc->idx,
                              (unsigned long *)&pmu->reprogram_pmi)) {
                __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
                kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

                /*
                 * Inject a PMI. If the vcpu was in guest mode during the NMI,
                 * the PMI can be injected on guest-mode re-entry. Otherwise
                 * we can't be sure that the vcpu wasn't executing the hlt
                 * instruction at the time of the vmexit and is not going to
                 * re-enter guest mode until woken up. So we should wake it,
                 * but that is impossible from NMI context. Do it from irq
                 * work instead.
                 */
                if (!kvm_is_in_guest())
                        irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
                else
                        kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
        }
}

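/*
 * Program the host perf_event that backs @pmc.  @type/@config follow the
 * perf_event_attr convention (PERF_TYPE_HARDWARE with a generic event id, or
 * PERF_TYPE_RAW with a raw encoding); @exclude_user/@exclude_kernel come from
 * the guest's USR/OS enable bits, @intr requests an interrupt on overflow,
 * and @in_tx/@in_tx_cp carry the Haswell TSX bits (HSW_IN_TX and
 * HSW_IN_TX_CHECKPOINTED).
 */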
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
                                  unsigned config, bool exclude_user,
                                  bool exclude_kernel, bool intr,
                                  bool in_tx, bool in_tx_cp)
{
        struct perf_event *event;
        struct perf_event_attr attr = {
                .type = type,
                .size = sizeof(attr),
                .pinned = true,
                .exclude_idle = true,
                .exclude_host = 1,
                .exclude_user = exclude_user,
                .exclude_kernel = exclude_kernel,
                .config = config,
        };

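        /*
         * The period below is the number of increments left before the
         * guest-visible counter wraps, i.e. 2^width - counter modulo the
         * counter width.  For example, with 48-bit counters
         * (pmc_bitmask() == 0xffffffffffff) and a guest-programmed value of
         * 0xfffffffffff0, the period is 0x10, so the host event overflows
         * exactly when the guest counter would.
         */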
        attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);

        if (in_tx)
                attr.config |= HSW_IN_TX;
        if (in_tx_cp) {
                /*
                 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
                 * period. Just clear the sample period so at least
                 * allocating the counter doesn't fail.
                 */
                attr.sample_period = 0;
                attr.config |= HSW_IN_TX_CHECKPOINTED;
        }

        event = perf_event_create_kernel_counter(&attr, -1, current,
                                                 intr ? kvm_perf_overflow_intr :
                                                 kvm_perf_overflow, pmc);
        if (IS_ERR(event)) {
                pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
                            PTR_ERR(event), pmc->idx);
                return;
        }

        pmc->perf_event = event;
        clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
}

void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
        unsigned config, type = PERF_TYPE_RAW;
        u8 event_select, unit_mask;
        struct kvm *kvm = pmc->vcpu->kvm;
        struct kvm_pmu_event_filter *filter;
        int i;
        bool allow_event = true;

        if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
                printk_once("kvm pmu: pin control bit is ignored\n");

        pmc->eventsel = eventsel;

        pmc_stop_counter(pmc);

        if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
                return;

        filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
        if (filter) {
                for (i = 0; i < filter->nevents; i++)
                        if (filter->events[i] ==
                            (eventsel & AMD64_RAW_EVENT_MASK_NB))
                                break;
                if (filter->action == KVM_PMU_EVENT_ALLOW &&
                    i == filter->nevents)
                        allow_event = false;
                if (filter->action == KVM_PMU_EVENT_DENY &&
                    i < filter->nevents)
                        allow_event = false;
        }
        if (!allow_event)
                return;

        event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
        unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;

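        /*
         * Example (Intel): for unhalted core cycles the guest typically
         * programs eventsel = 0x0053003c (event_select 0x3c, unit_mask 0x00,
         * USR | OS | INT | EN).  None of the modifier bits tested below are
         * set, so find_arch_event() can map the pair to
         * PERF_COUNT_HW_CPU_CYCLES and the counter is requested as a generic
         * hardware event; anything it cannot map stays a raw event.
         */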
        if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
                          ARCH_PERFMON_EVENTSEL_INV |
                          ARCH_PERFMON_EVENTSEL_CMASK |
                          HSW_IN_TX |
                          HSW_IN_TX_CHECKPOINTED))) {
                config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
                                                      event_select,
                                                      unit_mask);
                if (config != PERF_COUNT_HW_MAX)
                        type = PERF_TYPE_HARDWARE;
        }

        if (type == PERF_TYPE_RAW)
                config = eventsel & X86_RAW_EVENT_MASK;

        pmc_reprogram_counter(pmc, type, config,
                              !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
                              !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
                              eventsel & ARCH_PERFMON_EVENTSEL_INT,
                              (eventsel & HSW_IN_TX),
                              (eventsel & HSW_IN_TX_CHECKPOINTED));
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);

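/*
 * @ctrl is the 4-bit control field for this fixed counter, taken from the
 * guest's MSR_CORE_PERF_FIXED_CTR_CTRL: bit 0 enables counting in ring 0
 * (OS), bit 1 enables counting in ring 3 (USR) and bit 3 requests a PMI on
 * overflow; bit 2 (AnyThread) is not emulated here.
 */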
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
        unsigned en_field = ctrl & 0x3;
        bool pmi = ctrl & 0x8;
        struct kvm_pmu_event_filter *filter;
        struct kvm *kvm = pmc->vcpu->kvm;

        pmc_stop_counter(pmc);

        if (!en_field || !pmc_is_enabled(pmc))
                return;

        filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
        if (filter) {
                if (filter->action == KVM_PMU_EVENT_DENY &&
                    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
                        return;
                if (filter->action == KVM_PMU_EVENT_ALLOW &&
                    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
                        return;
        }

        pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
                              kvm_x86_ops->pmu_ops->find_fixed_event(idx),
                              !(en_field & 0x2), /* exclude user */
                              !(en_field & 0x1), /* exclude kernel */
                              pmi, false, false);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);

void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
        struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

        if (!pmc)
                return;

        if (pmc_is_gp(pmc))
                reprogram_gp_counter(pmc, pmc->eventsel);
        else {
                int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
                u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

                reprogram_fixed_counter(pmc, ctrl, idx);
        }
}
EXPORT_SYMBOL_GPL(reprogram_counter);

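/*
 * Called in response to KVM_REQ_PMU: reprogram each counter whose bit the
 * overflow handlers above set in reprogram_pmi.  Counters that have gone
 * away in the meantime simply have their bit cleared.
 */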
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        u64 bitmask;
        int bit;

        bitmask = pmu->reprogram_pmi;

        for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
                struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);

                if (unlikely(!pmc || !pmc->perf_event)) {
                        clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
                        continue;
                }

                reprogram_counter(pmu, bit);
        }
}

/* check if idx is a valid index to access the PMU */
int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
{
        return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
        switch (pmc_idx) {
        case VMWARE_BACKDOOR_PMC_HOST_TSC:
        case VMWARE_BACKDOOR_PMC_REAL_TIME:
        case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
                return true;
        }
        return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
        u64 ctr_val;

        switch (idx) {
        case VMWARE_BACKDOOR_PMC_HOST_TSC:
                ctr_val = rdtsc();
                break;
        case VMWARE_BACKDOOR_PMC_REAL_TIME:
                ctr_val = ktime_get_boottime_ns();
                break;
        case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
                ctr_val = ktime_get_boottime_ns() +
                        vcpu->kvm->arch.kvmclock_offset;
                break;
        default:
                return 1;
        }

        *data = ctr_val;
        return 0;
}

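/*
 * Emulate RDPMC.  Bit 31 of the guest-supplied index selects a "fast" read
 * that returns only the low 32 bits of the counter; the remaining index bits
 * are handed to the vendor callback, which validates them and may further
 * narrow the mask to the counter's width.
 */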
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
        bool fast_mode = idx & (1u << 31);
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
        struct kvm_pmc *pmc;
        u64 mask = fast_mode ? ~0u : ~0ull;

        if (!pmu->version)
                return 1;

        if (is_vmware_backdoor_pmc(idx))
                return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

        pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
        if (!pmc)
                return 1;

        *data = pmc_read_counter(pmc) & mask;
        return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
        if (lapic_in_kernel(vcpu))
                kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
        return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
        return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
        return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}

/* Refresh PMU settings. This function is generally called when the underlying
 * settings change (such as a change of the guest's PMU CPUID), which should
 * rarely happen.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
        kvm_x86_ops->pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

        irq_work_sync(&pmu->irq_work);
        kvm_x86_ops->pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

        memset(pmu, 0, sizeof(*pmu));
        kvm_x86_ops->pmu_ops->init(vcpu);
        init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
        kvm_pmu_refresh(vcpu);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
        kvm_pmu_reset(vcpu);
}

int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
        struct kvm_pmu_event_filter tmp, *filter;
        size_t size;
        int r;

        if (copy_from_user(&tmp, argp, sizeof(tmp)))
                return -EFAULT;

        if (tmp.action != KVM_PMU_EVENT_ALLOW &&
            tmp.action != KVM_PMU_EVENT_DENY)
                return -EINVAL;

        if (tmp.flags != 0)
                return -EINVAL;

        if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
                return -E2BIG;

        size = struct_size(filter, events, tmp.nevents);
        filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
        if (!filter)
                return -ENOMEM;

        r = -EFAULT;
        if (copy_from_user(filter, argp, size))
                goto cleanup;

        /* Ensure nevents can't be changed between the user copies. */
        *filter = tmp;

        mutex_lock(&kvm->lock);
        rcu_swap_protected(kvm->arch.pmu_event_filter, filter,
                           mutex_is_locked(&kvm->lock));
        mutex_unlock(&kvm->lock);

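        /*
         * Make sure no vCPU is still dereferencing the old filter under SRCU
         * (see reprogram_gp_counter() and reprogram_fixed_counter()) before
         * it is freed at cleanup below.
         */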
        synchronize_srcu_expedited(&kvm->srcu);
        r = 0;
cleanup:
        kfree(filter);
        return r;
}

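/*
 * Illustrative userspace sketch (not part of this file, hence guarded out):
 * how a VMM might install an allow-list through the ioctl above.  It relies
 * only on uapi names already used in this file (struct kvm_pmu_event_filter,
 * KVM_PMU_EVENT_ALLOW, KVM_SET_PMU_EVENT_FILTER); the function name, the
 * vm_fd parameter and the two event encodings are hypothetical placeholders.
 */
#if 0
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int set_pmu_allow_list(int vm_fd)
{
        struct kvm_pmu_event_filter *f;
        size_t size = sizeof(*f) + 2 * sizeof(f->events[0]);
        int r;

        f = calloc(1, size);    /* flags and fixed_counter_bitmap stay 0 */
        if (!f)
                return -1;

        f->action = KVM_PMU_EVENT_ALLOW;
        f->nevents = 2;
        f->events[0] = 0x003c;  /* placeholder event_select/unit_mask pair */
        f->events[1] = 0x00c0;  /* placeholder event_select/unit_mask pair */

        r = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
        free(f);
        return r;
}
#endif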