// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 */
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This should be enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

/*
 * NOTE:
 * - Each virtual performance counter is a "struct kvm_pmc".  General
 *   purpose (gp) and fixed counters both live in "struct kvm_pmu";
 *   AMD implements only gp counters, Intel also has fixed counters.
 * - A counter can be referenced in three ways:
 *     1. by MSR, e.g. MSR_IA32_PERFCTRn on Intel or MSR_K7_PERFCTRn on AMD;
 *     2. by the index that RDPMC takes in ECX ("idx");
 *     3. by a global index ("pmc_idx", kept in kvm_pmc.idx) that is unique
 *        across gp and fixed counters, with fixed counters starting at
 *        INTEL_PMC_IDX_FIXED.
 * - The vendor pmu_ops callbacks (msr_idx_to_pmc, rdpmc_ecx_to_pmc,
 *   pmc_idx_to_pmc) translate between these representations.
 */
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);

	kvm_pmu_deliver_pmi(vcpu);
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	/* Ignore counters whose reprogramming is already pending. */
	if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
		return;

	__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);

	if (!pmc->intr)
		return;

	/*
	 * Inject the PMI.  If this runs in NMI context and the vCPU is not
	 * already handling an NMI from the guest, the vCPU cannot safely be
	 * woken from here, so defer delivery to irq_work; otherwise simply
	 * request a PMI on the vCPU.
	 */
	if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
		irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
	else
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	__kvm_perf_overflow(pmc, true);
}

static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
				  u64 config, bool exclude_user,
				  bool exclude_kernel, bool intr)
{
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};

	if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
		return;

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with a nonzero
		 * sample period.  Clear the period so that creating the
		 * perf event does not fail outright.
		 */
		attr.sample_period = 0;
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return;
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
	pmc->is_paused = false;
	pmc->intr = intr;
}

static void pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;

	if (!pmc->perf_event || pmc->is_paused)
		return;

	/* Accumulate the count from perf and reset it to avoid double counting. */
	counter += perf_event_pause(pmc->perf_event, true);
	pmc->counter = counter & pmc_bitmask(pmc);
	pmc->is_paused = true;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* Recalculate the sample period and check that perf accepts it. */
	if (perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	/* Reuse the existing perf_event instead of recreating it. */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
	return true;
}

/* Comparator used both to sort the event filter and to bsearch() it. */
static int cmp_u64(const void *pa, const void *pb)
{
	u64 a = *(u64 *)pa;
	u64 b = *(u64 *)pb;

	return (a > b) - (a < b);
}

void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
	u64 config;
	u32 type = PERF_TYPE_RAW;
	struct kvm *kvm = pmc->vcpu->kvm;
	struct kvm_pmu_event_filter *filter;
	struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
	bool allow_event = true;

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

	pmc->eventsel = eventsel;

	pmc_pause_counter(pmc);

	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		__u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;

		if (bsearch(&key, filter->events, filter->nevents,
			    sizeof(__u64), cmp_u64))
			allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
		else
			allow_event = filter->action == KVM_PMU_EVENT_DENY;
	}
	if (!allow_event)
		return;

	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
			  ARCH_PERFMON_EVENTSEL_INV |
			  ARCH_PERFMON_EVENTSEL_CMASK |
			  HSW_IN_TX |
			  HSW_IN_TX_CHECKPOINTED))) {
		config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
		if (config != PERF_COUNT_HW_MAX)
			type = PERF_TYPE_HARDWARE;
	}

	if (type == PERF_TYPE_RAW)
		config = eventsel & pmu->raw_event_mask;

	if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = eventsel;
	pmc_reprogram_counter(pmc, type, config,
			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
			      eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
EXPORT_SYMBOL_GPL(reprogram_gp_counter);

void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
	unsigned en_field = ctrl & 0x3;
	bool pmi = ctrl & 0x8;
	struct kvm_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	pmc_pause_counter(pmc);

	if (!en_field || !pmc_is_enabled(pmc))
		return;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (filter) {
		if (filter->action == KVM_PMU_EVENT_DENY &&
		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
		if (filter->action == KVM_PMU_EVENT_ALLOW &&
		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
			return;
	}

	if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
		return;

	pmc_release_perf_event(pmc);

	pmc->current_config = (u64)ctrl;
	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
			      kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
			      !(en_field & 0x2),
			      !(en_field & 0x1),
			      pmi);
}
EXPORT_SYMBOL_GPL(reprogram_fixed_counter);

void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);

	if (!pmc)
		return;

	if (pmc_is_gp(pmc))
		reprogram_gp_counter(pmc, pmc->eventsel);
	else {
		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);

		reprogram_fixed_counter(pmc, ctrl, idx);
	}
}
EXPORT_SYMBOL_GPL(reprogram_counter);

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int bit;

	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
		struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);

		if (unlikely(!pmc || !pmc->perf_event)) {
			clear_bit(bit, pmu->reprogram_pmi);
			continue;
		}

		reprogram_counter(pmu, bit);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice; kvm_pmu_cleanup()
	 * runs when need_cleanup has been set.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

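/* Check whether @idx is a valid RDPMC index (the guest's ECX value). */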
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
	return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	bool fast_mode = idx & (1u << 31);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = fast_mode ? ~0u : ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    (kvm_read_cr0(vcpu) & X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

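/*
 * Deliver a PMI to the guest: run the vendor deliver_pmi hook, if one is
 * provided, then deliver through the local APIC's LVTPC entry.
 */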
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		if (kvm_x86_ops.pmu_ops->deliver_pmi)
			kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
		kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
}

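/*
 * Refresh the PMU configuration for the vCPU, e.g. after its CPUID-visible
 * PMU capabilities have changed.  This should happen rarely.
 */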
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops.pmu_ops->refresh(vcpu);
}

void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	irq_work_sync(&pmu->irq_work);
	kvm_x86_ops.pmu_ops->reset(vcpu);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	kvm_x86_ops.pmu_ops->init(vcpu);
	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
	pmu->event_count = 0;
	pmu->need_cleanup = false;
	kvm_pmu_refresh(vcpu);
}

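/*
 * Returns true if the guest has enabled this counter via its control
 * registers (eventsel enable bit or fixed counter control field), even if
 * no host perf_event is currently backing it.
 */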
static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);

	if (pmc_is_fixed(pmc))
		return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
					pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;

	return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}

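/*
 * Release the perf_events of counters that were not accessed during the
 * last vCPU time slice and are no longer enabled by the guest.
 */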
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	if (kvm_x86_ops.pmu_ops->cleanup)
		kvm_x86_ops.pmu_ops->cleanup(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

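/*
 * Advance the counter by one in software, reprogram it, and raise an
 * overflow if the increment wrapped the counter.
 */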
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 prev_count;

	prev_count = pmc->counter;
	pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);

	reprogram_counter(pmu, pmc->idx);
	if (pmc->counter < prev_count)
		__kvm_perf_overflow(pmc, false);
}

static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
					     unsigned int perf_hw_id)
{
	u64 old_eventsel = pmc->eventsel;
	unsigned int config;

	/* Compare on event select + unit mask only; restore eventsel afterwards. */
	pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
	config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
	pmc->eventsel = old_eventsel;
	return config == perf_hw_id;
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config = pmc->current_config;

	if (pmc_is_gp(pmc)) {
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		/* Fixed counter control field: bit 0 counts CPL == 0, bit 1 counts CPL > 0. */
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

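/*
 * Increment every enabled counter that is programmed to count @perf_hw_id
 * at the current privilege level, e.g. to account for instructions that
 * KVM emulates on the guest's behalf.
 */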
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
		pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);

		if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
			continue;

		/* Ignore edge detect, pin control, invert and CMASK when matching. */
		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
			kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

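/*
 * Handler for the KVM_SET_PMU_EVENT_FILTER ioctl: validate and copy the
 * filter from userspace, sort its event list so it can be bsearch()'d,
 * and publish it under kvm->lock.  The SRCU synchronization ensures no
 * vCPU is still using the old filter once the ioctl returns.
 */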
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter tmp, *filter;
	size_t size;
	int r;

	if (copy_from_user(&tmp, argp, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags != 0)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	r = -EFAULT;
	if (copy_from_user(filter, argp, size))
		goto cleanup;

	/* Restore the validated header so nevents can't change between the two copies. */
	*filter = tmp;

	/*
	 * Sort the in-kernel list so that reprogram_gp_counter() can search
	 * it with bsearch().
	 */
	sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);

	synchronize_srcu_expedited(&kvm->srcu);
	r = 0;
cleanup:
	kfree(filter);
	return r;
}