// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine (KVM) driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "mmu_lock.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Maximum time in ns a vCPU is allowed to poll for a wakeup before it halts. */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Factor by which the polling window grows when polling proves useful. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* Initial value the polling window is grown to from zero. */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Factor by which the polling window shrinks; 0 resets the window to zero. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

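/*
 * Because these module parameters use mode 0644, they can also be tuned at
 * runtime through sysfs, e.g.:
 *   echo 200000 > /sys/module/kvm/parameters/halt_poll_ns
 */
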
/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl = (c)
#else
/*
 * For architectures that don't implement a compat infrastructure, reject
 * ioctls from compat tasks outright and refuse to let them open /dev/kvm.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl = kvm_no_compat_ioctl,	\
			.open = kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}

bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is only valid if the page's refcount is
	 * elevated, e.g. by get_user_pages().  WARN if the page_count() is
	 * zero to help detect bad usage of this helper.
	 */
	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
		return false;

	return is_zone_device_page(pfn_to_page(pfn));
}

bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules, so exclude them (and the zero page) here.
	 */
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn)) &&
		       !is_zero_pfn(pfn) &&
		       !kvm_is_zone_device_pfn(pfn);

	return true;
}

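/*
 * Switches to the specified vCPU on the current CPU: registers the preempt
 * notifier, publishes the vCPU in kvm_running_vcpu, and calls the arch load
 * hook.  Must be paired with a later vcpu_put().
 */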
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * The caller needs to wait for the vCPU to reenable interrupts and
	 * get out of READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running vCPU; otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_flush(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_flush, NULL, wait);
	return true;
}

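/*
 * Request propagation: kvm_make_vcpu_request() sets the request bit on a
 * vCPU, tries to wake it if it is blocked, and records its CPU in @tmp when
 * an IPI is needed to force an exit from guest mode.  The callers below then
 * kick all collected CPUs in one go via kvm_kick_many_cpus().
 */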
static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
				  unsigned int req, struct cpumask *tmp,
				  int current_cpu)
{
	int cpu;

	kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  That's OK because the purpose of the IPI is
	 * to ensure the vCPU returns to OUTSIDE_GUEST_MODE, and migrating
	 * between pCPUs implies the vCPU has already exited guest mode.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;
		kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	bool called;
	int i, me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu == except)
			continue;
		kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * Page table modifications must be published before vcpu->mode is
	 * read.  kvm_make_all_cpus_request() already issues the required
	 * barrier (smp_mb__after_atomic) before reading vcpu->mode, so no
	 * explicit barrier is needed here.  The flush counter is only bumped
	 * when a flush was actually performed, either by the architecture
	 * hook or by the remote TLB flush request.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

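/*
 * Per-vCPU MMU memory caches: kvm_mmu_topup_memory_cache() pre-allocates
 * objects in a sleepable context so that kvm_mmu_memory_cache_alloc() can
 * hand them out later in contexts where allocation must not sleep, e.g.
 * while holding the MMU lock.
 */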
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	void *obj;

	if (mc->nobjs >= min)
		return 0;
	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif

static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	rcuwait_init(&vcpu->wait);
	kvm_async_pf_vcpu_init(vcpu);

	vcpu->pre_pcpu = -1;
	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = 0;
}

void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_dirty_ring_free(&vcpu->dirty_ring);
	kvm_arch_vcpu_destroy(vcpu);

	/*
	 * No need for rcu_read_lock: KVM_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
			     unsigned long end);

struct kvm_hva_range {
	unsigned long start;
	unsigned long end;
	pte_t pte;
	hva_handler_t handler;
	on_lock_fn_t on_lock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * A dedicated stub is used instead of NULL so that "no handler" and "no
 * on_lock callback" can be detected with a simple pointer comparison, see
 * IS_KVM_NULL_FN() below.
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

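/*
 * Generic MMU notifier worker: converts the HVA range into GFN ranges for
 * every memslot it intersects (in all address spaces), takes the MMU lock
 * lazily the first time a memslot matches, invokes the optional on_lock and
 * per-range handler callbacks, and flushes remote TLBs if a handler asked
 * for it.  Returns non-zero if any handler returned true.
 */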
static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
						  const struct kvm_hva_range *range)
{
	bool ret = false, locked = false;
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return 0;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot(slot, slots) {
			unsigned long hva_start, hva_end;

			hva_start = max(range->start, slot->userspace_addr);
			hva_end = min(range->end, slot->userspace_addr +
						  (slot->npages << PAGE_SHIFT));
			if (hva_start >= hva_end)
				continue;

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.pte = range->pte;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!locked) {
				locked = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm, range->start, range->end);
				if (IS_KVM_NULL_FN(range->handler))
					break;
			}
			ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && ret)
		kvm_flush_remote_tlbs(kvm);

	if (locked)
		KVM_MMU_UNLOCK(kvm);

	srcu_read_unlock(&kvm->srcu, idx);

	/* The notifier callbacks expect an int, not a bool. */
	return (int)ret;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						pte_t pte,
						hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= pte,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							  unsigned long start,
							  unsigned long end,
							  hva_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range range = {
		.start		= start,
		.end		= end,
		.pte		= __pte(0),
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_notifier_count is zero, then no in-progress invalidations,
	 * including this one, found a relevant memslot at start(); rechecking
	 * memslots here is unnecessary.  Note, a false positive (count elevated
	 * by a different invalidation) is sub-optimal but functionally ok.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_notifier_count))
		return;

	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
}

void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * The count increase must become visible at unlock time as no spte
	 * can be established without taking the mmu_lock and count is also
	 * read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	if (likely(kvm->mmu_notifier_count == 1)) {
		kvm->mmu_notifier_range_start = start;
		kvm->mmu_notifier_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns.  Keep things simple and just find the minimal range
		 * which includes the current and new ranges.  As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_notifier_range_start =
			min(kvm->mmu_notifier_range_start, start);
		kvm->mmu_notifier_range_end =
			max(kvm->mmu_notifier_range_end, end);
	}
}

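/*
 * invalidate_range_start() bumps mn_active_invalidate_count (which gates
 * memslot updates) and mmu_notifier_count (via the on_lock callback) before
 * unmapping the affected GFN ranges; invalidate_range_end() reverses both
 * and wakes any memslot installer waiting in install_new_memslots().
 */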
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= kvm_unmap_gfn_range,
		.on_lock	= kvm_inc_notifier_count,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_notifier_count
	 * adjustments would be unbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	__kvm_handle_hva_range(kvm, &hva_range);

	return 0;
}

void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
			    unsigned long end)
{
	/*
	 * This sequence increase will notify the kvm page fault that the page
	 * that is going to be mapped in the spte could have been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();

	/*
	 * The above sequence increase must be visible before the below count
	 * decrease, which is ensured by the smp_wmb above in combination with
	 * the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_hva_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.pte		= __pte(0),
		.handler	= (void *)kvm_null_fn,
		.on_lock	= kvm_dec_notifier_count,
		.flush_on_ret	= false,
		.may_block	= mmu_notifier_range_blockable(range),
	};
	bool wake;

	__kvm_handle_hva_range(kvm, &hva_range);

	/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	wake = (--kvm->mn_active_invalidate_count == 0);
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	trace_kvm_age_hva(start, end);

	return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	trace_kvm_age_hva(start, end);

	/*
	 * clear_young is a lightweight version of clear_flush_young.  It is
	 * used only for aging, not for unmapping, so the TLB flush can be
	 * skipped: stale accessed-bit state merely makes a page look younger
	 * than it really is, which is harmless for reclaim heuristics.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	trace_kvm_test_age_hva(address);

	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
					     kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !CONFIG_MMU_NOTIFIER || !KVM_ARCH_WANT_MMU_NOTIFIER */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
				unsigned long state,
				void *unused)
{
	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

	return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
	/* Suspend KVM before we suspend ftrace, RCU, etc. */
	kvm->pm_notifier.priority = INT_MAX;
	register_pm_notifier(&kvm->pm_notifier);
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	unregister_pm_notifier(&kvm->pm_notifier);
}
#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

static struct kvm_memslots *kvm_alloc_memslots(void)
{
	int i;
	struct kvm_memslots *slots;

	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
	if (!slots)
		return NULL;

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = -1;

	return slots;
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	slot->flags = 0;
	slot->npages = 0;
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct kvm_memory_slot *memslot;

	if (!slots)
		return;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_memslot(kvm, memslot);

	kvfree(slots);
}

static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
	case KVM_STATS_TYPE_INSTANT:
		return 0444;
	case KVM_STATS_TYPE_CUMULATIVE:
	case KVM_STATS_TYPE_PEAK:
	default:
		return 0644;
	}
}

static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!kvm->debugfs_dentry)
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

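/*
 * Creates the per-VM debugfs directory, named "<pid>-<vm fd>" under the
 * top-level kvm debugfs directory, and populates it with one file per VM
 * and per-vCPU stat descriptor.  Failure to create the directory itself is
 * not fatal to VM creation; stat data allocation failures are.
 */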
static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
	static DEFINE_MUTEX(kvm_debugfs_lock);
	struct dentry *dent;
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	const struct _kvm_stats_desc *pdesc;
	int i, ret;
	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
				      kvm_vcpu_stats_header.num_desc;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
	mutex_lock(&kvm_debugfs_lock);
	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
	if (dent) {
		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
		dput(dent);
		mutex_unlock(&kvm_debugfs_lock);
		return 0;
	}
	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
	mutex_unlock(&kvm_debugfs_lock);
	if (IS_ERR(dent))
		return 0;

	kvm->debugfs_dentry = dent;
	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		return -ENOMEM;

	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
		pdesc = &kvm_vm_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VM;
		kvm->debugfs_stat_data[i] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
		pdesc = &kvm_vcpu_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->desc = pdesc;
		stat_data->kind = KVM_STAT_VCPU;
		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}

	ret = kvm_arch_create_vm_debugfs(kvm);
	if (ret) {
		kvm_destroy_vm_debugfs(kvm);
		/* Propagate the arch error, not the stale loop counter. */
		return ret;
	}

	return 0;
}

/*
 * Called after the VM is otherwise initialized, but only just before
 * returning from kvm_vm_ioctl_create_vm().
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
 * Called after the per-VM debugfs directory has been created; architectures
 * can use it to add their own entries under kvm->debugfs_dentry.
 */
int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
	return 0;
}

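/*
 * Allocates and initializes a new struct kvm: takes a reference on the
 * creating task's mm, sets up locks, SRCU structures, memslots for every
 * address space and the I/O buses, then runs the arch init hook, enables
 * hardware virtualization, registers the MMU notifier and finally adds the
 * VM to vm_list.  Each failure point unwinds exactly the steps taken so far.
 */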
static struct kvm *kvm_create_vm(unsigned long type)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	int r = -ENOMEM;
	int i;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	KVM_MMU_LOCK_INIT(kvm);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	mutex_init(&kvm->slots_arch_lock);
	spin_lock_init(&kvm->mn_invalidate_lock);
	rcuwait_init(&kvm->mn_memslots_update_rcuwait);

	INIT_LIST_HEAD(&kvm->devices);

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct kvm_memslots *slots = kvm_alloc_memslots();

		if (!slots)
			goto out_err_no_arch_destroy_vm;

		slots->generation = i;
		rcu_assign_pointer(kvm->memslots[i], slots);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	kvm->max_halt_poll_ns = halt_poll_ns;

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();
	kvm_init_pm_notifier(kvm);

	return kvm;

out_err:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_destroy_pm_notifier(kvm);
	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
	/*
	 * At this point, pending calls to invalidate_range_start() have
	 * completed but no more MMU notifiers will run, so
	 * mn_active_invalidate_count may remain unbalanced.  No threads can
	 * be waiting in install_new_memslots() as the last reference on the
	 * VM has been dropped, so reset the count manually to avoid leaving
	 * it non-zero.
	 */
	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
	kvm->mn_active_invalidate_count = 0;
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

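/*
 * VM lifetime is governed by kvm->users_count: kvm_get_kvm()/kvm_put_kvm()
 * take and drop references, kvm_get_kvm_safe() only succeeds while the VM
 * is not already being torn down, and the final put triggers
 * kvm_destroy_vm().
 */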
void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Make sure the VM is not in the middle of destruction; this is the safe
 * version of kvm_get_kvm().  Returns true if a reference was acquired.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
	return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the fd fails.  The caller still holds its own reference, so the VM must
 * not be destroyed here; hence the WARN if this drops the last reference.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}
1246
1247
1248
1249
1250
1251static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1252{
1253 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1254
1255 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1256 if (!memslot->dirty_bitmap)
1257 return -ENOMEM;
1258
1259 return 0;
1260}
1261
1262
1263
1264
1265
1266static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1267 struct kvm_memory_slot *memslot)
1268{
1269 struct kvm_memory_slot *mslots = slots->memslots;
1270 int i;
1271
1272 if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1273 return;
1274
1275 slots->used_slots--;
1276
1277 if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
1278 atomic_set(&slots->last_used_slot, 0);
1279
1280 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1281 mslots[i] = mslots[i + 1];
1282 slots->id_to_index[mslots[i].id] = i;
1283 }
1284 mslots[i] = *memslot;
1285 slots->id_to_index[memslot->id] = -1;
1286}
1287
1288
1289
1290
1291
1292static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1293{
1294 return slots->used_slots++;
1295}
1296
1297
1298
1299
1300
1301
1302
1303
1304static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1305 struct kvm_memory_slot *memslot)
1306{
1307 struct kvm_memory_slot *mslots = slots->memslots;
1308 int i;
1309
1310 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1311 WARN_ON_ONCE(!slots->used_slots))
1312 return -1;
1313
1314
1315
1316
1317
1318
1319 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1320 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1321 break;
1322
1323 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1324
1325
1326 mslots[i] = mslots[i + 1];
1327 slots->id_to_index[mslots[i].id] = i;
1328 }
1329 return i;
1330}
1331
1332
1333
1334
1335
1336
1337
1338
1339static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1340 struct kvm_memory_slot *memslot,
1341 int start)
1342{
1343 struct kvm_memory_slot *mslots = slots->memslots;
1344 int i;
1345
1346 for (i = start; i > 0; i--) {
1347 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1348 break;
1349
1350 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1351
1352
1353 mslots[i] = mslots[i - 1];
1354 slots->id_to_index[mslots[i].id] = i;
1355 }
1356 return i;
1357}
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400static void update_memslots(struct kvm_memslots *slots,
1401 struct kvm_memory_slot *memslot,
1402 enum kvm_mr_change change)
1403{
1404 int i;
1405
1406 if (change == KVM_MR_DELETE) {
1407 kvm_memslot_delete(slots, memslot);
1408 } else {
1409 if (change == KVM_MR_CREATE)
1410 i = kvm_memslot_insert_back(slots);
1411 else
1412 i = kvm_memslot_move_backward(slots, memslot);
1413 i = kvm_memslot_move_forward(slots, memslot, i);
1414
1415
1416
1417
1418
1419 slots->memslots[i] = *memslot;
1420 slots->id_to_index[memslot->id] = i;
1421 }
1422}
1423
1424static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1425{
1426 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1427
1428#ifdef __KVM_HAVE_READONLY_MEM
1429 valid_flags |= KVM_MEM_READONLY;
1430#endif
1431
1432 if (mem->flags & ~valid_flags)
1433 return -EINVAL;
1434
1435 return 0;
1436}
1437
1438static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1439 int as_id, struct kvm_memslots *slots)
1440{
1441 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1442 u64 gen = old_memslots->generation;
1443
1444 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1445 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1446
1447
1448
1449
1450
1451
1452 spin_lock(&kvm->mn_invalidate_lock);
1453 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1454 while (kvm->mn_active_invalidate_count) {
1455 set_current_state(TASK_UNINTERRUPTIBLE);
1456 spin_unlock(&kvm->mn_invalidate_lock);
1457 schedule();
1458 spin_lock(&kvm->mn_invalidate_lock);
1459 }
1460 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1461 rcu_assign_pointer(kvm->memslots[as_id], slots);
1462 spin_unlock(&kvm->mn_invalidate_lock);
1463
1464
1465
1466
1467
1468
1469 mutex_unlock(&kvm->slots_arch_lock);
1470
1471 synchronize_srcu_expedited(&kvm->srcu);
1472
1473
1474
1475
1476
1477
1478
1479 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1480
1481
1482
1483
1484
1485
1486
1487
1488 gen += KVM_ADDRESS_SPACE_NUM;
1489
1490 kvm_arch_memslots_updated(kvm, gen);
1491
1492 slots->generation = gen;
1493
1494 return old_memslots;
1495}
1496
1497static size_t kvm_memslots_size(int slots)
1498{
1499 return sizeof(struct kvm_memslots) +
1500 (sizeof(struct kvm_memory_slot) * slots);
1501}
1502
1503static void kvm_copy_memslots(struct kvm_memslots *to,
1504 struct kvm_memslots *from)
1505{
1506 memcpy(to, from, kvm_memslots_size(from->used_slots));
1507}
1508
1509
1510
1511
1512
1513
1514static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1515 enum kvm_mr_change change)
1516{
1517 struct kvm_memslots *slots;
1518 size_t new_size;
1519
1520 if (change == KVM_MR_CREATE)
1521 new_size = kvm_memslots_size(old->used_slots + 1);
1522 else
1523 new_size = kvm_memslots_size(old->used_slots);
1524
1525 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1526 if (likely(slots))
1527 kvm_copy_memslots(slots, old);
1528
1529 return slots;
1530}
1531
1532static int kvm_set_memslot(struct kvm *kvm,
1533 const struct kvm_userspace_memory_region *mem,
1534 struct kvm_memory_slot *new, int as_id,
1535 enum kvm_mr_change change)
1536{
1537 struct kvm_memory_slot *slot, old;
1538 struct kvm_memslots *slots;
1539 int r;
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555 mutex_lock(&kvm->slots_arch_lock);
1556
1557 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1558 if (!slots) {
1559 mutex_unlock(&kvm->slots_arch_lock);
1560 return -ENOMEM;
1561 }
1562
1563 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1564
1565
1566
1567
1568 slot = id_to_memslot(slots, new->id);
1569 slot->flags |= KVM_MEMSLOT_INVALID;
1570
1571
1572
1573
1574
1575
1576 slots = install_new_memslots(kvm, as_id, slots);
1577
1578
1579
1580
1581
1582
1583
1584
1585 kvm_arch_flush_shadow_memslot(kvm, slot);
1586
1587
1588 mutex_lock(&kvm->slots_arch_lock);
1589
1590
1591
1592
1593
1594
1595
1596 kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
1597 }
1598
1599
1600
1601
1602
1603
1604
1605
1606 slot = id_to_memslot(slots, new->id);
1607 if (slot) {
1608 old = *slot;
1609 } else {
1610 WARN_ON_ONCE(change != KVM_MR_CREATE);
1611 memset(&old, 0, sizeof(old));
1612 old.id = new->id;
1613 old.as_id = as_id;
1614 }
1615
1616
1617 memcpy(&new->arch, &old.arch, sizeof(old.arch));
1618
1619 r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1620 if (r)
1621 goto out_slots;
1622
1623 update_memslots(slots, new, change);
1624 slots = install_new_memslots(kvm, as_id, slots);
1625
1626 kvm_arch_commit_memory_region(kvm, mem, &old, new, change);
1627
1628
1629 if (change == KVM_MR_DELETE)
1630 kvm_free_memslot(kvm, &old);
1631
1632 kvfree(slots);
1633 return 0;
1634
1635out_slots:
1636 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1637 slot = id_to_memslot(slots, new->id);
1638 slot->flags &= ~KVM_MEMSLOT_INVALID;
1639 slots = install_new_memslots(kvm, as_id, slots);
1640 } else {
1641 mutex_unlock(&kvm->slots_arch_lock);
1642 }
1643 kvfree(slots);
1644 return r;
1645}
1646
1647static int kvm_delete_memslot(struct kvm *kvm,
1648 const struct kvm_userspace_memory_region *mem,
1649 struct kvm_memory_slot *old, int as_id)
1650{
1651 struct kvm_memory_slot new;
1652
1653 if (!old->npages)
1654 return -EINVAL;
1655
1656 memset(&new, 0, sizeof(new));
1657 new.id = old->id;
1658
1659
1660
1661
1662 new.as_id = as_id;
1663
1664 return kvm_set_memslot(kvm, mem, &new, as_id, KVM_MR_DELETE);
1665}
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675int __kvm_set_memory_region(struct kvm *kvm,
1676 const struct kvm_userspace_memory_region *mem)
1677{
1678 struct kvm_memory_slot old, new;
1679 struct kvm_memory_slot *tmp;
1680 enum kvm_mr_change change;
1681 int as_id, id;
1682 int r;
1683
1684 r = check_memory_region_flags(mem);
1685 if (r)
1686 return r;
1687
1688 as_id = mem->slot >> 16;
1689 id = (u16)mem->slot;
1690
1691
1692 if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1693 (mem->memory_size != (unsigned long)mem->memory_size))
1694 return -EINVAL;
1695 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1696 return -EINVAL;
1697
1698 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1699 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1700 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1701 mem->memory_size))
1702 return -EINVAL;
1703 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1704 return -EINVAL;
1705 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1706 return -EINVAL;
1707
1708
1709
1710
1711
1712
1713
1714 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1715 if (tmp) {
1716 old = *tmp;
1717 tmp = NULL;
1718 } else {
1719 memset(&old, 0, sizeof(old));
1720 old.id = id;
1721 }
1722
1723 if (!mem->memory_size)
1724 return kvm_delete_memslot(kvm, mem, &old, as_id);
1725
1726 new.as_id = as_id;
1727 new.id = id;
1728 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1729 new.npages = mem->memory_size >> PAGE_SHIFT;
1730 new.flags = mem->flags;
1731 new.userspace_addr = mem->userspace_addr;
1732
1733 if (new.npages > KVM_MEM_MAX_NR_PAGES)
1734 return -EINVAL;
1735
1736 if (!old.npages) {
1737 change = KVM_MR_CREATE;
1738 new.dirty_bitmap = NULL;
1739 } else {
1740 if ((new.userspace_addr != old.userspace_addr) ||
1741 (new.npages != old.npages) ||
1742 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1743 return -EINVAL;
1744
1745 if (new.base_gfn != old.base_gfn)
1746 change = KVM_MR_MOVE;
1747 else if (new.flags != old.flags)
1748 change = KVM_MR_FLAGS_ONLY;
1749 else
1750 return 0;
1751
1752
1753 new.dirty_bitmap = old.dirty_bitmap;
1754 }
1755
1756 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1757
1758 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1759 if (tmp->id == id)
1760 continue;
1761 if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1762 (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1763 return -EEXIST;
1764 }
1765 }
1766
1767
1768 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1769 new.dirty_bitmap = NULL;
1770 else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
1771 r = kvm_alloc_dirty_bitmap(&new);
1772 if (r)
1773 return r;
1774
1775 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1776 bitmap_set(new.dirty_bitmap, 0, new.npages);
1777 }
1778
1779 r = kvm_set_memslot(kvm, mem, &new, as_id, change);
1780 if (r)
1781 goto out_bitmap;
1782
1783 if (old.dirty_bitmap && !new.dirty_bitmap)
1784 kvm_destroy_dirty_bitmap(&old);
1785 return 0;
1786
1787out_bitmap:
1788 if (new.dirty_bitmap && !old.dirty_bitmap)
1789 kvm_destroy_dirty_bitmap(&new);
1790 return r;
1791}
1792EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1793
1794int kvm_set_memory_region(struct kvm *kvm,
1795 const struct kvm_userspace_memory_region *mem)
1796{
1797 int r;
1798
1799 mutex_lock(&kvm->slots_lock);
1800 r = __kvm_set_memory_region(kvm, mem);
1801 mutex_unlock(&kvm->slots_lock);
1802 return r;
1803}
1804EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1805
1806static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1807 struct kvm_userspace_memory_region *mem)
1808{
1809 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1810 return -EINVAL;
1811
1812 return kvm_set_memory_region(kvm, mem);
1813}
1814
1815#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1816
1817
1818
1819
1820
1821
1822
1823int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1824 int *is_dirty, struct kvm_memory_slot **memslot)
1825{
1826 struct kvm_memslots *slots;
1827 int i, as_id, id;
1828 unsigned long n;
1829 unsigned long any = 0;
1830
1831
1832 if (kvm->dirty_ring_size)
1833 return -ENXIO;
1834
1835 *memslot = NULL;
1836 *is_dirty = 0;
1837
1838 as_id = log->slot >> 16;
1839 id = (u16)log->slot;
1840 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1841 return -EINVAL;
1842
1843 slots = __kvm_memslots(kvm, as_id);
1844 *memslot = id_to_memslot(slots, id);
1845 if (!(*memslot) || !(*memslot)->dirty_bitmap)
1846 return -ENOENT;
1847
1848 kvm_arch_sync_dirty_log(kvm, *memslot);
1849
1850 n = kvm_dirty_bitmap_bytes(*memslot);
1851
1852 for (i = 0; !any && i < n/sizeof(long); ++i)
1853 any = (*memslot)->dirty_bitmap[i];
1854
1855 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1856 return -EFAULT;
1857
1858 if (any)
1859 *is_dirty = 1;
1860 return 0;
1861}
1862EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1863
1864#else
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1887{
1888 struct kvm_memslots *slots;
1889 struct kvm_memory_slot *memslot;
1890 int i, as_id, id;
1891 unsigned long n;
1892 unsigned long *dirty_bitmap;
1893 unsigned long *dirty_bitmap_buffer;
1894 bool flush;
1895
1896
1897 if (kvm->dirty_ring_size)
1898 return -ENXIO;
1899
1900 as_id = log->slot >> 16;
1901 id = (u16)log->slot;
1902 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1903 return -EINVAL;
1904
1905 slots = __kvm_memslots(kvm, as_id);
1906 memslot = id_to_memslot(slots, id);
1907 if (!memslot || !memslot->dirty_bitmap)
1908 return -ENOENT;
1909
1910 dirty_bitmap = memslot->dirty_bitmap;
1911
1912 kvm_arch_sync_dirty_log(kvm, memslot);
1913
1914 n = kvm_dirty_bitmap_bytes(memslot);
1915 flush = false;
1916 if (kvm->manual_dirty_log_protect) {
1917
1918
1919
1920
1921
1922
1923
1924
1925 dirty_bitmap_buffer = dirty_bitmap;
1926 } else {
1927 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1928 memset(dirty_bitmap_buffer, 0, n);
1929
1930 KVM_MMU_LOCK(kvm);
1931 for (i = 0; i < n / sizeof(long); i++) {
1932 unsigned long mask;
1933 gfn_t offset;
1934
1935 if (!dirty_bitmap[i])
1936 continue;
1937
1938 flush = true;
1939 mask = xchg(&dirty_bitmap[i], 0);
1940 dirty_bitmap_buffer[i] = mask;
1941
1942 offset = i * BITS_PER_LONG;
1943 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1944 offset, mask);
1945 }
1946 KVM_MMU_UNLOCK(kvm);
1947 }
1948
1949 if (flush)
1950 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1951
1952 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1953 return -EFAULT;
1954 return 0;
1955}
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1978 struct kvm_dirty_log *log)
1979{
1980 int r;
1981
1982 mutex_lock(&kvm->slots_lock);
1983
1984 r = kvm_get_dirty_log_protect(kvm, log);
1985
1986 mutex_unlock(&kvm->slots_lock);
1987 return r;
1988}
1989
1990
1991
1992
1993
1994
1995
1996static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1997 struct kvm_clear_dirty_log *log)
1998{
1999 struct kvm_memslots *slots;
2000 struct kvm_memory_slot *memslot;
2001 int as_id, id;
2002 gfn_t offset;
2003 unsigned long i, n;
2004 unsigned long *dirty_bitmap;
2005 unsigned long *dirty_bitmap_buffer;
2006 bool flush;
2007
2008
2009 if (kvm->dirty_ring_size)
2010 return -ENXIO;
2011
2012 as_id = log->slot >> 16;
2013 id = (u16)log->slot;
2014 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2015 return -EINVAL;
2016
2017 if (log->first_page & 63)
2018 return -EINVAL;
2019
2020 slots = __kvm_memslots(kvm, as_id);
2021 memslot = id_to_memslot(slots, id);
2022 if (!memslot || !memslot->dirty_bitmap)
2023 return -ENOENT;
2024
2025 dirty_bitmap = memslot->dirty_bitmap;
2026
2027 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2028
2029 if (log->first_page > memslot->npages ||
2030 log->num_pages > memslot->npages - log->first_page ||
2031 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2032 return -EINVAL;
2033
2034 kvm_arch_sync_dirty_log(kvm, memslot);
2035
2036 flush = false;
2037 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2038 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2039 return -EFAULT;
2040
2041 KVM_MMU_LOCK(kvm);
2042 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2043 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2044 i++, offset += BITS_PER_LONG) {
2045 unsigned long mask = *dirty_bitmap_buffer++;
2046 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2047 if (!mask)
2048 continue;
2049
2050 mask &= atomic_long_fetch_andnot(mask, p);
2051
2052
2053
2054
2055
2056
2057
2058 if (mask) {
2059 flush = true;
2060 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2061 offset, mask);
2062 }
2063 }
2064 KVM_MMU_UNLOCK(kvm);
2065
2066 if (flush)
2067 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2068
2069 return 0;
2070}
2071
2072static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2073 struct kvm_clear_dirty_log *log)
2074{
2075 int r;
2076
2077 mutex_lock(&kvm->slots_lock);
2078
2079 r = kvm_clear_dirty_log_protect(kvm, log);
2080
2081 mutex_unlock(&kvm->slots_lock);
2082 return r;
2083}
2084#endif
2085
2086struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2087{
2088 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2089}
2090EXPORT_SYMBOL_GPL(gfn_to_memslot);
2091
2092struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2093{
2094 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2095 struct kvm_memory_slot *slot;
2096 int slot_index;
2097
2098 slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
2099 if (slot)
2100 return slot;
2101
2102
2103
2104
2105
2106
2107 slot = search_memslots(slots, gfn, &slot_index);
2108 if (slot) {
2109 vcpu->last_used_slot = slot_index;
2110 return slot;
2111 }
2112
2113 return NULL;
2114}
2115EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
2116
2117bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2118{
2119 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2120
2121 return kvm_is_visible_memslot(memslot);
2122}
2123EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2124
2125bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2126{
2127 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2128
2129 return kvm_is_visible_memslot(memslot);
2130}
2131EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2132
2133unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2134{
2135 struct vm_area_struct *vma;
2136 unsigned long addr, size;
2137
2138 size = PAGE_SIZE;
2139
2140 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2141 if (kvm_is_error_hva(addr))
2142 return PAGE_SIZE;
2143
2144 mmap_read_lock(current->mm);
2145 vma = find_vma(current->mm, addr);
2146 if (!vma)
2147 goto out;
2148
2149 size = vma_kernel_pagesize(vma);
2150
2151out:
2152 mmap_read_unlock(current->mm);
2153
2154 return size;
2155}
2156
2157static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2158{
2159 return slot->flags & KVM_MEM_READONLY;
2160}
2161
2162static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2163 gfn_t *nr_pages, bool write)
2164{
2165 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2166 return KVM_HVA_ERR_BAD;
2167
2168 if (memslot_is_readonly(slot) && write)
2169 return KVM_HVA_ERR_RO_BAD;
2170
2171 if (nr_pages)
2172 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2173
2174 return __gfn_to_hva_memslot(slot, gfn);
2175}
2176
2177static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2178 gfn_t *nr_pages)
2179{
2180 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2181}
2182
2183unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2184 gfn_t gfn)
2185{
2186 return gfn_to_hva_many(slot, gfn, NULL);
2187}
2188EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2189
2190unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2191{
2192 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2193}
2194EXPORT_SYMBOL_GPL(gfn_to_hva);
2195
2196unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2197{
2198 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2199}
2200EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2211 gfn_t gfn, bool *writable)
2212{
2213 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2214
2215 if (!kvm_is_error_hva(hva) && writable)
2216 *writable = !memslot_is_readonly(slot);
2217
2218 return hva;
2219}
2220
2221unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2222{
2223 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2224
2225 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2226}
2227
2228unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2229{
2230 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2231
2232 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2233}
2234
2235static inline int check_user_page_hwpoison(unsigned long addr)
2236{
2237 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2238
2239 rc = get_user_pages(addr, 1, flags, NULL, NULL);
2240 return rc == -EHWPOISON;
2241}
2242
2243
2244
2245
2246
2247
2248static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2249 bool *writable, kvm_pfn_t *pfn)
2250{
2251 struct page *page[1];
2252
2253
2254
2255
2256
2257
2258 if (!(write_fault || writable))
2259 return false;
2260
2261 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2262 *pfn = page_to_pfn(page[0]);
2263
2264 if (writable)
2265 *writable = true;
2266 return true;
2267 }
2268
2269 return false;
2270}
2271
2272
2273
2274
2275
2276static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2277 bool *writable, kvm_pfn_t *pfn)
2278{
2279 unsigned int flags = FOLL_HWPOISON;
2280 struct page *page;
2281 int npages = 0;
2282
2283 might_sleep();
2284
2285 if (writable)
2286 *writable = write_fault;
2287
2288 if (write_fault)
2289 flags |= FOLL_WRITE;
2290 if (async)
2291 flags |= FOLL_NOWAIT;
2292
2293 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2294 if (npages != 1)
2295 return npages;
2296
2297
2298 if (unlikely(!write_fault) && writable) {
2299 struct page *wpage;
2300
2301 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2302 *writable = true;
2303 put_page(page);
2304 page = wpage;
2305 }
2306 }
2307 *pfn = page_to_pfn(page);
2308 return npages;
2309}
2310
2311static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2312{
2313 if (unlikely(!(vma->vm_flags & VM_READ)))
2314 return false;
2315
2316 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2317 return false;
2318
2319 return true;
2320}
2321
2322static int kvm_try_get_pfn(kvm_pfn_t pfn)
2323{
2324 if (kvm_is_reserved_pfn(pfn))
2325 return 1;
2326 return get_page_unless_zero(pfn_to_page(pfn));
2327}
2328
2329static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2330 unsigned long addr, bool *async,
2331 bool write_fault, bool *writable,
2332 kvm_pfn_t *p_pfn)
2333{
2334 kvm_pfn_t pfn;
2335 pte_t *ptep;
2336 spinlock_t *ptl;
2337 int r;
2338
2339 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2340 if (r) {
2341
2342
2343
2344
2345 bool unlocked = false;
2346 r = fixup_user_fault(current->mm, addr,
2347 (write_fault ? FAULT_FLAG_WRITE : 0),
2348 &unlocked);
2349 if (unlocked)
2350 return -EAGAIN;
2351 if (r)
2352 return r;
2353
2354 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2355 if (r)
2356 return r;
2357 }
2358
2359 if (write_fault && !pte_write(*ptep)) {
2360 pfn = KVM_PFN_ERR_RO_FAULT;
2361 goto out;
2362 }
2363
2364 if (writable)
2365 *writable = pte_write(*ptep);
2366 pfn = pte_pfn(*ptep);
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385 if (!kvm_try_get_pfn(pfn))
2386 r = -EFAULT;
2387
2388out:
2389 pte_unmap_unlock(ptep, ptl);
2390 *p_pfn = pfn;
2391
2392 return r;
2393}
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2410 bool write_fault, bool *writable)
2411{
2412 struct vm_area_struct *vma;
2413 kvm_pfn_t pfn = 0;
2414 int npages, r;
2415
2416
2417 BUG_ON(atomic && async);
2418
2419 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2420 return pfn;
2421
2422 if (atomic)
2423 return KVM_PFN_ERR_FAULT;
2424
2425 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2426 if (npages == 1)
2427 return pfn;
2428
2429 mmap_read_lock(current->mm);
2430 if (npages == -EHWPOISON ||
2431 (!async && check_user_page_hwpoison(addr))) {
2432 pfn = KVM_PFN_ERR_HWPOISON;
2433 goto exit;
2434 }
2435
2436retry:
2437 vma = vma_lookup(current->mm, addr);
2438
2439 if (vma == NULL)
2440 pfn = KVM_PFN_ERR_FAULT;
2441 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2442 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2443 if (r == -EAGAIN)
2444 goto retry;
2445 if (r < 0)
2446 pfn = KVM_PFN_ERR_FAULT;
2447 } else {
2448 if (async && vma_is_valid(vma, write_fault))
2449 *async = true;
2450 pfn = KVM_PFN_ERR_FAULT;
2451 }
2452exit:
2453 mmap_read_unlock(current->mm);
2454 return pfn;
2455}
2456
2457kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2458 bool atomic, bool *async, bool write_fault,
2459 bool *writable, hva_t *hva)
2460{
2461 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2462
2463 if (hva)
2464 *hva = addr;
2465
2466 if (addr == KVM_HVA_ERR_RO_BAD) {
2467 if (writable)
2468 *writable = false;
2469 return KVM_PFN_ERR_RO_FAULT;
2470 }
2471
2472 if (kvm_is_error_hva(addr)) {
2473 if (writable)
2474 *writable = false;
2475 return KVM_PFN_NOSLOT;
2476 }
2477
2478
2479 if (writable && memslot_is_readonly(slot)) {
2480 *writable = false;
2481 writable = NULL;
2482 }
2483
2484 return hva_to_pfn(addr, atomic, async, write_fault,
2485 writable);
2486}
2487EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2488
2489kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2490 bool *writable)
2491{
2492 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2493 write_fault, writable, NULL);
2494}
2495EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2496
2497kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2498{
2499 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2500}
2501EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2502
2503kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2504{
2505 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2506}
2507EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2508
2509kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2510{
2511 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2512}
2513EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2514
2515kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2516{
2517 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2518}
2519EXPORT_SYMBOL_GPL(gfn_to_pfn);
2520
2521kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2522{
2523 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2524}
2525EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2526
2527int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2528 struct page **pages, int nr_pages)
2529{
2530 unsigned long addr;
2531 gfn_t entry = 0;
2532
2533 addr = gfn_to_hva_many(slot, gfn, &entry);
2534 if (kvm_is_error_hva(addr))
2535 return -1;
2536
2537 if (entry < nr_pages)
2538 return 0;
2539
2540 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2541}
2542EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2543
2544static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2545{
2546 if (is_error_noslot_pfn(pfn))
2547 return KVM_ERR_PTR_BAD_PAGE;
2548
2549 if (kvm_is_reserved_pfn(pfn)) {
2550 WARN_ON(1);
2551 return KVM_ERR_PTR_BAD_PAGE;
2552 }
2553
2554 return pfn_to_page(pfn);
2555}
2556
2557struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2558{
2559 kvm_pfn_t pfn;
2560
2561 pfn = gfn_to_pfn(kvm, gfn);
2562
2563 return kvm_pfn_to_page(pfn);
2564}
2565EXPORT_SYMBOL_GPL(gfn_to_page);
2566
2567void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2568{
2569 if (pfn == 0)
2570 return;
2571
2572 if (dirty)
2573 kvm_release_pfn_dirty(pfn);
2574 else
2575 kvm_release_pfn_clean(pfn);
2576}
2577
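/*
 * Map a page of guest memory for direct host access.  A minimal usage
 * sketch (error handling elided, buf/len purely illustrative):
 *
 *	struct kvm_host_map map;
 *
 *	if (!kvm_vcpu_map(vcpu, gfn, &map)) {
 *		memcpy(map.hva, buf, len);
 *		kvm_vcpu_unmap(vcpu, &map, true);
 *	}
 *
 * Passing dirty == true to kvm_vcpu_unmap() marks the gfn dirty and
 * releases the pfn as dirty.
 */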
2578int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2579{
2580 kvm_pfn_t pfn;
2581 void *hva = NULL;
2582 struct page *page = KVM_UNMAPPED_PAGE;
2583
2584 if (!map)
2585 return -EINVAL;
2586
2587 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2588 if (is_error_noslot_pfn(pfn))
2589 return -EINVAL;
2590
2591 if (pfn_valid(pfn)) {
2592 page = pfn_to_page(pfn);
2593 hva = kmap(page);
2594#ifdef CONFIG_HAS_IOMEM
2595 } else {
2596 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2597#endif
2598 }
2599
2600 if (!hva)
2601 return -EFAULT;
2602
2603 map->page = page;
2604 map->hva = hva;
2605 map->pfn = pfn;
2606 map->gfn = gfn;
2607
2608 return 0;
2609}
2610EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2611
2612void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2613{
2614 if (!map)
2615 return;
2616
2617 if (!map->hva)
2618 return;
2619
2620 if (map->page != KVM_UNMAPPED_PAGE)
2621 kunmap(map->page);
2622#ifdef CONFIG_HAS_IOMEM
2623 else
2624 memunmap(map->hva);
2625#endif
2626
2627 if (dirty)
2628 kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2629
2630 kvm_release_pfn(map->pfn, dirty);
2631
2632 map->hva = NULL;
2633 map->page = NULL;
2634}
2635EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2636
2637struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2638{
2639 kvm_pfn_t pfn;
2640
2641 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2642
2643 return kvm_pfn_to_page(pfn);
2644}
2645EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2646
2647void kvm_release_page_clean(struct page *page)
2648{
2649 WARN_ON(is_error_page(page));
2650
2651 kvm_release_pfn_clean(page_to_pfn(page));
2652}
2653EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2654
2655void kvm_release_pfn_clean(kvm_pfn_t pfn)
2656{
2657 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2658 put_page(pfn_to_page(pfn));
2659}
2660EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2661
2662void kvm_release_page_dirty(struct page *page)
2663{
2664 WARN_ON(is_error_page(page));
2665
2666 kvm_release_pfn_dirty(page_to_pfn(page));
2667}
2668EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2669
2670void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2671{
2672 kvm_set_pfn_dirty(pfn);
2673 kvm_release_pfn_clean(pfn);
2674}
2675EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2676
2677void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2678{
2679 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2680 SetPageDirty(pfn_to_page(pfn));
2681}
2682EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2683
2684void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2685{
2686 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2687 mark_page_accessed(pfn_to_page(pfn));
2688}
2689EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2690
2691static int next_segment(unsigned long len, int offset)
2692{
2693 if (len > PAGE_SIZE - offset)
2694 return PAGE_SIZE - offset;
2695 else
2696 return len;
2697}
2698
2699static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2700 void *data, int offset, int len)
2701{
2702 int r;
2703 unsigned long addr;
2704
2705 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2706 if (kvm_is_error_hva(addr))
2707 return -EFAULT;
2708 r = __copy_from_user(data, (void __user *)addr + offset, len);
2709 if (r)
2710 return -EFAULT;
2711 return 0;
2712}
2713
2714int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2715 int len)
2716{
2717 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2718
2719 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2720}
2721EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2722
2723int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2724 int offset, int len)
2725{
2726 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2727
2728 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2729}
2730EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2731
2732int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2733{
2734 gfn_t gfn = gpa >> PAGE_SHIFT;
2735 int seg;
2736 int offset = offset_in_page(gpa);
2737 int ret;
2738
2739 while ((seg = next_segment(len, offset)) != 0) {
2740 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2741 if (ret < 0)
2742 return ret;
2743 offset = 0;
2744 len -= seg;
2745 data += seg;
2746 ++gfn;
2747 }
2748 return 0;
2749}
2750EXPORT_SYMBOL_GPL(kvm_read_guest);
2751
2752int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2753{
2754 gfn_t gfn = gpa >> PAGE_SHIFT;
2755 int seg;
2756 int offset = offset_in_page(gpa);
2757 int ret;
2758
2759 while ((seg = next_segment(len, offset)) != 0) {
2760 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2761 if (ret < 0)
2762 return ret;
2763 offset = 0;
2764 len -= seg;
2765 data += seg;
2766 ++gfn;
2767 }
2768 return 0;
2769}
2770EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2771
2772static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2773 void *data, int offset, unsigned long len)
2774{
2775 int r;
2776 unsigned long addr;
2777
2778 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2779 if (kvm_is_error_hva(addr))
2780 return -EFAULT;
2781 pagefault_disable();
2782 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2783 pagefault_enable();
2784 if (r)
2785 return -EFAULT;
2786 return 0;
2787}
2788
2789int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2790 void *data, unsigned long len)
2791{
2792 gfn_t gfn = gpa >> PAGE_SHIFT;
2793 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2794 int offset = offset_in_page(gpa);
2795
2796 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2797}
2798EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2799
2800static int __kvm_write_guest_page(struct kvm *kvm,
2801 struct kvm_memory_slot *memslot, gfn_t gfn,
2802 const void *data, int offset, int len)
2803{
2804 int r;
2805 unsigned long addr;
2806
2807 addr = gfn_to_hva_memslot(memslot, gfn);
2808 if (kvm_is_error_hva(addr))
2809 return -EFAULT;
2810 r = __copy_to_user((void __user *)addr + offset, data, len);
2811 if (r)
2812 return -EFAULT;
2813 mark_page_dirty_in_slot(kvm, memslot, gfn);
2814 return 0;
2815}
2816
2817int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2818 const void *data, int offset, int len)
2819{
2820 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2821
2822 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2823}
2824EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2825
2826int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2827 const void *data, int offset, int len)
2828{
2829 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2830
2831 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2832}
2833EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2834
2835int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2836 unsigned long len)
2837{
2838 gfn_t gfn = gpa >> PAGE_SHIFT;
2839 int seg;
2840 int offset = offset_in_page(gpa);
2841 int ret;
2842
2843 while ((seg = next_segment(len, offset)) != 0) {
2844 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2845 if (ret < 0)
2846 return ret;
2847 offset = 0;
2848 len -= seg;
2849 data += seg;
2850 ++gfn;
2851 }
2852 return 0;
2853}
2854EXPORT_SYMBOL_GPL(kvm_write_guest);
2855
2856int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2857 unsigned long len)
2858{
2859 gfn_t gfn = gpa >> PAGE_SHIFT;
2860 int seg;
2861 int offset = offset_in_page(gpa);
2862 int ret;
2863
2864 while ((seg = next_segment(len, offset)) != 0) {
2865 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2866 if (ret < 0)
2867 return ret;
2868 offset = 0;
2869 len -= seg;
2870 data += seg;
2871 ++gfn;
2872 }
2873 return 0;
2874}
2875EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2876
2877static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2878 struct gfn_to_hva_cache *ghc,
2879 gpa_t gpa, unsigned long len)
2880{
2881 int offset = offset_in_page(gpa);
2882 gfn_t start_gfn = gpa >> PAGE_SHIFT;
2883 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2884 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2885 gfn_t nr_pages_avail;
2886
2887
2888 ghc->generation = slots->generation;
2889
2890 if (start_gfn > end_gfn) {
2891 ghc->hva = KVM_HVA_ERR_BAD;
2892 return -EINVAL;
2893 }
2894
2895
2896
2897
2898
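 /*
  * Verify that every page of the cached range is backed by a valid hva,
  * even if the range crosses a memslot boundary.
  */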
2899 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2900 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2901 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2902 &nr_pages_avail);
2903 if (kvm_is_error_hva(ghc->hva))
2904 return -EFAULT;
2905 }
2906
2907
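 /*
  * Only a single-page cache keeps a directly usable hva; multi-page
  * caches drop the memslot so that users fall back to the slow path.
  */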
2908 if (nr_pages_needed == 1)
2909 ghc->hva += offset;
2910 else
2911 ghc->memslot = NULL;
2912
2913 ghc->gpa = gpa;
2914 ghc->len = len;
2915 return 0;
2916}
2917
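/*
 * Initialize a gfn_to_hva_cache for repeated accesses to a fixed guest
 * address.  A minimal usage sketch (names purely illustrative):
 *
 *	struct gfn_to_hva_cache ghc;
 *
 *	if (!kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 *
 * The cached translation is revalidated automatically whenever the memslot
 * generation changes.
 */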
2918int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2919 gpa_t gpa, unsigned long len)
2920{
2921 struct kvm_memslots *slots = kvm_memslots(kvm);
2922 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2923}
2924EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2925
2926int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2927 void *data, unsigned int offset,
2928 unsigned long len)
2929{
2930 struct kvm_memslots *slots = kvm_memslots(kvm);
2931 int r;
2932 gpa_t gpa = ghc->gpa + offset;
2933
2934 if (WARN_ON_ONCE(len + offset > ghc->len))
2935 return -EINVAL;
2936
2937 if (slots->generation != ghc->generation) {
2938 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2939 return -EFAULT;
2940 }
2941
2942 if (kvm_is_error_hva(ghc->hva))
2943 return -EFAULT;
2944
2945 if (unlikely(!ghc->memslot))
2946 return kvm_write_guest(kvm, gpa, data, len);
2947
2948 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2949 if (r)
2950 return -EFAULT;
2951 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
2952
2953 return 0;
2954}
2955EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2956
2957int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2958 void *data, unsigned long len)
2959{
2960 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2961}
2962EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
2963
2964int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2965 void *data, unsigned int offset,
2966 unsigned long len)
2967{
2968 struct kvm_memslots *slots = kvm_memslots(kvm);
2969 int r;
2970 gpa_t gpa = ghc->gpa + offset;
2971
2972 if (WARN_ON_ONCE(len + offset > ghc->len))
2973 return -EINVAL;
2974
2975 if (slots->generation != ghc->generation) {
2976 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2977 return -EFAULT;
2978 }
2979
2980 if (kvm_is_error_hva(ghc->hva))
2981 return -EFAULT;
2982
2983 if (unlikely(!ghc->memslot))
2984 return kvm_read_guest(kvm, gpa, data, len);
2985
2986 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
2987 if (r)
2988 return -EFAULT;
2989
2990 return 0;
2991}
2992EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2993
2994int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2995 void *data, unsigned long len)
2996{
2997 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
2998}
2999EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3000
3001int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3002{
3003 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3004 gfn_t gfn = gpa >> PAGE_SHIFT;
3005 int seg;
3006 int offset = offset_in_page(gpa);
3007 int ret;
3008
3009 while ((seg = next_segment(len, offset)) != 0) {
 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3011 if (ret < 0)
3012 return ret;
3013 offset = 0;
3014 len -= seg;
3015 ++gfn;
3016 }
3017 return 0;
3018}
3019EXPORT_SYMBOL_GPL(kvm_clear_guest);
3020
3021void mark_page_dirty_in_slot(struct kvm *kvm,
3022 struct kvm_memory_slot *memslot,
3023 gfn_t gfn)
3024{
3025 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3026 unsigned long rel_gfn = gfn - memslot->base_gfn;
3027 u32 slot = (memslot->as_id << 16) | memslot->id;
3028
3029 if (kvm->dirty_ring_size)
3030 kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
3031 slot, rel_gfn);
3032 else
3033 set_bit_le(rel_gfn, memslot->dirty_bitmap);
3034 }
3035}
3036EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3037
3038void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3039{
3040 struct kvm_memory_slot *memslot;
3041
3042 memslot = gfn_to_memslot(kvm, gfn);
3043 mark_page_dirty_in_slot(kvm, memslot, gfn);
3044}
3045EXPORT_SYMBOL_GPL(mark_page_dirty);
3046
3047void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3048{
3049 struct kvm_memory_slot *memslot;
3050
3051 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3052 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3053}
3054EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3055
3056void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3057{
3058 if (!vcpu->sigset_active)
3059 return;
3060
 /*
  * Install the vCPU's signal mask for the duration of KVM_RUN and stash
  * the task's previous mask in ->real_blocked so that
  * kvm_sigset_deactivate() can restore it afterwards.
  */
 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3068}
3069
3070void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3071{
3072 if (!vcpu->sigset_active)
3073 return;
3074
 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
 sigemptyset(&current->real_blocked);
3077}
3078
3079static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3080{
3081 unsigned int old, val, grow, grow_start;
3082
3083 old = val = vcpu->halt_poll_ns;
3084 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3085 grow = READ_ONCE(halt_poll_ns_grow);
3086 if (!grow)
3087 goto out;
3088
3089 val *= grow;
3090 if (val < grow_start)
3091 val = grow_start;
3092
3093 if (val > vcpu->kvm->max_halt_poll_ns)
3094 val = vcpu->kvm->max_halt_poll_ns;
3095
3096 vcpu->halt_poll_ns = val;
3097out:
3098 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3099}
3100
3101static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3102{
3103 unsigned int old, val, shrink, grow_start;
3104
3105 old = val = vcpu->halt_poll_ns;
3106 shrink = READ_ONCE(halt_poll_ns_shrink);
3107 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3108 if (shrink == 0)
3109 val = 0;
3110 else
3111 val /= shrink;
3112
3113 if (val < grow_start)
3114 val = 0;
3115
3116 vcpu->halt_poll_ns = val;
3117 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3118}
3119
3120static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3121{
3122 int ret = -EINTR;
3123 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3124
3125 if (kvm_arch_vcpu_runnable(vcpu)) {
3126 kvm_make_request(KVM_REQ_UNHALT, vcpu);
3127 goto out;
3128 }
3129 if (kvm_cpu_has_pending_timer(vcpu))
3130 goto out;
3131 if (signal_pending(current))
3132 goto out;
3133 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3134 goto out;
3135
3136 ret = 0;
3137out:
3138 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3139 return ret;
3140}
3141
3142static inline void
3143update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3144{
3145 if (waited)
3146 vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
3147 else
3148 vcpu->stat.generic.halt_poll_success_ns += poll_ns;
3149}
3150
3151
3152
3153
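/*
 * Block the calling vCPU until it becomes runnable again, optionally polling
 * for up to halt_poll_ns first to avoid the cost of a full schedule() round
 * trip, and adjust the per-vCPU poll window afterwards.
 */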
3154void kvm_vcpu_block(struct kvm_vcpu *vcpu)
3155{
3156 ktime_t start, cur, poll_end;
3157 bool waited = false;
3158 u64 block_ns;
3159
3160 kvm_arch_vcpu_blocking(vcpu);
3161
3162 start = cur = poll_end = ktime_get();
3163 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
3164 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
3165
3166 ++vcpu->stat.generic.halt_attempted_poll;
3167 do {
3168
3169
3170
3171
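 /*
  * A negative return means the vCPU should stop blocking (it is
  * runnable, or has a pending timer, signal or unblock request),
  * i.e. polling succeeded.
  */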
3172 if (kvm_vcpu_check_block(vcpu) < 0) {
3173 ++vcpu->stat.generic.halt_successful_poll;
3174 if (!vcpu_valid_wakeup(vcpu))
3175 ++vcpu->stat.generic.halt_poll_invalid;
3176
3177 KVM_STATS_LOG_HIST_UPDATE(
3178 vcpu->stat.generic.halt_poll_success_hist,
3179 ktime_to_ns(ktime_get()) -
3180 ktime_to_ns(start));
3181 goto out;
3182 }
3183 cpu_relax();
3184 poll_end = cur = ktime_get();
3185 } while (kvm_vcpu_can_poll(cur, stop));
3186
3187 KVM_STATS_LOG_HIST_UPDATE(
3188 vcpu->stat.generic.halt_poll_fail_hist,
3189 ktime_to_ns(ktime_get()) - ktime_to_ns(start));
3190 }
3191
3192
3193 prepare_to_rcuwait(&vcpu->wait);
3194 for (;;) {
3195 set_current_state(TASK_INTERRUPTIBLE);
3196
3197 if (kvm_vcpu_check_block(vcpu) < 0)
3198 break;
3199
3200 waited = true;
3201 schedule();
3202 }
3203 finish_rcuwait(&vcpu->wait);
3204 cur = ktime_get();
3205 if (waited) {
3206 vcpu->stat.generic.halt_wait_ns +=
3207 ktime_to_ns(cur) - ktime_to_ns(poll_end);
3208 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3209 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3210 }
3211out:
3212 kvm_arch_vcpu_unblocking(vcpu);
3213 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3214
3215 update_halt_poll_stats(
3216 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3217
3218 if (!kvm_arch_no_poll(vcpu)) {
3219 if (!vcpu_valid_wakeup(vcpu)) {
3220 shrink_halt_poll_ns(vcpu);
3221 } else if (vcpu->kvm->max_halt_poll_ns) {
3222 if (block_ns <= vcpu->halt_poll_ns)
3223 ;
3224
3225 else if (vcpu->halt_poll_ns &&
3226 block_ns > vcpu->kvm->max_halt_poll_ns)
3227 shrink_halt_poll_ns(vcpu);
3228
3229 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3230 block_ns < vcpu->kvm->max_halt_poll_ns)
3231 grow_halt_poll_ns(vcpu);
3232 } else {
3233 vcpu->halt_poll_ns = 0;
3234 }
3235 }
3236
3237 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3238 kvm_arch_vcpu_block_finish(vcpu);
3239}
3240EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3241
3242bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3243{
3244 struct rcuwait *waitp;
3245
3246 waitp = kvm_arch_vcpu_get_wait(vcpu);
3247 if (rcuwait_wake_up(waitp)) {
3248 WRITE_ONCE(vcpu->ready, true);
3249 ++vcpu->stat.generic.halt_wakeup;
3250 return true;
3251 }
3252
3253 return false;
3254}
3255EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3256
3257#ifndef CONFIG_S390
3258
3259
3260
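/*
 * Kick a sleeping vCPU, or force a vCPU that is running in guest mode back
 * into host kernel mode.
 */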
3261void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3262{
3263 int me, cpu;
3264
3265 if (kvm_vcpu_wake_up(vcpu))
3266 return;
3267
 /*
  * Read vcpu->cpu with preemption disabled so the target cannot be
  * migrated while we decide whether to send an IPI; only kick a
  * different, online physical CPU.
  */
3275 me = get_cpu();
3276 if (kvm_arch_vcpu_should_kick(vcpu)) {
3277 cpu = READ_ONCE(vcpu->cpu);
3278 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3279 smp_send_reschedule(cpu);
3280 }
3281 put_cpu();
3282}
3283EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3284#endif
3285
3286int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3287{
3288 struct pid *pid;
3289 struct task_struct *task = NULL;
3290 int ret = 0;
3291
3292 rcu_read_lock();
3293 pid = rcu_dereference(target->pid);
3294 if (pid)
3295 task = get_pid_task(pid, PIDTYPE_PID);
3296 rcu_read_unlock();
3297 if (!task)
3298 return ret;
3299 ret = yield_to(task, 1);
3300 put_task_struct(task);
3301
3302 return ret;
3303}
3304EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3305
/*
 * Helper that decides whether a vCPU is an eligible target for directed
 * yield.  A vCPU that was not in a spin loop when it last yielded, or that
 * was marked eligible on the previous pass, is a candidate; the eligibility
 * flag is toggled each time so that every spinning vCPU eventually gets a
 * turn.  Without CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT every vCPU is
 * considered eligible.
 */
3328static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3329{
3330#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3331 bool eligible;
3332
3333 eligible = !vcpu->spin_loop.in_spin_loop ||
3334 vcpu->spin_loop.dy_eligible;
3335
3336 if (vcpu->spin_loop.in_spin_loop)
3337 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3338
3339 return eligible;
3340#else
3341 return true;
3342#endif
3343}
3344
/*
 * Unlike kvm_arch_vcpu_runnable(), this is called without the vCPU loaded;
 * architectures with no special requirements can rely on this default,
 * which simply forwards to kvm_arch_vcpu_runnable().
 */
3350bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3351{
3352 return kvm_arch_vcpu_runnable(vcpu);
3353}
3354
3355static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3356{
3357 if (kvm_arch_dy_runnable(vcpu))
3358 return true;
3359
3360#ifdef CONFIG_KVM_ASYNC_PF
3361 if (!list_empty_careful(&vcpu->async_pf.done))
3362 return true;
3363#endif
3364
3365 return false;
3366}
3367
3368bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3369{
3370 return false;
3371}
3372
3373void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3374{
3375 struct kvm *kvm = me->kvm;
3376 struct kvm_vcpu *vcpu;
3377 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3378 int yielded = 0;
3379 int try = 3;
3380 int pass;
3381 int i;
3382
3383 kvm_vcpu_set_in_spin_loop(me, true);
3384
 /*
  * Boost a vCPU that is runnable but not currently running: sweep the
  * vCPU list in two passes starting just after the last boosted vCPU,
  * skipping vCPUs that are not ready, idle with nothing pending, or
  * currently ineligible for directed yield.
  */
3391 for (pass = 0; pass < 2 && !yielded && try; pass++) {
3392 kvm_for_each_vcpu(i, vcpu, kvm) {
3393 if (!pass && i <= last_boosted_vcpu) {
3394 i = last_boosted_vcpu;
3395 continue;
3396 } else if (pass && i > last_boosted_vcpu)
3397 break;
3398 if (!READ_ONCE(vcpu->ready))
3399 continue;
3400 if (vcpu == me)
3401 continue;
3402 if (rcuwait_active(&vcpu->wait) &&
3403 !vcpu_dy_runnable(vcpu))
3404 continue;
3405 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3406 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3407 !kvm_arch_vcpu_in_kernel(vcpu))
3408 continue;
3409 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3410 continue;
3411
3412 yielded = kvm_vcpu_yield_to(vcpu);
3413 if (yielded > 0) {
3414 kvm->last_boosted_vcpu = i;
3415 break;
3416 } else if (yielded < 0) {
3417 try--;
3418 if (!try)
3419 break;
3420 }
3421 }
3422 }
3423 kvm_vcpu_set_in_spin_loop(me, false);
3424
3425
3426 kvm_vcpu_set_dy_eligible(me, false);
3427}
3428EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3429
3430static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3431{
3432#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3433 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3434 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3435 kvm->dirty_ring_size / PAGE_SIZE);
3436#else
3437 return false;
3438#endif
3439}
3440
3441static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3442{
3443 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3444 struct page *page;
3445
3446 if (vmf->pgoff == 0)
3447 page = virt_to_page(vcpu->run);
3448#ifdef CONFIG_X86
3449 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3450 page = virt_to_page(vcpu->arch.pio_data);
3451#endif
3452#ifdef CONFIG_KVM_MMIO
3453 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3454 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3455#endif
3456 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3457 page = kvm_dirty_ring_get_page(
3458 &vcpu->dirty_ring,
3459 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3460 else
3461 return kvm_arch_vcpu_fault(vcpu, vmf);
3462 get_page(page);
3463 vmf->page = page;
3464 return 0;
3465}
3466
3467static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3468 .fault = kvm_vcpu_fault,
3469};
3470
3471static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3472{
3473 struct kvm_vcpu *vcpu = file->private_data;
3474 unsigned long pages = vma_pages(vma);
3475
3476 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3477 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3478 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3479 return -EINVAL;
3480
3481 vma->vm_ops = &kvm_vcpu_vm_ops;
3482 return 0;
3483}
3484
3485static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3486{
3487 struct kvm_vcpu *vcpu = filp->private_data;
3488
3489 kvm_put_kvm(vcpu->kvm);
3490 return 0;
3491}
3492
3493static struct file_operations kvm_vcpu_fops = {
3494 .release = kvm_vcpu_release,
3495 .unlocked_ioctl = kvm_vcpu_ioctl,
3496 .mmap = kvm_vcpu_mmap,
3497 .llseek = noop_llseek,
3498 KVM_COMPAT(kvm_vcpu_compat_ioctl),
3499};
3500
3501
3502
3503
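/*
 * Allocate an anonymous inode fd through which userspace drives this vCPU.
 */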
3504static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3505{
3506 char name[8 + 1 + ITOA_MAX_LEN + 1];
3507
3508 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3509 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3510}
3511
3512static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3513{
3514#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3515 struct dentry *debugfs_dentry;
3516 char dir_name[ITOA_MAX_LEN * 2];
3517
3518 if (!debugfs_initialized())
3519 return;
3520
3521 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3522 debugfs_dentry = debugfs_create_dir(dir_name,
3523 vcpu->kvm->debugfs_dentry);
3524
3525 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3526#endif
3527}
3528
3529
3530
3531
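/*
 * Create a vCPU with the given id and expose it to userspace as a new fd.
 */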
3532static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3533{
3534 int r;
3535 struct kvm_vcpu *vcpu;
3536 struct page *page;
3537
3538 if (id >= KVM_MAX_VCPU_IDS)
3539 return -EINVAL;
3540
3541 mutex_lock(&kvm->lock);
3542 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3543 mutex_unlock(&kvm->lock);
3544 return -EINVAL;
3545 }
3546
3547 kvm->created_vcpus++;
3548 mutex_unlock(&kvm->lock);
3549
3550 r = kvm_arch_vcpu_precreate(kvm, id);
3551 if (r)
3552 goto vcpu_decrement;
3553
3554 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3555 if (!vcpu) {
3556 r = -ENOMEM;
3557 goto vcpu_decrement;
3558 }
3559
3560 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3561 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3562 if (!page) {
3563 r = -ENOMEM;
3564 goto vcpu_free;
3565 }
3566 vcpu->run = page_address(page);
3567
3568 kvm_vcpu_init(vcpu, kvm, id);
3569
3570 r = kvm_arch_vcpu_create(vcpu);
3571 if (r)
3572 goto vcpu_free_run_page;
3573
3574 if (kvm->dirty_ring_size) {
3575 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3576 id, kvm->dirty_ring_size);
3577 if (r)
3578 goto arch_vcpu_destroy;
3579 }
3580
3581 mutex_lock(&kvm->lock);
3582 if (kvm_get_vcpu_by_id(kvm, id)) {
3583 r = -EEXIST;
3584 goto unlock_vcpu_destroy;
3585 }
3586
3587 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3588 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3589
3590
3591 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3592 task_pid_nr(current), id);
3593
3594
3595 kvm_get_kvm(kvm);
3596 r = create_vcpu_fd(vcpu);
3597 if (r < 0) {
3598 kvm_put_kvm_no_destroy(kvm);
3599 goto unlock_vcpu_destroy;
3600 }
3601
3602 kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3603
 /*
  * Pairs with smp_rmb() in kvm_get_vcpu(): publish the vcpus[] entry
  * before the updated online_vcpus count becomes visible.
  */
3608 smp_wmb();
3609 atomic_inc(&kvm->online_vcpus);
3610
3611 mutex_unlock(&kvm->lock);
3612 kvm_arch_vcpu_postcreate(vcpu);
3613 kvm_create_vcpu_debugfs(vcpu);
3614 return r;
3615
3616unlock_vcpu_destroy:
3617 mutex_unlock(&kvm->lock);
3618 kvm_dirty_ring_free(&vcpu->dirty_ring);
3619arch_vcpu_destroy:
3620 kvm_arch_vcpu_destroy(vcpu);
3621vcpu_free_run_page:
3622 free_page((unsigned long)vcpu->run);
3623vcpu_free:
3624 kmem_cache_free(kvm_vcpu_cache, vcpu);
3625vcpu_decrement:
3626 mutex_lock(&kvm->lock);
3627 kvm->created_vcpus--;
3628 mutex_unlock(&kvm->lock);
3629 return r;
3630}
3631
3632static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3633{
3634 if (sigset) {
3635 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3636 vcpu->sigset_active = 1;
3637 vcpu->sigset = *sigset;
3638 } else
3639 vcpu->sigset_active = 0;
3640 return 0;
3641}
3642
3643static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3644 size_t size, loff_t *offset)
3645{
3646 struct kvm_vcpu *vcpu = file->private_data;
3647
3648 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3649 &kvm_vcpu_stats_desc[0], &vcpu->stat,
3650 sizeof(vcpu->stat), user_buffer, size, offset);
3651}
3652
3653static const struct file_operations kvm_vcpu_stats_fops = {
3654 .read = kvm_vcpu_stats_read,
3655 .llseek = noop_llseek,
3656};
3657
3658static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3659{
3660 int fd;
3661 struct file *file;
3662 char name[15 + ITOA_MAX_LEN + 1];
3663
3664 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3665
3666 fd = get_unused_fd_flags(O_CLOEXEC);
3667 if (fd < 0)
3668 return fd;
3669
3670 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3671 if (IS_ERR(file)) {
3672 put_unused_fd(fd);
3673 return PTR_ERR(file);
3674 }
3675 file->f_mode |= FMODE_PREAD;
3676 fd_install(fd, file);
3677
3678 return fd;
3679}
3680
3681static long kvm_vcpu_ioctl(struct file *filp,
3682 unsigned int ioctl, unsigned long arg)
3683{
3684 struct kvm_vcpu *vcpu = filp->private_data;
3685 void __user *argp = (void __user *)arg;
3686 int r;
3687 struct kvm_fpu *fpu = NULL;
3688 struct kvm_sregs *kvm_sregs = NULL;
3689
3690 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3691 return -EIO;
3692
3693 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3694 return -EINVAL;
3695
3696
3697
3698
3699
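 /*
  * Some architectures have vcpu ioctls that must not take the vcpu
  * mutex; let the arch code handle those before it is acquired below.
  */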
3700 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3701 if (r != -ENOIOCTLCMD)
3702 return r;
3703
3704 if (mutex_lock_killable(&vcpu->mutex))
3705 return -EINTR;
3706 switch (ioctl) {
3707 case KVM_RUN: {
3708 struct pid *oldpid;
3709 r = -EINVAL;
3710 if (arg)
3711 goto out;
3712 oldpid = rcu_access_pointer(vcpu->pid);
3713 if (unlikely(oldpid != task_pid(current))) {
3714
3715 struct pid *newpid;
3716
3717 r = kvm_arch_vcpu_run_pid_change(vcpu);
3718 if (r)
3719 break;
3720
3721 newpid = get_task_pid(current, PIDTYPE_PID);
3722 rcu_assign_pointer(vcpu->pid, newpid);
3723 if (oldpid)
3724 synchronize_rcu();
3725 put_pid(oldpid);
3726 }
3727 r = kvm_arch_vcpu_ioctl_run(vcpu);
3728 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3729 break;
3730 }
3731 case KVM_GET_REGS: {
3732 struct kvm_regs *kvm_regs;
3733
3734 r = -ENOMEM;
3735 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3736 if (!kvm_regs)
3737 goto out;
3738 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3739 if (r)
3740 goto out_free1;
3741 r = -EFAULT;
3742 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3743 goto out_free1;
3744 r = 0;
3745out_free1:
3746 kfree(kvm_regs);
3747 break;
3748 }
3749 case KVM_SET_REGS: {
3750 struct kvm_regs *kvm_regs;
3751
3752 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3753 if (IS_ERR(kvm_regs)) {
3754 r = PTR_ERR(kvm_regs);
3755 goto out;
3756 }
3757 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3758 kfree(kvm_regs);
3759 break;
3760 }
3761 case KVM_GET_SREGS: {
3762 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3763 GFP_KERNEL_ACCOUNT);
3764 r = -ENOMEM;
3765 if (!kvm_sregs)
3766 goto out;
3767 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3768 if (r)
3769 goto out;
3770 r = -EFAULT;
3771 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3772 goto out;
3773 r = 0;
3774 break;
3775 }
3776 case KVM_SET_SREGS: {
3777 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3778 if (IS_ERR(kvm_sregs)) {
3779 r = PTR_ERR(kvm_sregs);
3780 kvm_sregs = NULL;
3781 goto out;
3782 }
3783 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3784 break;
3785 }
3786 case KVM_GET_MP_STATE: {
3787 struct kvm_mp_state mp_state;
3788
3789 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3790 if (r)
3791 goto out;
3792 r = -EFAULT;
3793 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3794 goto out;
3795 r = 0;
3796 break;
3797 }
3798 case KVM_SET_MP_STATE: {
3799 struct kvm_mp_state mp_state;
3800
3801 r = -EFAULT;
3802 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3803 goto out;
3804 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3805 break;
3806 }
3807 case KVM_TRANSLATE: {
3808 struct kvm_translation tr;
3809
3810 r = -EFAULT;
3811 if (copy_from_user(&tr, argp, sizeof(tr)))
3812 goto out;
3813 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3814 if (r)
3815 goto out;
3816 r = -EFAULT;
3817 if (copy_to_user(argp, &tr, sizeof(tr)))
3818 goto out;
3819 r = 0;
3820 break;
3821 }
3822 case KVM_SET_GUEST_DEBUG: {
3823 struct kvm_guest_debug dbg;
3824
3825 r = -EFAULT;
3826 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3827 goto out;
3828 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3829 break;
3830 }
3831 case KVM_SET_SIGNAL_MASK: {
3832 struct kvm_signal_mask __user *sigmask_arg = argp;
3833 struct kvm_signal_mask kvm_sigmask;
3834 sigset_t sigset, *p;
3835
3836 p = NULL;
3837 if (argp) {
3838 r = -EFAULT;
3839 if (copy_from_user(&kvm_sigmask, argp,
3840 sizeof(kvm_sigmask)))
3841 goto out;
3842 r = -EINVAL;
3843 if (kvm_sigmask.len != sizeof(sigset))
3844 goto out;
3845 r = -EFAULT;
3846 if (copy_from_user(&sigset, sigmask_arg->sigset,
3847 sizeof(sigset)))
3848 goto out;
3849 p = &sigset;
3850 }
3851 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3852 break;
3853 }
3854 case KVM_GET_FPU: {
3855 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3856 r = -ENOMEM;
3857 if (!fpu)
3858 goto out;
3859 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3860 if (r)
3861 goto out;
3862 r = -EFAULT;
3863 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3864 goto out;
3865 r = 0;
3866 break;
3867 }
3868 case KVM_SET_FPU: {
3869 fpu = memdup_user(argp, sizeof(*fpu));
3870 if (IS_ERR(fpu)) {
3871 r = PTR_ERR(fpu);
3872 fpu = NULL;
3873 goto out;
3874 }
3875 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3876 break;
3877 }
3878 case KVM_GET_STATS_FD: {
3879 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3880 break;
3881 }
3882 default:
3883 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3884 }
3885out:
3886 mutex_unlock(&vcpu->mutex);
3887 kfree(fpu);
3888 kfree(kvm_sregs);
3889 return r;
3890}
3891
3892#ifdef CONFIG_KVM_COMPAT
3893static long kvm_vcpu_compat_ioctl(struct file *filp,
3894 unsigned int ioctl, unsigned long arg)
3895{
3896 struct kvm_vcpu *vcpu = filp->private_data;
3897 void __user *argp = compat_ptr(arg);
3898 int r;
3899
3900 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3901 return -EIO;
3902
3903 switch (ioctl) {
3904 case KVM_SET_SIGNAL_MASK: {
3905 struct kvm_signal_mask __user *sigmask_arg = argp;
3906 struct kvm_signal_mask kvm_sigmask;
3907 sigset_t sigset;
3908
3909 if (argp) {
3910 r = -EFAULT;
3911 if (copy_from_user(&kvm_sigmask, argp,
3912 sizeof(kvm_sigmask)))
3913 goto out;
3914 r = -EINVAL;
3915 if (kvm_sigmask.len != sizeof(compat_sigset_t))
3916 goto out;
3917 r = -EFAULT;
3918 if (get_compat_sigset(&sigset,
3919 (compat_sigset_t __user *)sigmask_arg->sigset))
3920 goto out;
3921 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3922 } else
3923 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3924 break;
3925 }
3926 default:
3927 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3928 }
3929
3930out:
3931 return r;
3932}
3933#endif
3934
3935static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3936{
3937 struct kvm_device *dev = filp->private_data;
3938
3939 if (dev->ops->mmap)
3940 return dev->ops->mmap(dev, vma);
3941
3942 return -ENODEV;
3943}
3944
3945static int kvm_device_ioctl_attr(struct kvm_device *dev,
3946 int (*accessor)(struct kvm_device *dev,
3947 struct kvm_device_attr *attr),
3948 unsigned long arg)
3949{
3950 struct kvm_device_attr attr;
3951
3952 if (!accessor)
3953 return -EPERM;
3954
3955 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3956 return -EFAULT;
3957
3958 return accessor(dev, &attr);
3959}
3960
3961static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3962 unsigned long arg)
3963{
3964 struct kvm_device *dev = filp->private_data;
3965
3966 if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
3967 return -EIO;
3968
3969 switch (ioctl) {
3970 case KVM_SET_DEVICE_ATTR:
3971 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3972 case KVM_GET_DEVICE_ATTR:
3973 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3974 case KVM_HAS_DEVICE_ATTR:
3975 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3976 default:
3977 if (dev->ops->ioctl)
3978 return dev->ops->ioctl(dev, ioctl, arg);
3979
3980 return -ENOTTY;
3981 }
3982}
3983
3984static int kvm_device_release(struct inode *inode, struct file *filp)
3985{
3986 struct kvm_device *dev = filp->private_data;
3987 struct kvm *kvm = dev->kvm;
3988
3989 if (dev->ops->release) {
3990 mutex_lock(&kvm->lock);
3991 list_del(&dev->vm_node);
3992 dev->ops->release(dev);
3993 mutex_unlock(&kvm->lock);
3994 }
3995
3996 kvm_put_kvm(kvm);
3997 return 0;
3998}
3999
4000static const struct file_operations kvm_device_fops = {
4001 .unlocked_ioctl = kvm_device_ioctl,
4002 .release = kvm_device_release,
4003 KVM_COMPAT(kvm_device_ioctl),
4004 .mmap = kvm_device_mmap,
4005};
4006
4007struct kvm_device *kvm_device_from_filp(struct file *filp)
4008{
4009 if (filp->f_op != &kvm_device_fops)
4010 return NULL;
4011
4012 return filp->private_data;
4013}
4014
4015static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4016#ifdef CONFIG_KVM_MPIC
4017 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4018 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4019#endif
4020};
4021
4022int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4023{
4024 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4025 return -ENOSPC;
4026
4027 if (kvm_device_ops_table[type] != NULL)
4028 return -EEXIST;
4029
4030 kvm_device_ops_table[type] = ops;
4031 return 0;
4032}
4033
4034void kvm_unregister_device_ops(u32 type)
4035{
4036 if (kvm_device_ops_table[type] != NULL)
4037 kvm_device_ops_table[type] = NULL;
4038}
4039
4040static int kvm_ioctl_create_device(struct kvm *kvm,
4041 struct kvm_create_device *cd)
4042{
4043 const struct kvm_device_ops *ops = NULL;
4044 struct kvm_device *dev;
4045 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4046 int type;
4047 int ret;
4048
4049 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4050 return -ENODEV;
4051
4052 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4053 ops = kvm_device_ops_table[type];
4054 if (ops == NULL)
4055 return -ENODEV;
4056
4057 if (test)
4058 return 0;
4059
4060 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4061 if (!dev)
4062 return -ENOMEM;
4063
4064 dev->ops = ops;
4065 dev->kvm = kvm;
4066
4067 mutex_lock(&kvm->lock);
4068 ret = ops->create(dev, type);
4069 if (ret < 0) {
4070 mutex_unlock(&kvm->lock);
4071 kfree(dev);
4072 return ret;
4073 }
4074 list_add(&dev->vm_node, &kvm->devices);
4075 mutex_unlock(&kvm->lock);
4076
4077 if (ops->init)
4078 ops->init(dev);
4079
4080 kvm_get_kvm(kvm);
4081 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4082 if (ret < 0) {
4083 kvm_put_kvm_no_destroy(kvm);
4084 mutex_lock(&kvm->lock);
4085 list_del(&dev->vm_node);
4086 mutex_unlock(&kvm->lock);
4087 ops->destroy(dev);
4088 return ret;
4089 }
4090
4091 cd->fd = ret;
4092 return 0;
4093}
4094
4095static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4096{
4097 switch (arg) {
4098 case KVM_CAP_USER_MEMORY:
4099 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4100 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4101 case KVM_CAP_INTERNAL_ERROR_DATA:
4102#ifdef CONFIG_HAVE_KVM_MSI
4103 case KVM_CAP_SIGNAL_MSI:
4104#endif
4105#ifdef CONFIG_HAVE_KVM_IRQFD
4106 case KVM_CAP_IRQFD:
4107 case KVM_CAP_IRQFD_RESAMPLE:
4108#endif
4109 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4110 case KVM_CAP_CHECK_EXTENSION_VM:
4111 case KVM_CAP_ENABLE_CAP_VM:
4112 case KVM_CAP_HALT_POLL:
4113 return 1;
4114#ifdef CONFIG_KVM_MMIO
4115 case KVM_CAP_COALESCED_MMIO:
4116 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4117 case KVM_CAP_COALESCED_PIO:
4118 return 1;
4119#endif
4120#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4121 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4122 return KVM_DIRTY_LOG_MANUAL_CAPS;
4123#endif
4124#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4125 case KVM_CAP_IRQ_ROUTING:
4126 return KVM_MAX_IRQ_ROUTES;
4127#endif
4128#if KVM_ADDRESS_SPACE_NUM > 1
4129 case KVM_CAP_MULTI_ADDRESS_SPACE:
4130 return KVM_ADDRESS_SPACE_NUM;
4131#endif
4132 case KVM_CAP_NR_MEMSLOTS:
4133 return KVM_USER_MEM_SLOTS;
4134 case KVM_CAP_DIRTY_LOG_RING:
4135#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4136 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4137#else
4138 return 0;
4139#endif
4140 case KVM_CAP_BINARY_STATS_FD:
4141 return 1;
4142 default:
4143 break;
4144 }
4145 return kvm_vm_ioctl_check_extension(kvm, arg);
4146}
4147
4148static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4149{
4150 int r;
4151
4152 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4153 return -EINVAL;
4154
4155
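 /* The ring size must be a non-zero power of two. */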
4156 if (!size || (size & (size - 1)))
4157 return -EINVAL;
4158
4159
4160 if (size < kvm_dirty_ring_get_rsvd_entries() *
4161 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4162 return -EINVAL;
4163
4164 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4165 sizeof(struct kvm_dirty_gfn))
4166 return -E2BIG;
4167
4168
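 /* The ring size can only be set once per VM. */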
4169 if (kvm->dirty_ring_size)
4170 return -EINVAL;
4171
4172 mutex_lock(&kvm->lock);
4173
4174 if (kvm->created_vcpus) {
4175
4176 r = -EINVAL;
4177 } else {
4178 kvm->dirty_ring_size = size;
4179 r = 0;
4180 }
4181
4182 mutex_unlock(&kvm->lock);
4183 return r;
4184}
4185
4186static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4187{
4188 int i;
4189 struct kvm_vcpu *vcpu;
4190 int cleared = 0;
4191
4192 if (!kvm->dirty_ring_size)
4193 return -EINVAL;
4194
4195 mutex_lock(&kvm->slots_lock);
4196
4197 kvm_for_each_vcpu(i, vcpu, kvm)
4198 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4199
4200 mutex_unlock(&kvm->slots_lock);
4201
4202 if (cleared)
4203 kvm_flush_remote_tlbs(kvm);
4204
4205 return cleared;
4206}
4207
4208int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4209 struct kvm_enable_cap *cap)
4210{
4211 return -EINVAL;
4212}
4213
4214static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4215 struct kvm_enable_cap *cap)
4216{
4217 switch (cap->cap) {
4218#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4219 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4220 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4221
4222 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4223 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4224
4225 if (cap->flags || (cap->args[0] & ~allowed_options))
4226 return -EINVAL;
4227 kvm->manual_dirty_log_protect = cap->args[0];
4228 return 0;
4229 }
4230#endif
4231 case KVM_CAP_HALT_POLL: {
4232 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4233 return -EINVAL;
4234
4235 kvm->max_halt_poll_ns = cap->args[0];
4236 return 0;
4237 }
4238 case KVM_CAP_DIRTY_LOG_RING:
4239 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4240 default:
4241 return kvm_vm_ioctl_enable_cap(kvm, cap);
4242 }
4243}
4244
4245static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4246 size_t size, loff_t *offset)
4247{
4248 struct kvm *kvm = file->private_data;
4249
4250 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4251 &kvm_vm_stats_desc[0], &kvm->stat,
4252 sizeof(kvm->stat), user_buffer, size, offset);
4253}
4254
4255static const struct file_operations kvm_vm_stats_fops = {
4256 .read = kvm_vm_stats_read,
4257 .llseek = noop_llseek,
4258};
4259
4260static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4261{
4262 int fd;
4263 struct file *file;
4264
4265 fd = get_unused_fd_flags(O_CLOEXEC);
4266 if (fd < 0)
4267 return fd;
4268
4269 file = anon_inode_getfile("kvm-vm-stats",
4270 &kvm_vm_stats_fops, kvm, O_RDONLY);
4271 if (IS_ERR(file)) {
4272 put_unused_fd(fd);
4273 return PTR_ERR(file);
4274 }
4275 file->f_mode |= FMODE_PREAD;
4276 fd_install(fd, file);
4277
4278 return fd;
4279}
4280
4281static long kvm_vm_ioctl(struct file *filp,
4282 unsigned int ioctl, unsigned long arg)
4283{
4284 struct kvm *kvm = filp->private_data;
4285 void __user *argp = (void __user *)arg;
4286 int r;
4287
4288 if (kvm->mm != current->mm || kvm->vm_dead)
4289 return -EIO;
4290 switch (ioctl) {
4291 case KVM_CREATE_VCPU:
4292 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4293 break;
4294 case KVM_ENABLE_CAP: {
4295 struct kvm_enable_cap cap;
4296
4297 r = -EFAULT;
4298 if (copy_from_user(&cap, argp, sizeof(cap)))
4299 goto out;
4300 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4301 break;
4302 }
4303 case KVM_SET_USER_MEMORY_REGION: {
4304 struct kvm_userspace_memory_region kvm_userspace_mem;
4305
4306 r = -EFAULT;
4307 if (copy_from_user(&kvm_userspace_mem, argp,
4308 sizeof(kvm_userspace_mem)))
4309 goto out;
4310
4311 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4312 break;
4313 }
4314 case KVM_GET_DIRTY_LOG: {
4315 struct kvm_dirty_log log;
4316
4317 r = -EFAULT;
4318 if (copy_from_user(&log, argp, sizeof(log)))
4319 goto out;
4320 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4321 break;
4322 }
4323#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4324 case KVM_CLEAR_DIRTY_LOG: {
4325 struct kvm_clear_dirty_log log;
4326
4327 r = -EFAULT;
4328 if (copy_from_user(&log, argp, sizeof(log)))
4329 goto out;
4330 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4331 break;
4332 }
4333#endif
4334#ifdef CONFIG_KVM_MMIO
4335 case KVM_REGISTER_COALESCED_MMIO: {
4336 struct kvm_coalesced_mmio_zone zone;
4337
4338 r = -EFAULT;
4339 if (copy_from_user(&zone, argp, sizeof(zone)))
4340 goto out;
4341 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4342 break;
4343 }
4344 case KVM_UNREGISTER_COALESCED_MMIO: {
4345 struct kvm_coalesced_mmio_zone zone;
4346
4347 r = -EFAULT;
4348 if (copy_from_user(&zone, argp, sizeof(zone)))
4349 goto out;
4350 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4351 break;
4352 }
4353#endif
4354 case KVM_IRQFD: {
4355 struct kvm_irqfd data;
4356
4357 r = -EFAULT;
4358 if (copy_from_user(&data, argp, sizeof(data)))
4359 goto out;
4360 r = kvm_irqfd(kvm, &data);
4361 break;
4362 }
4363 case KVM_IOEVENTFD: {
4364 struct kvm_ioeventfd data;
4365
4366 r = -EFAULT;
4367 if (copy_from_user(&data, argp, sizeof(data)))
4368 goto out;
4369 r = kvm_ioeventfd(kvm, &data);
4370 break;
4371 }
4372#ifdef CONFIG_HAVE_KVM_MSI
4373 case KVM_SIGNAL_MSI: {
4374 struct kvm_msi msi;
4375
4376 r = -EFAULT;
4377 if (copy_from_user(&msi, argp, sizeof(msi)))
4378 goto out;
4379 r = kvm_send_userspace_msi(kvm, &msi);
4380 break;
4381 }
4382#endif
4383#ifdef __KVM_HAVE_IRQ_LINE
4384 case KVM_IRQ_LINE_STATUS:
4385 case KVM_IRQ_LINE: {
4386 struct kvm_irq_level irq_event;
4387
4388 r = -EFAULT;
4389 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4390 goto out;
4391
4392 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4393 ioctl == KVM_IRQ_LINE_STATUS);
4394 if (r)
4395 goto out;
4396
4397 r = -EFAULT;
4398 if (ioctl == KVM_IRQ_LINE_STATUS) {
4399 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4400 goto out;
4401 }
4402
4403 r = 0;
4404 break;
4405 }
4406#endif
4407#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4408 case KVM_SET_GSI_ROUTING: {
4409 struct kvm_irq_routing routing;
4410 struct kvm_irq_routing __user *urouting;
4411 struct kvm_irq_routing_entry *entries = NULL;
4412
4413 r = -EFAULT;
4414 if (copy_from_user(&routing, argp, sizeof(routing)))
4415 goto out;
4416 r = -EINVAL;
4417 if (!kvm_arch_can_set_irq_routing(kvm))
4418 goto out;
4419 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4420 goto out;
4421 if (routing.flags)
4422 goto out;
4423 if (routing.nr) {
4424 urouting = argp;
4425 entries = vmemdup_user(urouting->entries,
4426 array_size(sizeof(*entries),
4427 routing.nr));
4428 if (IS_ERR(entries)) {
4429 r = PTR_ERR(entries);
4430 goto out;
4431 }
4432 }
4433 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4434 routing.flags);
4435 kvfree(entries);
4436 break;
4437 }
4438#endif
4439 case KVM_CREATE_DEVICE: {
4440 struct kvm_create_device cd;
4441
4442 r = -EFAULT;
4443 if (copy_from_user(&cd, argp, sizeof(cd)))
4444 goto out;
4445
4446 r = kvm_ioctl_create_device(kvm, &cd);
4447 if (r)
4448 goto out;
4449
4450 r = -EFAULT;
4451 if (copy_to_user(argp, &cd, sizeof(cd)))
4452 goto out;
4453
4454 r = 0;
4455 break;
4456 }
4457 case KVM_CHECK_EXTENSION:
4458 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4459 break;
4460 case KVM_RESET_DIRTY_RINGS:
4461 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4462 break;
4463 case KVM_GET_STATS_FD:
4464 r = kvm_vm_ioctl_get_stats_fd(kvm);
4465 break;
4466 default:
4467 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4468 }
4469out:
4470 return r;
4471}
4472
4473#ifdef CONFIG_KVM_COMPAT
4474struct compat_kvm_dirty_log {
4475 __u32 slot;
4476 __u32 padding1;
4477 union {
4478 compat_uptr_t dirty_bitmap;
4479 __u64 padding2;
4480 };
4481};
4482
4483struct compat_kvm_clear_dirty_log {
4484 __u32 slot;
4485 __u32 num_pages;
4486 __u64 first_page;
4487 union {
4488 compat_uptr_t dirty_bitmap;
4489 __u64 padding2;
4490 };
4491};
4492
4493static long kvm_vm_compat_ioctl(struct file *filp,
4494 unsigned int ioctl, unsigned long arg)
4495{
4496 struct kvm *kvm = filp->private_data;
4497 int r;
4498
4499 if (kvm->mm != current->mm || kvm->vm_dead)
4500 return -EIO;
4501 switch (ioctl) {
4502#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4503 case KVM_CLEAR_DIRTY_LOG: {
4504 struct compat_kvm_clear_dirty_log compat_log;
4505 struct kvm_clear_dirty_log log;
4506
4507 if (copy_from_user(&compat_log, (void __user *)arg,
4508 sizeof(compat_log)))
4509 return -EFAULT;
4510 log.slot = compat_log.slot;
4511 log.num_pages = compat_log.num_pages;
4512 log.first_page = compat_log.first_page;
4513 log.padding2 = compat_log.padding2;
4514 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4515
4516 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4517 break;
4518 }
4519#endif
4520 case KVM_GET_DIRTY_LOG: {
4521 struct compat_kvm_dirty_log compat_log;
4522 struct kvm_dirty_log log;
4523
4524 if (copy_from_user(&compat_log, (void __user *)arg,
4525 sizeof(compat_log)))
4526 return -EFAULT;
4527 log.slot = compat_log.slot;
4528 log.padding1 = compat_log.padding1;
4529 log.padding2 = compat_log.padding2;
4530 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4531
4532 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4533 break;
4534 }
4535 default:
4536 r = kvm_vm_ioctl(filp, ioctl, arg);
4537 }
4538 return r;
4539}
4540#endif
4541
4542static struct file_operations kvm_vm_fops = {
4543 .release = kvm_vm_release,
4544 .unlocked_ioctl = kvm_vm_ioctl,
4545 .llseek = noop_llseek,
4546 KVM_COMPAT(kvm_vm_compat_ioctl),
4547};
4548
4549bool file_is_kvm(struct file *file)
4550{
4551 return file && file->f_op == &kvm_vm_fops;
4552}
4553EXPORT_SYMBOL_GPL(file_is_kvm);
4554
4555static int kvm_dev_ioctl_create_vm(unsigned long type)
4556{
4557 int r;
4558 struct kvm *kvm;
4559 struct file *file;
4560
4561 kvm = kvm_create_vm(type);
4562 if (IS_ERR(kvm))
4563 return PTR_ERR(kvm);
4564#ifdef CONFIG_KVM_MMIO
4565 r = kvm_coalesced_mmio_init(kvm);
4566 if (r < 0)
4567 goto put_kvm;
4568#endif
4569 r = get_unused_fd_flags(O_CLOEXEC);
4570 if (r < 0)
4571 goto put_kvm;
4572
4573 snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4574 "kvm-%d", task_pid_nr(current));
4575
4576 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4577 if (IS_ERR(file)) {
4578 put_unused_fd(r);
4579 r = PTR_ERR(file);
4580 goto put_kvm;
4581 }
4582
 /*
  * Don't drop the VM reference directly past this point: it is now owned
  * by the file, so error paths release it via fput(file) rather than
  * kvm_put_kvm().
  */
4589 if (kvm_create_vm_debugfs(kvm, r) < 0) {
4590 put_unused_fd(r);
4591 fput(file);
4592 return -ENOMEM;
4593 }
4594 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4595
4596 fd_install(r, file);
4597 return r;
4598
4599put_kvm:
4600 kvm_put_kvm(kvm);
4601 return r;
4602}
4603
4604static long kvm_dev_ioctl(struct file *filp,
4605 unsigned int ioctl, unsigned long arg)
4606{
4607 long r = -EINVAL;
4608
4609 switch (ioctl) {
4610 case KVM_GET_API_VERSION:
4611 if (arg)
4612 goto out;
4613 r = KVM_API_VERSION;
4614 break;
4615 case KVM_CREATE_VM:
4616 r = kvm_dev_ioctl_create_vm(arg);
4617 break;
4618 case KVM_CHECK_EXTENSION:
4619 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4620 break;
4621 case KVM_GET_VCPU_MMAP_SIZE:
4622 if (arg)
4623 goto out;
4624 r = PAGE_SIZE;
4625#ifdef CONFIG_X86
4626 r += PAGE_SIZE;
4627#endif
4628#ifdef CONFIG_KVM_MMIO
4629 r += PAGE_SIZE;
4630#endif
4631 break;
4632 case KVM_TRACE_ENABLE:
4633 case KVM_TRACE_PAUSE:
4634 case KVM_TRACE_DISABLE:
4635 r = -EOPNOTSUPP;
4636 break;
4637 default:
4638 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4639 }
4640out:
4641 return r;
4642}
4643
4644static struct file_operations kvm_chardev_ops = {
4645 .unlocked_ioctl = kvm_dev_ioctl,
4646 .llseek = noop_llseek,
4647 KVM_COMPAT(kvm_dev_ioctl),
4648};
4649
4650static struct miscdevice kvm_dev = {
4651 KVM_MINOR,
4652 "kvm",
4653 &kvm_chardev_ops,
4654};
4655
4656static void hardware_enable_nolock(void *junk)
4657{
4658 int cpu = raw_smp_processor_id();
4659 int r;
4660
4661 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4662 return;
4663
4664 cpumask_set_cpu(cpu, cpus_hardware_enabled);
4665
4666 r = kvm_arch_hardware_enable();
4667
4668 if (r) {
4669 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4670 atomic_inc(&hardware_enable_failed);
4671 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4672 }
4673}
4674
4675static int kvm_starting_cpu(unsigned int cpu)
4676{
4677 raw_spin_lock(&kvm_count_lock);
4678 if (kvm_usage_count)
4679 hardware_enable_nolock(NULL);
4680 raw_spin_unlock(&kvm_count_lock);
4681 return 0;
4682}
4683
4684static void hardware_disable_nolock(void *junk)
4685{
4686 int cpu = raw_smp_processor_id();
4687
4688 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4689 return;
4690 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4691 kvm_arch_hardware_disable();
4692}
4693
4694static int kvm_dying_cpu(unsigned int cpu)
4695{
4696 raw_spin_lock(&kvm_count_lock);
4697 if (kvm_usage_count)
4698 hardware_disable_nolock(NULL);
4699 raw_spin_unlock(&kvm_count_lock);
4700 return 0;
4701}
4702
4703static void hardware_disable_all_nolock(void)
4704{
4705 BUG_ON(!kvm_usage_count);
4706
4707 kvm_usage_count--;
4708 if (!kvm_usage_count)
4709 on_each_cpu(hardware_disable_nolock, NULL, 1);
4710}
4711
4712static void hardware_disable_all(void)
4713{
4714 raw_spin_lock(&kvm_count_lock);
4715 hardware_disable_all_nolock();
4716 raw_spin_unlock(&kvm_count_lock);
4717}
4718
4719static int hardware_enable_all(void)
4720{
4721 int r = 0;
4722
4723 raw_spin_lock(&kvm_count_lock);
4724
4725 kvm_usage_count++;
4726 if (kvm_usage_count == 1) {
4727 atomic_set(&hardware_enable_failed, 0);
4728 on_each_cpu(hardware_enable_nolock, NULL, 1);
4729
4730 if (atomic_read(&hardware_enable_failed)) {
4731 hardware_disable_all_nolock();
4732 r = -EBUSY;
4733 }
4734 }
4735
4736 raw_spin_unlock(&kvm_count_lock);
4737
4738 return r;
4739}
4740
4741static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4742 void *v)
4743{
 /*
  * Disable hardware virtualization on all CPUs before rebooting; some
  * BIOSes hang if a CPU is left in VMX root mode when the firmware takes
  * back control.
  */
4750 pr_info("kvm: exiting hardware virtualization\n");
4751 kvm_rebooting = true;
4752 on_each_cpu(hardware_disable_nolock, NULL, 1);
4753 return NOTIFY_OK;
4754}
4755
4756static struct notifier_block kvm_reboot_notifier = {
4757 .notifier_call = kvm_reboot,
4758 .priority = 0,
4759};
4760
4761static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4762{
4763 int i;
4764
4765 for (i = 0; i < bus->dev_count; i++) {
4766 struct kvm_io_device *pos = bus->range[i].dev;
4767
4768 kvm_iodevice_destructor(pos);
4769 }
4770 kfree(bus);
4771}
4772
4773static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4774 const struct kvm_io_range *r2)
4775{
4776 gpa_t addr1 = r1->addr;
4777 gpa_t addr2 = r2->addr;
4778
4779 if (addr1 < addr2)
4780 return -1;
4781
4782
 /*
  * A zero-length r2 matches on the exact start address only; otherwise r1
  * compares equal when it falls entirely within r2.  Any ordering of
  * overlapping ranges is fine because kvm_io_bus_get_first_dev() walks
  * back to the first match.
  */
4787 if (r2->len) {
4788 addr1 += r1->len;
4789 addr2 += r2->len;
4790 }
4791
4792 if (addr1 > addr2)
4793 return 1;
4794
4795 return 0;
4796}
4797
4798static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4799{
4800 return kvm_io_bus_cmp(p1, p2);
4801}
4802
4803static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4804 gpa_t addr, int len)
4805{
4806 struct kvm_io_range *range, key;
4807 int off;
4808
4809 key = (struct kvm_io_range) {
4810 .addr = addr,
4811 .len = len,
4812 };
4813
4814 range = bsearch(&key, bus->range, bus->dev_count,
4815 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4816 if (range == NULL)
4817 return -ENOENT;
4818
4819 off = range - bus->range;
4820
4821 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4822 off--;
4823
4824 return off;
4825}
4826
4827static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4828 struct kvm_io_range *range, const void *val)
4829{
4830 int idx;
4831
4832 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4833 if (idx < 0)
4834 return -EOPNOTSUPP;
4835
4836 while (idx < bus->dev_count &&
4837 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4838 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4839 range->len, val))
4840 return idx;
4841 idx++;
4842 }
4843
4844 return -EOPNOTSUPP;
4845}
4846
4847
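/*
 * Dispatch a write on @bus_idx to the first registered device that accepts
 * the address range; returns 0 on success or a negative error if no device
 * claims it.
 */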
4848int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4849 int len, const void *val)
4850{
4851 struct kvm_io_bus *bus;
4852 struct kvm_io_range range;
4853 int r;
4854
4855 range = (struct kvm_io_range) {
4856 .addr = addr,
4857 .len = len,
4858 };
4859
4860 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4861 if (!bus)
4862 return -ENOMEM;
4863 r = __kvm_io_bus_write(vcpu, bus, &range, val);
4864 return r < 0 ? r : 0;
4865}
4866EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4867
4868
4869int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4870 gpa_t addr, int len, const void *val, long cookie)
4871{
4872 struct kvm_io_bus *bus;
4873 struct kvm_io_range range;
4874
4875 range = (struct kvm_io_range) {
4876 .addr = addr,
4877 .len = len,
4878 };
4879
4880 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4881 if (!bus)
4882 return -ENOMEM;
4883
4884
4885 if ((cookie >= 0) && (cookie < bus->dev_count) &&
4886 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4887 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4888 val))
4889 return cookie;
4890
4891
4892
4893
4894
4895 return __kvm_io_bus_write(vcpu, bus, &range, val);
4896}
4897
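/*
 * Read counterpart of __kvm_io_bus_write(): offer the read to every matching
 * device until one of them handles it.
 */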
static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                             struct kvm_io_range *range, void *val)
{
        int idx;

        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
        if (idx < 0)
                return -EOPNOTSUPP;

        while (idx < bus->dev_count &&
               kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
                if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
                                       range->len, val))
                        return idx;
                idx++;
        }

        return -EOPNOTSUPP;
}

/* kvm_io_bus_read - dispatch a read to the devices registered on a bus */
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
{
        struct kvm_io_bus *bus;
        struct kvm_io_range range;
        int r;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
        };

        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
        if (!bus)
                return -ENOMEM;
        r = __kvm_io_bus_read(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev)
{
        int i;
        struct kvm_io_bus *new_bus, *bus;
        struct kvm_io_range range;

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return -ENOMEM;

        /* Exclude ioeventfds, which are limited only by the maximum fd count. */
        if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
                return -ENOSPC;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
                          GFP_KERNEL_ACCOUNT);
        if (!new_bus)
                return -ENOMEM;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
                .dev = dev,
        };

        for (i = 0; i < bus->dev_count; i++)
                if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
                        break;

        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
        new_bus->dev_count++;
        new_bus->range[i] = range;
        memcpy(new_bus->range + i + 1, bus->range + i,
               (bus->dev_count - i) * sizeof(struct kvm_io_range));
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);

        return 0;
}

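/*
 * Remove @dev from the bus.  The caller must hold kvm->slots_lock.  If the
 * replacement bus cannot be allocated, the old bus is dropped entirely and
 * every remaining device on it is destroyed.
 */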
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev)
{
        int i, j;
        struct kvm_io_bus *new_bus, *bus;

        lockdep_assert_held(&kvm->slots_lock);

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return 0;

        for (i = 0; i < bus->dev_count; i++)
                if (bus->range[i].dev == dev)
                        break;

        if (i == bus->dev_count)
                return 0;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
        if (new_bus) {
                memcpy(new_bus, bus, struct_size(bus, range, i));
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                       flex_array_size(new_bus, range, new_bus->dev_count - i));
        }

        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);

        /*
         * If the shrunken bus could not be allocated, a NULL bus was installed
         * above and the remaining devices are orphaned: destroy them now.
         */
        if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                for (j = 0; j < bus->dev_count; j++) {
                        if (j == i)
                                continue;
                        kvm_iodevice_destructor(bus->range[j].dev);
                }
        }

        kfree(bus);
        return new_bus ? 0 : -ENOMEM;
}

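/* Look up the device registered at @addr, taking kvm->srcu around the walk. */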
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr)
{
        struct kvm_io_bus *bus;
        int dev_idx, srcu_idx;
        struct kvm_io_device *iodev = NULL;

        srcu_idx = srcu_read_lock(&kvm->srcu);

        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
        if (!bus)
                goto out_unlock;

        dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
        if (dev_idx < 0)
                goto out_unlock;

        iodev = bus->range[dev_idx].dev;

out_unlock:
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        return iodev;
}
EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);

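/*
 * Open helper for the per-VM debugfs stat files.  A reference to the VM is
 * held for the lifetime of the open file so the kvm structure cannot go away
 * while the file is in use.
 */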
static int kvm_debugfs_open(struct inode *inode, struct file *file,
                            int (*get)(void *, u64 *), int (*set)(void *, u64),
                            const char *fmt)
{
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
                                          inode->i_private;

        /*
         * Grab a reference to the VM before doing anything else: if the VM is
         * already on its way out, kvm_get_kvm_safe() fails and the open is
         * rejected rather than racing with the VM's destruction.
         */
        if (!kvm_get_kvm_safe(stat_data->kvm))
                return -ENOENT;

        if (simple_attr_open(inode, file, get,
                             kvm_stats_debugfs_mode(stat_data->desc) & 0222
                             ? set : NULL,
                             fmt)) {
                kvm_put_kvm(stat_data->kvm);
                return -ENOMEM;
        }

        return 0;
}

static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
                                          inode->i_private;

        simple_attr_release(inode, file);
        kvm_put_kvm(stat_data->kvm);

        return 0;
}

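/*
 * Accessors for a single statistic, identified by its byte offset into
 * struct kvm::stat (per-VM) or struct kvm_vcpu::stat (summed over all vCPUs).
 */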
static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
        *val = *(u64 *)((void *)(&kvm->stat) + offset);

        return 0;
}

static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
        *(u64 *)((void *)(&kvm->stat) + offset) = 0;

        return 0;
}

static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
{
        int i;
        struct kvm_vcpu *vcpu;

        *val = 0;

        kvm_for_each_vcpu(i, vcpu, kvm)
                *val += *(u64 *)((void *)(&vcpu->stat) + offset);

        return 0;
}

static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
{
        int i;
        struct kvm_vcpu *vcpu;

        kvm_for_each_vcpu(i, vcpu, kvm)
                *(u64 *)((void *)(&vcpu->stat) + offset) = 0;

        return 0;
}

static int kvm_stat_data_get(void *data, u64 *val)
{
        int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

        switch (stat_data->kind) {
        case KVM_STAT_VM:
                r = kvm_get_stat_per_vm(stat_data->kvm,
                                        stat_data->desc->desc.offset, val);
                break;
        case KVM_STAT_VCPU:
                r = kvm_get_stat_per_vcpu(stat_data->kvm,
                                          stat_data->desc->desc.offset, val);
                break;
        }

        return r;
}

static int kvm_stat_data_clear(void *data, u64 val)
{
        int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

        if (val)
                return -EINVAL;

        switch (stat_data->kind) {
        case KVM_STAT_VM:
                r = kvm_clear_stat_per_vm(stat_data->kvm,
                                          stat_data->desc->desc.offset);
                break;
        case KVM_STAT_VCPU:
                r = kvm_clear_stat_per_vcpu(stat_data->kvm,
                                            stat_data->desc->desc.offset);
                break;
        }

        return r;
}

static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
        __simple_attr_check_format("%llu\n", 0ull);
        return kvm_debugfs_open(inode, file, kvm_stat_data_get,
                                kvm_stat_data_clear, "%llu\n");
}

static const struct file_operations stat_fops_per_vm = {
        .owner = THIS_MODULE,
        .open = kvm_stat_data_open,
        .release = kvm_debugfs_release,
        .read = simple_attr_read,
        .write = simple_attr_write,
        .llseek = no_llseek,
};

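/*
 * The global debugfs files aggregate a statistic over every VM on the host;
 * @_offset encodes the offset of the counter within the stats structure.
 */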
static int vm_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        u64 tmp_val;

        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_get_stat_per_vm(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
        return 0;
}

static int vm_stat_clear(void *_offset, u64 val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        if (val)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_clear_stat_per_vm(kvm, offset);
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        u64 tmp_val;

        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
        return 0;
}

static int vcpu_stat_clear(void *_offset, u64 val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        if (val)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_clear_stat_per_vcpu(kvm, offset);
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
                        "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");

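/*
 * Send a KOBJ_CHANGE uevent for the kvm device whenever a VM is created or
 * destroyed, carrying the number of VMs ever created, the number currently
 * active, the owning PID and, when available, the VM's debugfs stats path.
 */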
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
        struct kobj_uevent_env *env;
        unsigned long long created, active;

        if (!kvm_dev.this_device || !kvm)
                return;

        mutex_lock(&kvm_lock);
        if (type == KVM_EVENT_CREATE_VM) {
                kvm_createvm_count++;
                kvm_active_vms++;
        } else if (type == KVM_EVENT_DESTROY_VM) {
                kvm_active_vms--;
        }
        created = kvm_createvm_count;
        active = kvm_active_vms;
        mutex_unlock(&kvm_lock);

        env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
        if (!env)
                return;

        add_uevent_var(env, "CREATED=%llu", created);
        add_uevent_var(env, "COUNT=%llu", active);

        if (type == KVM_EVENT_CREATE_VM) {
                add_uevent_var(env, "EVENT=create");
                kvm->userspace_pid = task_pid_nr(current);
        } else if (type == KVM_EVENT_DESTROY_VM) {
                add_uevent_var(env, "EVENT=destroy");
        }
        add_uevent_var(env, "PID=%d", kvm->userspace_pid);

        if (kvm->debugfs_dentry) {
                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);

                if (p) {
                        tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
                        if (!IS_ERR(tmp))
                                add_uevent_var(env, "STATS_PATH=%s", tmp);
                        kfree(p);
                }
        }

        env->envp[env->envp_idx++] = NULL;
        kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
        kfree(env);
}

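/*
 * Create the "kvm" debugfs directory and one file per VM and per vCPU
 * statistic; writable stats get the clearing fops, the rest are read-only.
 */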
static void kvm_init_debug(void)
{
        const struct file_operations *fops;
        const struct _kvm_stats_desc *pdesc;
        int i;

        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
                pdesc = &kvm_vm_stats_desc[i];
                if (kvm_stats_debugfs_mode(pdesc) & 0222)
                        fops = &vm_stat_fops;
                else
                        fops = &vm_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm_debugfs_dir,
                                    (void *)(long)pdesc->desc.offset, fops);
        }

        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
                pdesc = &kvm_vcpu_stats_desc[i];
                if (kvm_stats_debugfs_mode(pdesc) & 0222)
                        fops = &vcpu_stat_fops;
                else
                        fops = &vcpu_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm_debugfs_dir,
                                    (void *)(long)pdesc->desc.offset, fops);
        }
}

static int kvm_suspend(void)
{
        if (kvm_usage_count)
                hardware_disable_nolock(NULL);
        return 0;
}

static void kvm_resume(void)
{
        if (kvm_usage_count) {
#ifdef CONFIG_LOCKDEP
                WARN_ON(lockdep_is_held(&kvm_count_lock));
#endif
                hardware_enable_nolock(NULL);
        }
}

static struct syscore_ops kvm_syscore_ops = {
        .suspend = kvm_suspend,
        .resume = kvm_resume,
};

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
        return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        WRITE_ONCE(vcpu->preempted, false);
        WRITE_ONCE(vcpu->ready, false);

        __this_cpu_write(kvm_running_vcpu, vcpu);
        kvm_arch_sched_in(vcpu, cpu);
        kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        if (current->on_rq) {
                WRITE_ONCE(vcpu->preempted, true);
                WRITE_ONCE(vcpu->ready, true);
        }
        kvm_arch_vcpu_put(vcpu);
        __this_cpu_write(kvm_running_vcpu, NULL);
}

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * The per-CPU variable is read with preemption disabled, but the resolved
 * pointer may be used after preemption is re-enabled: the value is only
 * meaningful for the task that is actually running a vCPU, and for that
 * task it stays valid even across preemption or migration, because the
 * preempt notifiers above keep the per-CPU slot in sync on every sched
 * in/out.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
        struct kvm_vcpu *vcpu;

        preempt_disable();
        vcpu = __this_cpu_read(kvm_running_vcpu);
        preempt_enable();

        return vcpu;
}
EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
        return &kvm_running_vcpu;
}

struct kvm_cpu_compat_check {
        void *opaque;
        int *ret;
};

static void check_processor_compat(void *data)
{
        struct kvm_cpu_compat_check *c = data;

        *c->ret = kvm_arch_check_processor_compat(c->opaque);
}

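/*
 * kvm_init - common KVM initialization, called from the architecture module's
 * init routine.  @opaque is passed through unchanged to the kvm_arch_* hooks,
 * and @vcpu_size/@vcpu_align describe the arch-specific vcpu structure so a
 * dedicated kmem cache can be created for it.
 */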
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
             struct module *module)
{
        struct kvm_cpu_compat_check c;
        int r;
        int cpu;

        r = kvm_arch_init(opaque);
        if (r)
                goto out_fail;

        /*
         * kvm_arch_init() must run before anything else so that it can
         * reject a second, conflicting implementation (e.g. another vendor
         * module on x86) before any of the shared state below is set up.
         */
        r = kvm_irqfd_init();
        if (r)
                goto out_irqfd;

        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
                goto out_free_0;
        }

        r = kvm_arch_hardware_setup(opaque);
        if (r < 0)
                goto out_free_1;

        c.ret = &r;
        c.opaque = opaque;
        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu, check_processor_compat, &c, 1);
                if (r < 0)
                        goto out_free_2;
        }

        r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
                                      kvm_starting_cpu, kvm_dying_cpu);
        if (r)
                goto out_free_2;
        register_reboot_notifier(&kvm_reboot_notifier);

        /* A kmem cache lets us meet the alignment requirements of the vcpu. */
        if (!vcpu_align)
                vcpu_align = __alignof__(struct kvm_vcpu);
        kvm_vcpu_cache =
                kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
                                           SLAB_ACCOUNT,
                                           offsetof(struct kvm_vcpu, arch),
                                           offsetofend(struct kvm_vcpu, stats_id)
                                           - offsetof(struct kvm_vcpu, arch),
                                           NULL);
        if (!kvm_vcpu_cache) {
                r = -ENOMEM;
                goto out_free_3;
        }

        for_each_possible_cpu(cpu) {
                if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
                                            GFP_KERNEL, cpu_to_node(cpu))) {
                        r = -ENOMEM;
                        goto out_free_4;
                }
        }

        r = kvm_async_pf_init();
        if (r)
                goto out_free_5;

        kvm_chardev_ops.owner = module;
        kvm_vm_fops.owner = module;
        kvm_vcpu_fops.owner = module;

        r = misc_register(&kvm_dev);
        if (r) {
                pr_err("kvm: misc device register failed\n");
                goto out_unreg;
        }

        register_syscore_ops(&kvm_syscore_ops);

        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;

        kvm_init_debug();

        r = kvm_vfio_ops_init();
        WARN_ON(r);

        return 0;

out_unreg:
        kvm_async_pf_deinit();
out_free_5:
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
out_free_4:
        kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
        kvm_arch_hardware_unsetup();
out_free_1:
        free_cpumask_var(cpus_hardware_enabled);
out_free_0:
        kvm_irqfd_exit();
out_irqfd:
        kvm_arch_exit();
out_fail:
        return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
        int cpu;

        debugfs_remove_recursive(kvm_debugfs_dir);
        misc_deregister(&kvm_dev);
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
        kmem_cache_destroy(kvm_vcpu_cache);
        kvm_async_pf_deinit();
        unregister_syscore_ops(&kvm_syscore_ops);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
        on_each_cpu(hardware_disable_nolock, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        kvm_irqfd_exit();
        free_cpumask_var(cpus_hardware_enabled);
        kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

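/*
 * Per-invocation context handed to kvm_vm_worker_thread().  It lives on the
 * stack of the thread calling kvm_vm_create_worker_thread() and is only valid
 * until init_done is completed, so the worker copies out what it needs first.
 */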
struct kvm_vm_worker_thread_context {
        struct kvm *kvm;
        struct task_struct *parent;
        struct completion init_done;
        kvm_vm_thread_fn_t thread_fn;
        uintptr_t data;
        int err;
};

static int kvm_vm_worker_thread(void *context)
{
        /*
         * The init_context is allocated on the stack of the parent thread, so
         * anything needed after initialization must be copied out locally
         * before init_done is signalled.
         */
        struct kvm_vm_worker_thread_context *init_context = context;
        struct kvm *kvm = init_context->kvm;
        kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
        uintptr_t data = init_context->data;
        int err;

        err = kthread_park(current);
        /* kthread_park(current) is never supposed to return an error. */
        WARN_ON(err != 0);
        if (err)
                goto init_complete;

        err = cgroup_attach_task_all(init_context->parent, current);
        if (err) {
                kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
                        __func__, err);
                goto init_complete;
        }

        set_user_nice(current, task_nice(init_context->parent));

init_complete:
        init_context->err = err;
        complete(&init_context->init_done);
        init_context = NULL;

        if (err)
                return err;

        /* Wait to be woken up by the spawner before proceeding. */
        kthread_parkme();

        if (!kthread_should_stop())
                err = thread_fn(kvm, data);

        return err;
}

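/*
 * Spawn a VM-scoped worker kthread running @thread_fn.  The new task is moved
 * into the caller's cgroups and inherits its nice level; on success it is
 * left parked and a pointer to it is returned through @thread_ptr.
 */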
int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
                                uintptr_t data, const char *name,
                                struct task_struct **thread_ptr)
{
        struct kvm_vm_worker_thread_context init_context = {};
        struct task_struct *thread;

        *thread_ptr = NULL;
        init_context.kvm = kvm;
        init_context.parent = current;
        init_context.thread_fn = thread_fn;
        init_context.data = data;
        init_completion(&init_context.init_done);

        thread = kthread_run(kvm_vm_worker_thread, &init_context,
                             "%s-%d", name, task_pid_nr(current));
        if (IS_ERR(thread))
                return PTR_ERR(thread);

        /* kthread_run() is never supposed to return NULL. */
        WARN_ON(thread == NULL);

        wait_for_completion(&init_context.init_done);

        if (!init_context.err)
                *thread_ptr = thread;

        return init_context.err;
}
