/*
 * Kernel-based Virtual Machine (KVM) driver for Linux.
 *
 * Architecture-neutral core code: VM and vCPU lifecycle, memslot management,
 * MMU-notifier integration, dirty logging, and gfn/hva/pfn translation.
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "mmu_lock.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Halt polling: a halting vCPU busy-waits for up to halt_poll_ns before
 * scheduling out, and the per-vCPU window is grown/shrunk by the factors
 * below depending on whether polling succeeded.
 */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from. */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default is 0, which resets per-vcpu halt_poll_ns instead of dividing it. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

102DEFINE_MUTEX(kvm_lock);
103static DEFINE_RAW_SPINLOCK(kvm_count_lock);
104LIST_HEAD(vm_list);
105
106static cpumask_var_t cpus_hardware_enabled;
107static int kvm_usage_count;
108static atomic_t hardware_enable_failed;
109
110static struct kmem_cache *kvm_vcpu_cache;
111
112static __read_mostly struct preempt_ops kvm_preempt_ops;
113static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
114
115struct dentry *kvm_debugfs_dir;
116EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
117
118static const struct file_operations stat_fops_per_vm;
119
120static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
121 unsigned long arg);
122#ifdef CONFIG_KVM_COMPAT
123static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
124 unsigned long arg);
125#define KVM_COMPAT(c) .compat_ioctl = (c)
126#else
/*
 * For architectures without a compat ioctl infrastructure, mount a double
 * line of defense: refuse to open /dev/kvm from a compat (32-bit) task, and
 * fail every ioctl with -EINVAL should a KVM fd still end up in one.
 */
134static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
135 unsigned long arg) { return -EINVAL; }
136
137static int kvm_no_compat_open(struct inode *inode, struct file *file)
138{
139 return is_compat_task() ? -ENODEV : 0;
140}
141#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
142 .open = kvm_no_compat_open
143#endif
144static int hardware_enable_all(void);
145static void hardware_disable_all(void);
146
147static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
148
149__visible bool kvm_rebooting;
150EXPORT_SYMBOL_GPL(kvm_rebooting);
151
152#define KVM_EVENT_CREATE_VM 0
153#define KVM_EVENT_DESTROY_VM 1
154static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
155static unsigned long long kvm_createvm_count;
156static unsigned long long kvm_active_vms;
157
158__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
159 unsigned long start, unsigned long end)
160{
161}
162
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
		return false;

	return is_zone_device_page(pfn_to_page(pfn));
}
176
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * A pfn with a valid struct page is considered reserved if the page
	 * is marked PG_reserved, with the zero page and ZONE_DEVICE pages
	 * explicitly carved out; pfns without a struct page are always
	 * treated as reserved.
	 */
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn)) &&
		       !is_zero_pfn(pfn) &&
		       !kvm_is_zone_device_pfn(pfn);

	return true;
}
191
/*
 * Switches to the specified vcpu, until a matching vcpu_put().  Runs with
 * preemption disabled and records the vcpu as the one currently loaded on
 * this CPU.
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);
205
206void vcpu_put(struct kvm_vcpu *vcpu)
207{
208 preempt_disable();
209 kvm_arch_vcpu_put(vcpu);
210 preempt_notifier_unregister(&vcpu->preempt_notifier);
211 __this_cpu_write(kvm_running_vcpu, NULL);
212 preempt_enable();
213}
214EXPORT_SYMBOL_GPL(vcpu_put);
215
216
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * Requests flagged KVM_REQUEST_WAIT must be acknowledged by the
	 * target vCPU, so an IPI is needed unless it is fully outside guest
	 * mode; all other requests only need to interrupt a vCPU that is
	 * currently running guest code.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	return mode == IN_GUEST_MODE;
}
233
234static void ack_flush(void *_completed)
235{
236}
237
238static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
239{
240 const struct cpumask *cpus;
241
242 if (likely(cpumask_available(tmp)))
243 cpus = tmp;
244 else
245 cpus = cpu_online_mask;
246
247 if (cpumask_empty(cpus))
248 return false;
249
250 smp_call_function_many(cpus, ack_flush, NULL, wait);
251 return true;
252}
253
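/*
 * Make @req pending on every vCPU selected by @vcpu_bitmap (all vCPUs when
 * the bitmap is NULL), skipping @except.  Targets are woken (unless the
 * request carries KVM_REQUEST_NO_WAKEUP) and, if still in guest mode, kicked
 * with an IPI gathered in @tmp; returns whether any remote CPUs were kicked.
 */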
254bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
255 struct kvm_vcpu *except,
256 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
257{
258 int i, cpu, me;
259 struct kvm_vcpu *vcpu;
260 bool called;
261
262 me = get_cpu();
263
264 kvm_for_each_vcpu(i, vcpu, kvm) {
265 if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
266 vcpu == except)
267 continue;
268
269 kvm_make_request(req, vcpu);
270
271 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
272 continue;

		/*
		 * tmp can be "unavailable" if cpumasks are allocated off the
		 * stack, as allocation of the mask is deliberately not fatal
		 * and is handled by falling back to kicking all online CPUs.
		 */
		if (!cpumask_available(tmp))
			continue;

		/*
		 * Note, the vCPU could get migrated to a different pCPU at
		 * any point after kvm_request_needs_ipi(), which could result
		 * in sending an IPI to the previous pCPU.  That's OK because
		 * the purpose of the IPI is to ensure the vCPU returns to
		 * OUTSIDE_GUEST_MODE, which is satisfied if the vCPU migrates.
		 */
293 if (kvm_request_needs_ipi(vcpu, req)) {
294 cpu = READ_ONCE(vcpu->cpu);
295 if (cpu != -1 && cpu != me)
296 __cpumask_set_cpu(cpu, tmp);
297 }
298 }
299
300 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
301 put_cpu();
302
303 return called;
304}
305
306bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
307 struct kvm_vcpu *except)
308{
309 cpumask_var_t cpus;
310 bool called;
311
312 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
313
314 called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
315
316 free_cpumask_var(cpus);
317 return called;
318}
319
320bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
321{
322 return kvm_make_all_cpus_request_except(kvm, req, NULL);
323}
324EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
325
326#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
327void kvm_flush_remote_tlbs(struct kvm *kvm)
328{
329 ++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * If the architecture implements a full remote TLB flush and it
	 * succeeds (returns 0), sending KVM_REQ_TLB_FLUSH to every vCPU is
	 * unnecessary.  The flush is counted when either path reports that a
	 * flush was performed or requested.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
345}
346EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
347#endif
348
349void kvm_reload_remote_mmus(struct kvm *kvm)
350{
351 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
352}
353
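/*
 * Arch-independent MMU memory caches: pre-fill objects (from a kmem_cache or
 * whole pages) in sleepable context so they can later be handed out without
 * sleeping, e.g. while holding the MMU lock.
 */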
354#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
355static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
356 gfp_t gfp_flags)
357{
358 gfp_flags |= mc->gfp_zero;
359
360 if (mc->kmem_cache)
361 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
362 else
363 return (void *)__get_free_page(gfp_flags);
364}
365
366int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
367{
368 void *obj;
369
370 if (mc->nobjs >= min)
371 return 0;
372 while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
373 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
374 if (!obj)
375 return mc->nobjs >= min ? 0 : -ENOMEM;
376 mc->objects[mc->nobjs++] = obj;
377 }
378 return 0;
379}
380
381int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
382{
383 return mc->nobjs;
384}
385
386void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
387{
388 while (mc->nobjs) {
389 if (mc->kmem_cache)
390 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
391 else
392 free_page((unsigned long)mc->objects[--mc->nobjs]);
393 }
394}
395
396void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
397{
398 void *p;
399
400 if (WARN_ON(!mc->nobjs))
401 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
402 else
403 p = mc->objects[--mc->nobjs];
404 BUG_ON(!p);
405 return p;
406}
407#endif
408
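/* Arch-independent initialization of a freshly allocated vCPU structure. */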
409static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
410{
411 mutex_init(&vcpu->mutex);
412 vcpu->cpu = -1;
413 vcpu->kvm = kvm;
414 vcpu->vcpu_id = id;
415 vcpu->pid = NULL;
416 rcuwait_init(&vcpu->wait);
417 kvm_async_pf_vcpu_init(vcpu);
418
419 vcpu->pre_pcpu = -1;
420 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
421
422 kvm_vcpu_set_in_spin_loop(vcpu, false);
423 kvm_vcpu_set_dy_eligible(vcpu, false);
424 vcpu->preempted = false;
425 vcpu->ready = false;
426 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
427 vcpu->last_used_slot = 0;
428}
429
430void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
431{
432 kvm_dirty_ring_free(&vcpu->dirty_ring);
433 kvm_arch_vcpu_destroy(vcpu);
434
	/*
	 * No need for rcu_read_lock(): vcpu->pid cannot change concurrently
	 * because nothing else can reference the vCPU while it is being
	 * destroyed.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));
441
442 free_page((unsigned long)vcpu->run);
443 kmem_cache_free(kvm_vcpu_cache, vcpu);
444}
445EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
446
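/*
 * MMU notifier integration: reflect changes to the host's userspace page
 * tables (invalidation, aging, PTE updates) into the guest mappings.  Only
 * compiled in when the architecture defines KVM_ARCH_WANT_MMU_NOTIFIER and
 * CONFIG_MMU_NOTIFIER is enabled.
 */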
447#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
448static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
449{
450 return container_of(mn, struct kvm, mmu_notifier);
451}
452
453static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
454 struct mm_struct *mm,
455 unsigned long start, unsigned long end)
456{
457 struct kvm *kvm = mmu_notifier_to_kvm(mn);
458 int idx;
459
460 idx = srcu_read_lock(&kvm->srcu);
461 kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
462 srcu_read_unlock(&kvm->srcu, idx);
463}
464
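/*
 * A kvm_hva_range describes one notifier event: the affected host VA range,
 * an optional new PTE value, the per-memslot handler to invoke, an on_lock
 * callback run once the MMU lock is taken, whether a nonzero handler result
 * should trigger a remote TLB flush, and whether blocking is allowed.
 */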
465typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
466
467typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
468 unsigned long end);
469
470struct kvm_hva_range {
471 unsigned long start;
472 unsigned long end;
473 pte_t pte;
474 hva_handler_t handler;
475 on_lock_fn_t on_lock;
476 bool flush_on_ret;
477 bool may_block;
478};
479
/*
 * Dedicated stub rather than NULL so that __kvm_handle_hva_range() can tell
 * "no handler"/"no on_lock callback" apart via IS_KVM_NULL_FN() and skip the
 * corresponding step.
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
492
493static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
494 const struct kvm_hva_range *range)
495{
496 bool ret = false, locked = false;
497 struct kvm_gfn_range gfn_range;
498 struct kvm_memory_slot *slot;
499 struct kvm_memslots *slots;
500 int i, idx;
501
502
503 if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
504 IS_KVM_NULL_FN(range->handler)))
505 return 0;
506
507 idx = srcu_read_lock(&kvm->srcu);
508
509 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
510 slots = __kvm_memslots(kvm, i);
511 kvm_for_each_memslot(slot, slots) {
512 unsigned long hva_start, hva_end;
513
514 hva_start = max(range->start, slot->userspace_addr);
515 hva_end = min(range->end, slot->userspace_addr +
516 (slot->npages << PAGE_SHIFT));
517 if (hva_start >= hva_end)
518 continue;

			/*
			 * Unconditionally set pte and may_block: the common
			 * case is a range covered by zero or one memslots, so
			 * avoiding redundant writes isn't worth a branch.
			 */
			gfn_range.pte = range->pte;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;
536
537 if (!locked) {
538 locked = true;
539 KVM_MMU_LOCK(kvm);
540 if (!IS_KVM_NULL_FN(range->on_lock))
541 range->on_lock(kvm, range->start, range->end);
542 if (IS_KVM_NULL_FN(range->handler))
543 break;
544 }
545 ret |= range->handler(kvm, &gfn_range);
546 }
547 }
548
549 if (range->flush_on_ret && ret)
550 kvm_flush_remote_tlbs(kvm);
551
552 if (locked)
553 KVM_MMU_UNLOCK(kvm);
554
555 srcu_read_unlock(&kvm->srcu, idx);
556
557
558 return (int)ret;
559}
560
561static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
562 unsigned long start,
563 unsigned long end,
564 pte_t pte,
565 hva_handler_t handler)
566{
567 struct kvm *kvm = mmu_notifier_to_kvm(mn);
568 const struct kvm_hva_range range = {
569 .start = start,
570 .end = end,
571 .pte = pte,
572 .handler = handler,
573 .on_lock = (void *)kvm_null_fn,
574 .flush_on_ret = true,
575 .may_block = false,
576 };
577
578 return __kvm_handle_hva_range(kvm, &range);
579}
580
581static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
582 unsigned long start,
583 unsigned long end,
584 hva_handler_t handler)
585{
586 struct kvm *kvm = mmu_notifier_to_kvm(mn);
587 const struct kvm_hva_range range = {
588 .start = start,
589 .end = end,
590 .pte = __pte(0),
591 .handler = handler,
592 .on_lock = (void *)kvm_null_fn,
593 .flush_on_ret = false,
594 .may_block = false,
595 };
596
597 return __kvm_handle_hva_range(kvm, &range);
598}
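
/*
 * Called when a host PTE covered by a memslot is changed in place (e.g. by
 * KSM or when a COW page is replaced); propagates the new PTE into the guest
 * mapping rather than simply zapping it.
 */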
599static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
600 struct mm_struct *mm,
601 unsigned long address,
602 pte_t pte)
603{
604 struct kvm *kvm = mmu_notifier_to_kvm(mn);
605
606 trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
	 * If mmu_notifier_count is zero, then no in-progress invalidations,
	 * including this one, found a relevant memslot at start(); rechecking
	 * memslots here is unnecessary.  Note, a false positive (count elevated
	 * by a different invalidation) is functionally benign.
	 */
	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
	if (!READ_ONCE(kvm->mmu_notifier_count))
		return;

	kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
620}
621
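/*
 * Start of an invalidation window: bump mmu_notifier_count and track the
 * union of all in-progress invalidation ranges so page faults touching the
 * affected region can detect the race and retry.  Paired with
 * kvm_dec_notifier_count().
 */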
622void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
623 unsigned long end)
624{
625
626
627
628
629
630 kvm->mmu_notifier_count++;
631 if (likely(kvm->mmu_notifier_count == 1)) {
632 kvm->mmu_notifier_range_start = start;
633 kvm->mmu_notifier_range_end = end;
634 } else {
635
636
637
638
639
640
641
642
643
644 kvm->mmu_notifier_range_start =
645 min(kvm->mmu_notifier_range_start, start);
646 kvm->mmu_notifier_range_end =
647 max(kvm->mmu_notifier_range_end, end);
648 }
649}
650
651static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
652 const struct mmu_notifier_range *range)
653{
654 struct kvm *kvm = mmu_notifier_to_kvm(mn);
655 const struct kvm_hva_range hva_range = {
656 .start = range->start,
657 .end = range->end,
658 .pte = __pte(0),
659 .handler = kvm_unmap_gfn_range,
660 .on_lock = kvm_inc_notifier_count,
661 .flush_on_ret = true,
662 .may_block = mmu_notifier_range_blockable(range),
663 };
664
665 trace_kvm_unmap_hva_range(range->start, range->end);
666
667
668
669
670
671
672
673
674
675 spin_lock(&kvm->mn_invalidate_lock);
676 kvm->mn_active_invalidate_count++;
677 spin_unlock(&kvm->mn_invalidate_lock);
678
679 __kvm_handle_hva_range(kvm, &hva_range);
680
681 return 0;
682}
683
684void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
685 unsigned long end)
686{
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
700}
701
702static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
703 const struct mmu_notifier_range *range)
704{
705 struct kvm *kvm = mmu_notifier_to_kvm(mn);
706 const struct kvm_hva_range hva_range = {
707 .start = range->start,
708 .end = range->end,
709 .pte = __pte(0),
710 .handler = (void *)kvm_null_fn,
711 .on_lock = kvm_dec_notifier_count,
712 .flush_on_ret = false,
713 .may_block = mmu_notifier_range_blockable(range),
714 };
715 bool wake;
716
717 __kvm_handle_hva_range(kvm, &hva_range);
718
719
720 spin_lock(&kvm->mn_invalidate_lock);
721 wake = (--kvm->mn_active_invalidate_count == 0);
722 spin_unlock(&kvm->mn_invalidate_lock);
723
724
725
726
727
728 if (wake)
729 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
730
731 BUG_ON(kvm->mmu_notifier_count < 0);
732}
733
734static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
735 struct mm_struct *mm,
736 unsigned long start,
737 unsigned long end)
738{
739 trace_kvm_age_hva(start, end);
740
741 return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
742}
743
744static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
745 struct mm_struct *mm,
746 unsigned long start,
747 unsigned long end)
748{
749 trace_kvm_age_hva(start, end);

	/*
	 * clear_young is a lightweight version of clear_flush_young.  Without
	 * flushing the TLB a stale "young" bit may linger for a while, but
	 * that is acceptable here: the result is only a reclaim hint, and a
	 * spuriously-young page is cheaper than the IPIs a flush would cost.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
765}
766
767static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
768 struct mm_struct *mm,
769 unsigned long address)
770{
771 trace_kvm_test_age_hva(address);
772
773 return kvm_handle_hva_range_no_flush(mn, address, address + 1,
774 kvm_test_age_gfn);
775}
776
777static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
778 struct mm_struct *mm)
779{
780 struct kvm *kvm = mmu_notifier_to_kvm(mn);
781 int idx;
782
783 idx = srcu_read_lock(&kvm->srcu);
784 kvm_arch_flush_shadow_all(kvm);
785 srcu_read_unlock(&kvm->srcu, idx);
786}
787
788static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
789 .invalidate_range = kvm_mmu_notifier_invalidate_range,
790 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
791 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
792 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
793 .clear_young = kvm_mmu_notifier_clear_young,
794 .test_young = kvm_mmu_notifier_test_young,
795 .change_pte = kvm_mmu_notifier_change_pte,
796 .release = kvm_mmu_notifier_release,
797};
798
799static int kvm_init_mmu_notifier(struct kvm *kvm)
800{
801 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
802 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
803}
804
805#else
806
807static int kvm_init_mmu_notifier(struct kvm *kvm)
808{
809 return 0;
810}
811
812#endif
813
814#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
815static int kvm_pm_notifier_call(struct notifier_block *bl,
816 unsigned long state,
817 void *unused)
818{
819 struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
820
821 return kvm_arch_pm_notifier(kvm, state);
822}
823
824static void kvm_init_pm_notifier(struct kvm *kvm)
825{
826 kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
827
828 kvm->pm_notifier.priority = INT_MAX;
829 register_pm_notifier(&kvm->pm_notifier);
830}
831
832static void kvm_destroy_pm_notifier(struct kvm *kvm)
833{
834 unregister_pm_notifier(&kvm->pm_notifier);
835}
836#else
837static void kvm_init_pm_notifier(struct kvm *kvm)
838{
839}
840
841static void kvm_destroy_pm_notifier(struct kvm *kvm)
842{
843}
844#endif
845
846static struct kvm_memslots *kvm_alloc_memslots(void)
847{
848 int i;
849 struct kvm_memslots *slots;
850
851 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
852 if (!slots)
853 return NULL;
854
855 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
856 slots->id_to_index[i] = -1;
857
858 return slots;
859}
860
861static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
862{
863 if (!memslot->dirty_bitmap)
864 return;
865
866 kvfree(memslot->dirty_bitmap);
867 memslot->dirty_bitmap = NULL;
868}
869
870static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
871{
872 kvm_destroy_dirty_bitmap(slot);
873
874 kvm_arch_free_memslot(kvm, slot);
875
876 slot->flags = 0;
877 slot->npages = 0;
878}
879
880static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
881{
882 struct kvm_memory_slot *memslot;
883
884 if (!slots)
885 return;
886
887 kvm_for_each_memslot(memslot, slots)
888 kvm_free_memslot(kvm, memslot);
889
890 kvfree(slots);
891}
892
893static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
894{
895 switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
896 case KVM_STATS_TYPE_INSTANT:
897 return 0444;
898 case KVM_STATS_TYPE_CUMULATIVE:
899 case KVM_STATS_TYPE_PEAK:
900 default:
901 return 0644;
902 }
903}
904
905
906static void kvm_destroy_vm_debugfs(struct kvm *kvm)
907{
908 int i;
909 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
910 kvm_vcpu_stats_header.num_desc;
911
912 if (!kvm->debugfs_dentry)
913 return;
914
915 debugfs_remove_recursive(kvm->debugfs_dentry);
916
917 if (kvm->debugfs_stat_data) {
918 for (i = 0; i < kvm_debugfs_num_entries; i++)
919 kfree(kvm->debugfs_stat_data[i]);
920 kfree(kvm->debugfs_stat_data);
921 }
922}
923
924static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
925{
926 static DEFINE_MUTEX(kvm_debugfs_lock);
927 struct dentry *dent;
928 char dir_name[ITOA_MAX_LEN * 2];
929 struct kvm_stat_data *stat_data;
930 const struct _kvm_stats_desc *pdesc;
931 int i, ret;
932 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
933 kvm_vcpu_stats_header.num_desc;
934
935 if (!debugfs_initialized())
936 return 0;
937
938 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
939 mutex_lock(&kvm_debugfs_lock);
940 dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
941 if (dent) {
942 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
943 dput(dent);
944 mutex_unlock(&kvm_debugfs_lock);
945 return 0;
946 }
947 dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
948 mutex_unlock(&kvm_debugfs_lock);
949 if (IS_ERR(dent))
950 return 0;
951
952 kvm->debugfs_dentry = dent;
953 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
954 sizeof(*kvm->debugfs_stat_data),
955 GFP_KERNEL_ACCOUNT);
956 if (!kvm->debugfs_stat_data)
957 return -ENOMEM;
958
959 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
960 pdesc = &kvm_vm_stats_desc[i];
961 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
962 if (!stat_data)
963 return -ENOMEM;
964
965 stat_data->kvm = kvm;
966 stat_data->desc = pdesc;
967 stat_data->kind = KVM_STAT_VM;
968 kvm->debugfs_stat_data[i] = stat_data;
969 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
970 kvm->debugfs_dentry, stat_data,
971 &stat_fops_per_vm);
972 }
973
974 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
975 pdesc = &kvm_vcpu_stats_desc[i];
976 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
977 if (!stat_data)
978 return -ENOMEM;
979
980 stat_data->kvm = kvm;
981 stat_data->desc = pdesc;
982 stat_data->kind = KVM_STAT_VCPU;
983 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
984 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
985 kvm->debugfs_dentry, stat_data,
986 &stat_fops_per_vm);
987 }
988
989 ret = kvm_arch_create_vm_debugfs(kvm);
990 if (ret) {
991 kvm_destroy_vm_debugfs(kvm);
		/* Return the error code, not the loop index. */
		return ret;
993 }
994
995 return 0;
996}
997
998
999
1000
1001
1002int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1003{
1004 return 0;
1005}
1006
1007
1008
1009
1010
1011void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1012{
1013}
1014
1015
1016
1017
1018
1019
1020
1021int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1022{
1023 return 0;
1024}
1025
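/*
 * Allocate and initialize a new VM: locks, per-address-space memslots, I/O
 * buses, arch state and the MMU/PM notifiers.  On success the VM is placed on
 * vm_list and returned with a single user reference held.
 */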
1026static struct kvm *kvm_create_vm(unsigned long type)
1027{
1028 struct kvm *kvm = kvm_arch_alloc_vm();
1029 int r = -ENOMEM;
1030 int i;
1031
1032 if (!kvm)
1033 return ERR_PTR(-ENOMEM);
1034
1035 KVM_MMU_LOCK_INIT(kvm);
1036 mmgrab(current->mm);
1037 kvm->mm = current->mm;
1038 kvm_eventfd_init(kvm);
1039 mutex_init(&kvm->lock);
1040 mutex_init(&kvm->irq_lock);
1041 mutex_init(&kvm->slots_lock);
1042 mutex_init(&kvm->slots_arch_lock);
1043 spin_lock_init(&kvm->mn_invalidate_lock);
1044 rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1045
1046 INIT_LIST_HEAD(&kvm->devices);
1047
1048 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1049
1050 if (init_srcu_struct(&kvm->srcu))
1051 goto out_err_no_srcu;
1052 if (init_srcu_struct(&kvm->irq_srcu))
1053 goto out_err_no_irq_srcu;
1054
1055 refcount_set(&kvm->users_count, 1);
1056 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1057 struct kvm_memslots *slots = kvm_alloc_memslots();
1058
1059 if (!slots)
1060 goto out_err_no_arch_destroy_vm;
1061
1062 slots->generation = i;
1063 rcu_assign_pointer(kvm->memslots[i], slots);
1064 }
1065
1066 for (i = 0; i < KVM_NR_BUSES; i++) {
1067 rcu_assign_pointer(kvm->buses[i],
1068 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1069 if (!kvm->buses[i])
1070 goto out_err_no_arch_destroy_vm;
1071 }
1072
1073 kvm->max_halt_poll_ns = halt_poll_ns;
1074
1075 r = kvm_arch_init_vm(kvm, type);
1076 if (r)
1077 goto out_err_no_arch_destroy_vm;
1078
1079 r = hardware_enable_all();
1080 if (r)
1081 goto out_err_no_disable;
1082
1083#ifdef CONFIG_HAVE_KVM_IRQFD
1084 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1085#endif
1086
1087 r = kvm_init_mmu_notifier(kvm);
1088 if (r)
1089 goto out_err_no_mmu_notifier;
1090
1091 r = kvm_arch_post_init_vm(kvm);
1092 if (r)
1093 goto out_err;
1094
1095 mutex_lock(&kvm_lock);
1096 list_add(&kvm->vm_list, &vm_list);
1097 mutex_unlock(&kvm_lock);
1098
1099 preempt_notifier_inc();
1100 kvm_init_pm_notifier(kvm);
1101
1102 return kvm;
1103
1104out_err:
1105#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1106 if (kvm->mmu_notifier.ops)
1107 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1108#endif
1109out_err_no_mmu_notifier:
1110 hardware_disable_all();
1111out_err_no_disable:
1112 kvm_arch_destroy_vm(kvm);
1113out_err_no_arch_destroy_vm:
1114 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1115 for (i = 0; i < KVM_NR_BUSES; i++)
1116 kfree(kvm_get_bus(kvm, i));
1117 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1118 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1119 cleanup_srcu_struct(&kvm->irq_srcu);
1120out_err_no_irq_srcu:
1121 cleanup_srcu_struct(&kvm->srcu);
1122out_err_no_srcu:
1123 kvm_arch_free_vm(kvm);
1124 mmdrop(current->mm);
1125 return ERR_PTR(r);
1126}
1127
1128static void kvm_destroy_devices(struct kvm *kvm)
1129{
1130 struct kvm_device *dev, *tmp;
1131
1132
1133
1134
1135
1136
1137 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1138 list_del(&dev->vm_node);
1139 dev->ops->destroy(dev);
1140 }
1141}
1142
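/*
 * Tear down a VM: called from kvm_put_kvm() once the last reference is
 * dropped, undoing the setup performed by kvm_create_vm().
 */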
1143static void kvm_destroy_vm(struct kvm *kvm)
1144{
1145 int i;
1146 struct mm_struct *mm = kvm->mm;
1147
1148 kvm_destroy_pm_notifier(kvm);
1149 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1150 kvm_destroy_vm_debugfs(kvm);
1151 kvm_arch_sync_events(kvm);
1152 mutex_lock(&kvm_lock);
1153 list_del(&kvm->vm_list);
1154 mutex_unlock(&kvm_lock);
1155 kvm_arch_pre_destroy_vm(kvm);
1156
1157 kvm_free_irq_routing(kvm);
1158 for (i = 0; i < KVM_NR_BUSES; i++) {
1159 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1160
1161 if (bus)
1162 kvm_io_bus_destroy(bus);
1163 kvm->buses[i] = NULL;
1164 }
1165 kvm_coalesced_mmio_free(kvm);
1166#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1167 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1168
1169
1170
1171
1172
1173
1174
1175
1176 WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1177 kvm->mn_active_invalidate_count = 0;
1178#else
1179 kvm_arch_flush_shadow_all(kvm);
1180#endif
1181 kvm_arch_destroy_vm(kvm);
1182 kvm_destroy_devices(kvm);
1183 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1184 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1185 cleanup_srcu_struct(&kvm->irq_srcu);
1186 cleanup_srcu_struct(&kvm->srcu);
1187 kvm_arch_free_vm(kvm);
1188 preempt_notifier_dec();
1189 hardware_disable_all();
1190 mmdrop(mm);
1191}
1192
1193void kvm_get_kvm(struct kvm *kvm)
1194{
1195 refcount_inc(&kvm->users_count);
1196}
1197EXPORT_SYMBOL_GPL(kvm_get_kvm);
1198
1199
1200
1201
1202
1203bool kvm_get_kvm_safe(struct kvm *kvm)
1204{
1205 return refcount_inc_not_zero(&kvm->users_count);
1206}
1207EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1208
1209void kvm_put_kvm(struct kvm *kvm)
1210{
1211 if (refcount_dec_and_test(&kvm->users_count))
1212 kvm_destroy_vm(kvm);
1213}
1214EXPORT_SYMBOL_GPL(kvm_put_kvm);
1215
1216
1217
1218
1219
1220
1221
1222
1223void kvm_put_kvm_no_destroy(struct kvm *kvm)
1224{
1225 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1226}
1227EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1228
1229static int kvm_vm_release(struct inode *inode, struct file *filp)
1230{
1231 struct kvm *kvm = filp->private_data;
1232
1233 kvm_irqfd_release(kvm);
1234
1235 kvm_put_kvm(kvm);
1236 return 0;
1237}
1238
1239
1240
1241
1242
1243static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1244{
1245 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1246
1247 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1248 if (!memslot->dirty_bitmap)
1249 return -ENOMEM;
1250
1251 return 0;
1252}
1253
/*
 * Delete a memslot by decrementing the number of used slots and shifting all
 * other entries in the array forward one spot.
 */
1258static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1259 struct kvm_memory_slot *memslot)
1260{
1261 struct kvm_memory_slot *mslots = slots->memslots;
1262 int i;
1263
1264 if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1265 return;
1266
1267 slots->used_slots--;
1268
1269 if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
1270 atomic_set(&slots->last_used_slot, 0);
1271
1272 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1273 mslots[i] = mslots[i + 1];
1274 slots->id_to_index[mslots[i].id] = i;
1275 }
1276 mslots[i] = *memslot;
1277 slots->id_to_index[memslot->id] = -1;
1278}
1279
/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslots array.
 */
1284static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1285{
1286 return slots->used_slots++;
1287}
1288
/*
 * Move a changed memslot backwards in the array by shifting existing slots
 * with a higher GFN toward the front of the array.  Note, the changed memslot
 * itself is not preserved in the array, i.e. not swapped at this time, only
 * its new index into the array is tracked.  Returns the changed memslot's
 * current index into the memslots array.
 */
1296static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1297 struct kvm_memory_slot *memslot)
1298{
1299 struct kvm_memory_slot *mslots = slots->memslots;
1300 int i;
1301
1302 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1303 WARN_ON_ONCE(!slots->used_slots))
1304 return -1;
1305
1306
1307
1308
1309
1310
1311 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1312 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1313 break;
1314
1315 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1316
1317
1318 mslots[i] = mslots[i + 1];
1319 slots->id_to_index[mslots[i].id] = i;
1320 }
1321 return i;
1322}
1323
/*
 * Move a changed memslot forwards in the array by shifting existing slots
 * with a lower GFN toward the back of the array.  Note, the changed memslot
 * itself is not preserved in the array, i.e. not swapped at this time, only
 * its new index into the array is tracked.  Returns the changed memslot's
 * final index into the memslots array.
 */
1331static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1332 struct kvm_memory_slot *memslot,
1333 int start)
1334{
1335 struct kvm_memory_slot *mslots = slots->memslots;
1336 int i;
1337
1338 for (i = start; i > 0; i--) {
1339 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1340 break;
1341
1342 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1343
1344
1345 mslots[i] = mslots[i - 1];
1346 slots->id_to_index[mslots[i].id] = i;
1347 }
1348 return i;
1349}
1350
/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows memslot lookups to use a
 * binary search.
 *
 * IMPORTANT: slots are sorted from highest GFN to lowest GFN, i.e. the entry
 * at memslots[0] always has the highest base_gfn.
 *
 * The sort is done in place, taking advantage of the fact that the array is
 * already sorted and that only one slot changes: a deleted slot is shifted to
 * the back of the array, a created slot starts at the back and bubbles
 * forward, and a moved slot is shifted backward and/or forward to its new
 * GFN-ordered position.
 */
1392static void update_memslots(struct kvm_memslots *slots,
1393 struct kvm_memory_slot *memslot,
1394 enum kvm_mr_change change)
1395{
1396 int i;
1397
1398 if (change == KVM_MR_DELETE) {
1399 kvm_memslot_delete(slots, memslot);
1400 } else {
1401 if (change == KVM_MR_CREATE)
1402 i = kvm_memslot_insert_back(slots);
1403 else
1404 i = kvm_memslot_move_backward(slots, memslot);
1405 i = kvm_memslot_move_forward(slots, memslot, i);
1406
1407
1408
1409
1410
1411 slots->memslots[i] = *memslot;
1412 slots->id_to_index[memslot->id] = i;
1413 }
1414}
1415
1416static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1417{
1418 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1419
1420#ifdef __KVM_HAVE_READONLY_MEM
1421 valid_flags |= KVM_MEM_READONLY;
1422#endif
1423
1424 if (mem->flags & ~valid_flags)
1425 return -EINVAL;
1426
1427 return 0;
1428}
1429
1430static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1431 int as_id, struct kvm_memslots *slots)
1432{
1433 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1434 u64 gen = old_memslots->generation;
1435
1436 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1437 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1438
1439
1440
1441
1442
1443
1444 spin_lock(&kvm->mn_invalidate_lock);
1445 prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1446 while (kvm->mn_active_invalidate_count) {
1447 set_current_state(TASK_UNINTERRUPTIBLE);
1448 spin_unlock(&kvm->mn_invalidate_lock);
1449 schedule();
1450 spin_lock(&kvm->mn_invalidate_lock);
1451 }
1452 finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1453 rcu_assign_pointer(kvm->memslots[as_id], slots);
1454 spin_unlock(&kvm->mn_invalidate_lock);
1455
1456
1457
1458
1459
1460
1461 mutex_unlock(&kvm->slots_arch_lock);
1462
1463 synchronize_srcu_expedited(&kvm->srcu);
1464
1465
1466
1467
1468
1469
1470
1471 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1472
1473
1474
1475
1476
1477
1478
1479
1480 gen += KVM_ADDRESS_SPACE_NUM;
1481
1482 kvm_arch_memslots_updated(kvm, gen);
1483
1484 slots->generation = gen;
1485
1486 return old_memslots;
1487}
1488
1489static size_t kvm_memslots_size(int slots)
1490{
1491 return sizeof(struct kvm_memslots) +
1492 (sizeof(struct kvm_memory_slot) * slots);
1493}
1494
1495static void kvm_copy_memslots(struct kvm_memslots *to,
1496 struct kvm_memslots *from)
1497{
1498 memcpy(to, from, kvm_memslots_size(from->used_slots));
1499}
1500
1501
1502
1503
1504
1505
1506static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1507 enum kvm_mr_change change)
1508{
1509 struct kvm_memslots *slots;
1510 size_t new_size;
1511
1512 if (change == KVM_MR_CREATE)
1513 new_size = kvm_memslots_size(old->used_slots + 1);
1514 else
1515 new_size = kvm_memslots_size(old->used_slots);
1516
1517 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1518 if (likely(slots))
1519 kvm_copy_memslots(slots, old);
1520
1521 return slots;
1522}
1523
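/*
 * Common worker to install an updated memslot for address space @as_id.  For
 * DELETE and MOVE, the existing slot is first published as KVM_MEMSLOT_INVALID
 * and its shadow mappings are flushed before the final layout is installed.
 * Called with slots_lock held; slots_arch_lock is taken here and released by
 * install_new_memslots().
 */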
1524static int kvm_set_memslot(struct kvm *kvm,
1525 const struct kvm_userspace_memory_region *mem,
1526 struct kvm_memory_slot *old,
1527 struct kvm_memory_slot *new, int as_id,
1528 enum kvm_mr_change change)
1529{
1530 struct kvm_memory_slot *slot;
1531 struct kvm_memslots *slots;
1532 int r;
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548 mutex_lock(&kvm->slots_arch_lock);
1549
1550 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1551 if (!slots) {
1552 mutex_unlock(&kvm->slots_arch_lock);
1553 return -ENOMEM;
1554 }
1555
1556 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1557
1558
1559
1560
1561 slot = id_to_memslot(slots, old->id);
1562 slot->flags |= KVM_MEMSLOT_INVALID;
1563
1564
1565
1566
1567
1568
1569 slots = install_new_memslots(kvm, as_id, slots);
1570
1571
1572
1573
1574
1575
1576
1577
1578 kvm_arch_flush_shadow_memslot(kvm, slot);
1579
1580
1581 mutex_lock(&kvm->slots_arch_lock);
1582
1583
1584
1585
1586
1587
1588
1589 kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
1590 }
1591
1592 r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1593 if (r)
1594 goto out_slots;
1595
1596 update_memslots(slots, new, change);
1597 slots = install_new_memslots(kvm, as_id, slots);
1598
1599 kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1600
1601 kvfree(slots);
1602 return 0;
1603
1604out_slots:
1605 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1606 slot = id_to_memslot(slots, old->id);
1607 slot->flags &= ~KVM_MEMSLOT_INVALID;
1608 slots = install_new_memslots(kvm, as_id, slots);
1609 } else {
1610 mutex_unlock(&kvm->slots_arch_lock);
1611 }
1612 kvfree(slots);
1613 return r;
1614}
1615
1616static int kvm_delete_memslot(struct kvm *kvm,
1617 const struct kvm_userspace_memory_region *mem,
1618 struct kvm_memory_slot *old, int as_id)
1619{
1620 struct kvm_memory_slot new;
1621 int r;
1622
1623 if (!old->npages)
1624 return -EINVAL;
1625
1626 memset(&new, 0, sizeof(new));
1627 new.id = old->id;
1628
1629
1630
1631
1632 new.as_id = as_id;
1633
1634 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1635 if (r)
1636 return r;
1637
1638 kvm_free_memslot(kvm, old);
1639 return 0;
1640}
1641
/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
1650int __kvm_set_memory_region(struct kvm *kvm,
1651 const struct kvm_userspace_memory_region *mem)
1652{
1653 struct kvm_memory_slot old, new;
1654 struct kvm_memory_slot *tmp;
1655 enum kvm_mr_change change;
1656 int as_id, id;
1657 int r;
1658
1659 r = check_memory_region_flags(mem);
1660 if (r)
1661 return r;
1662
1663 as_id = mem->slot >> 16;
1664 id = (u16)mem->slot;
1665
1666
1667 if (mem->memory_size & (PAGE_SIZE - 1))
1668 return -EINVAL;
1669 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1670 return -EINVAL;
1671
1672 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1673 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1674 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1675 mem->memory_size))
1676 return -EINVAL;
1677 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1678 return -EINVAL;
1679 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1680 return -EINVAL;
1681
1682
1683
1684
1685
1686
1687
1688 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1689 if (tmp) {
1690 old = *tmp;
1691 tmp = NULL;
1692 } else {
1693 memset(&old, 0, sizeof(old));
1694 old.id = id;
1695 }
1696
1697 if (!mem->memory_size)
1698 return kvm_delete_memslot(kvm, mem, &old, as_id);
1699
1700 new.as_id = as_id;
1701 new.id = id;
1702 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1703 new.npages = mem->memory_size >> PAGE_SHIFT;
1704 new.flags = mem->flags;
1705 new.userspace_addr = mem->userspace_addr;
1706
1707 if (new.npages > KVM_MEM_MAX_NR_PAGES)
1708 return -EINVAL;
1709
1710 if (!old.npages) {
1711 change = KVM_MR_CREATE;
1712 new.dirty_bitmap = NULL;
1713 memset(&new.arch, 0, sizeof(new.arch));
1714 } else {
1715 if ((new.userspace_addr != old.userspace_addr) ||
1716 (new.npages != old.npages) ||
1717 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1718 return -EINVAL;
1719
1720 if (new.base_gfn != old.base_gfn)
1721 change = KVM_MR_MOVE;
1722 else if (new.flags != old.flags)
1723 change = KVM_MR_FLAGS_ONLY;
1724 else
1725 return 0;
1726
1727
1728 new.dirty_bitmap = old.dirty_bitmap;
1729 memcpy(&new.arch, &old.arch, sizeof(new.arch));
1730 }
1731
1732 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1733
1734 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1735 if (tmp->id == id)
1736 continue;
1737 if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1738 (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1739 return -EEXIST;
1740 }
1741 }
1742
1743
1744 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1745 new.dirty_bitmap = NULL;
1746 else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
1747 r = kvm_alloc_dirty_bitmap(&new);
1748 if (r)
1749 return r;
1750
1751 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1752 bitmap_set(new.dirty_bitmap, 0, new.npages);
1753 }
1754
1755 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
1756 if (r)
1757 goto out_bitmap;
1758
1759 if (old.dirty_bitmap && !new.dirty_bitmap)
1760 kvm_destroy_dirty_bitmap(&old);
1761 return 0;
1762
1763out_bitmap:
1764 if (new.dirty_bitmap && !old.dirty_bitmap)
1765 kvm_destroy_dirty_bitmap(&new);
1766 return r;
1767}
1768EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1769
1770int kvm_set_memory_region(struct kvm *kvm,
1771 const struct kvm_userspace_memory_region *mem)
1772{
1773 int r;
1774
1775 mutex_lock(&kvm->slots_lock);
1776 r = __kvm_set_memory_region(kvm, mem);
1777 mutex_unlock(&kvm->slots_lock);
1778 return r;
1779}
1780EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1781
1782static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1783 struct kvm_userspace_memory_region *mem)
1784{
1785 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1786 return -EINVAL;
1787
1788 return kvm_set_memory_region(kvm, mem);
1789}
1790
#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
1799int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1800 int *is_dirty, struct kvm_memory_slot **memslot)
1801{
1802 struct kvm_memslots *slots;
1803 int i, as_id, id;
1804 unsigned long n;
1805 unsigned long any = 0;
1806
1807
1808 if (kvm->dirty_ring_size)
1809 return -ENXIO;
1810
1811 *memslot = NULL;
1812 *is_dirty = 0;
1813
1814 as_id = log->slot >> 16;
1815 id = (u16)log->slot;
1816 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1817 return -EINVAL;
1818
1819 slots = __kvm_memslots(kvm, as_id);
1820 *memslot = id_to_memslot(slots, id);
1821 if (!(*memslot) || !(*memslot)->dirty_bitmap)
1822 return -ENOENT;
1823
1824 kvm_arch_sync_dirty_log(kvm, *memslot);
1825
1826 n = kvm_dirty_bitmap_bytes(*memslot);
1827
1828 for (i = 0; !any && i < n/sizeof(long); ++i)
1829 any = (*memslot)->dirty_bitmap[i];
1830
1831 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1832 return -EFAULT;
1833
1834 if (any)
1835 *is_dirty = 1;
1836 return 0;
1837}
1838EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1839
#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/*
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages and, unless
 * manual dirty-log protection is enabled, re-arm dirty tracking for them.
 *
 * Without manual protection the dirty bitmap is harvested atomically: each
 * word is xchg'd to zero under the MMU lock and dirty logging is re-enabled
 * for the corresponding pages (e.g. by write-protecting them) before the
 * snapshot is copied to userspace.  With manual protection the bitmap is
 * returned as-is and clearing is left to KVM_CLEAR_DIRTY_LOG.
 *
 * Pages dirtied after the snapshot is taken simply show up in the next call.
 */
1862static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1863{
1864 struct kvm_memslots *slots;
1865 struct kvm_memory_slot *memslot;
1866 int i, as_id, id;
1867 unsigned long n;
1868 unsigned long *dirty_bitmap;
1869 unsigned long *dirty_bitmap_buffer;
1870 bool flush;
1871
1872
1873 if (kvm->dirty_ring_size)
1874 return -ENXIO;
1875
1876 as_id = log->slot >> 16;
1877 id = (u16)log->slot;
1878 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1879 return -EINVAL;
1880
1881 slots = __kvm_memslots(kvm, as_id);
1882 memslot = id_to_memslot(slots, id);
1883 if (!memslot || !memslot->dirty_bitmap)
1884 return -ENOENT;
1885
1886 dirty_bitmap = memslot->dirty_bitmap;
1887
1888 kvm_arch_sync_dirty_log(kvm, memslot);
1889
1890 n = kvm_dirty_bitmap_bytes(memslot);
1891 flush = false;
1892 if (kvm->manual_dirty_log_protect) {
1893
1894
1895
1896
1897
1898
1899
1900
1901 dirty_bitmap_buffer = dirty_bitmap;
1902 } else {
1903 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1904 memset(dirty_bitmap_buffer, 0, n);
1905
1906 KVM_MMU_LOCK(kvm);
1907 for (i = 0; i < n / sizeof(long); i++) {
1908 unsigned long mask;
1909 gfn_t offset;
1910
1911 if (!dirty_bitmap[i])
1912 continue;
1913
1914 flush = true;
1915 mask = xchg(&dirty_bitmap[i], 0);
1916 dirty_bitmap_buffer[i] = mask;
1917
1918 offset = i * BITS_PER_LONG;
1919 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1920 offset, mask);
1921 }
1922 KVM_MMU_UNLOCK(kvm);
1923 }
1924
1925 if (flush)
1926 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1927
1928 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1929 return -EFAULT;
1930 return 0;
1931}
1932
/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm:	kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * Steps 1-4 below provide a general overview of dirty page logging.  See
 * kvm_get_dirty_log_protect() for the details:
 *
 *   1. Take a snapshot of the bit and clear it if needed.
 *   2. Write protect the corresponding page.
 *   3. Copy the snapshot to userspace.
 *   4. Upon return, flush TLBs if needed.
 */
1953static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1954 struct kvm_dirty_log *log)
1955{
1956 int r;
1957
1958 mutex_lock(&kvm->slots_lock);
1959
1960 r = kvm_get_dirty_log_protect(kvm, log);
1961
1962 mutex_unlock(&kvm->slots_lock);
1963 return r;
1964}
1965
/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and re-enable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
1972static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1973 struct kvm_clear_dirty_log *log)
1974{
1975 struct kvm_memslots *slots;
1976 struct kvm_memory_slot *memslot;
1977 int as_id, id;
1978 gfn_t offset;
1979 unsigned long i, n;
1980 unsigned long *dirty_bitmap;
1981 unsigned long *dirty_bitmap_buffer;
1982 bool flush;
1983
1984
1985 if (kvm->dirty_ring_size)
1986 return -ENXIO;
1987
1988 as_id = log->slot >> 16;
1989 id = (u16)log->slot;
1990 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1991 return -EINVAL;
1992
1993 if (log->first_page & 63)
1994 return -EINVAL;
1995
1996 slots = __kvm_memslots(kvm, as_id);
1997 memslot = id_to_memslot(slots, id);
1998 if (!memslot || !memslot->dirty_bitmap)
1999 return -ENOENT;
2000
2001 dirty_bitmap = memslot->dirty_bitmap;
2002
2003 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2004
2005 if (log->first_page > memslot->npages ||
2006 log->num_pages > memslot->npages - log->first_page ||
2007 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2008 return -EINVAL;
2009
2010 kvm_arch_sync_dirty_log(kvm, memslot);
2011
2012 flush = false;
2013 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2014 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2015 return -EFAULT;
2016
2017 KVM_MMU_LOCK(kvm);
2018 for (offset = log->first_page, i = offset / BITS_PER_LONG,
2019 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2020 i++, offset += BITS_PER_LONG) {
2021 unsigned long mask = *dirty_bitmap_buffer++;
2022 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2023 if (!mask)
2024 continue;
2025
2026 mask &= atomic_long_fetch_andnot(mask, p);
2027
2028
2029
2030
2031
2032
2033
2034 if (mask) {
2035 flush = true;
2036 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2037 offset, mask);
2038 }
2039 }
2040 KVM_MMU_UNLOCK(kvm);
2041
2042 if (flush)
2043 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2044
2045 return 0;
2046}
2047
2048static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2049 struct kvm_clear_dirty_log *log)
2050{
2051 int r;
2052
2053 mutex_lock(&kvm->slots_lock);
2054
2055 r = kvm_clear_dirty_log_protect(kvm, log);
2056
2057 mutex_unlock(&kvm->slots_lock);
2058 return r;
2059}
2060#endif
2061
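/* Return the memslot containing @gfn in the default address space, if any. */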
2062struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2063{
2064 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2065}
2066EXPORT_SYMBOL_GPL(gfn_to_memslot);
2067
2068struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2069{
2070 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2071 struct kvm_memory_slot *slot;
2072 int slot_index;
2073
2074 slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
2075 if (slot)
2076 return slot;
2077
2078
2079
2080
2081
2082
2083 slot = search_memslots(slots, gfn, &slot_index);
2084 if (slot) {
2085 vcpu->last_used_slot = slot_index;
2086 return slot;
2087 }
2088
2089 return NULL;
2090}
2091EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
2092
2093bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2094{
2095 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2096
2097 return kvm_is_visible_memslot(memslot);
2098}
2099EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2100
2101bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2102{
2103 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2104
2105 return kvm_is_visible_memslot(memslot);
2106}
2107EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2108
2109unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2110{
2111 struct vm_area_struct *vma;
2112 unsigned long addr, size;
2113
2114 size = PAGE_SIZE;
2115
2116 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2117 if (kvm_is_error_hva(addr))
2118 return PAGE_SIZE;
2119
2120 mmap_read_lock(current->mm);
2121 vma = find_vma(current->mm, addr);
2122 if (!vma)
2123 goto out;
2124
2125 size = vma_kernel_pagesize(vma);
2126
2127out:
2128 mmap_read_unlock(current->mm);
2129
2130 return size;
2131}
2132
2133static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2134{
2135 return slot->flags & KVM_MEM_READONLY;
2136}
2137
2138static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2139 gfn_t *nr_pages, bool write)
2140{
2141 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2142 return KVM_HVA_ERR_BAD;
2143
2144 if (memslot_is_readonly(slot) && write)
2145 return KVM_HVA_ERR_RO_BAD;
2146
2147 if (nr_pages)
2148 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2149
2150 return __gfn_to_hva_memslot(slot, gfn);
2151}
2152
2153static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2154 gfn_t *nr_pages)
2155{
2156 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2157}
2158
2159unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2160 gfn_t gfn)
2161{
2162 return gfn_to_hva_many(slot, gfn, NULL);
2163}
2164EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2165
2166unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2167{
2168 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2169}
2170EXPORT_SYMBOL_GPL(gfn_to_hva);
2171
2172unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2173{
2174 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2175}
2176EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Return the host virtual address of @gfn within @slot, and report in
 * @writable (if non-NULL) whether the memslot allows writes.  Unlike the
 * plain gfn_to_hva() helpers, a read-only memslot still yields a valid hva.
 */
2186unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2187 gfn_t gfn, bool *writable)
2188{
2189 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2190
2191 if (!kvm_is_error_hva(hva) && writable)
2192 *writable = !memslot_is_readonly(slot);
2193
2194 return hva;
2195}
2196
2197unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2198{
2199 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2200
2201 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2202}
2203
2204unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2205{
2206 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2207
2208 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2209}
2210
2211static inline int check_user_page_hwpoison(unsigned long addr)
2212{
2213 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2214
2215 rc = get_user_pages(addr, 1, flags, NULL, NULL);
2216 return rc == -EHWPOISON;
2217}
2218
/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that runs if we can be in atomic context.
 */
2224static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2225 bool *writable, kvm_pfn_t *pfn)
2226{
2227 struct page *page[1];
2228
	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
2234 if (!(write_fault || writable))
2235 return false;
2236
2237 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2238 *pfn = page_to_pfn(page[0]);
2239
2240 if (writable)
2241 *writable = true;
2242 return true;
2243 }
2244
2245 return false;
2246}
2247
/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
2252static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2253 bool *writable, kvm_pfn_t *pfn)
2254{
2255 unsigned int flags = FOLL_HWPOISON;
2256 struct page *page;
2257 int npages = 0;
2258
2259 might_sleep();
2260
2261 if (writable)
2262 *writable = write_fault;
2263
2264 if (write_fault)
2265 flags |= FOLL_WRITE;
2266 if (async)
2267 flags |= FOLL_NOWAIT;
2268
2269 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2270 if (npages != 1)
2271 return npages;
2272
	/* map read fault as writable if possible */
2274 if (unlikely(!write_fault) && writable) {
2275 struct page *wpage;
2276
2277 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2278 *writable = true;
2279 put_page(page);
2280 page = wpage;
2281 }
2282 }
2283 *pfn = page_to_pfn(page);
2284 return npages;
2285}
2286
2287static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2288{
2289 if (unlikely(!(vma->vm_flags & VM_READ)))
2290 return false;
2291
2292 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2293 return false;
2294
2295 return true;
2296}
2297
2298static int kvm_try_get_pfn(kvm_pfn_t pfn)
2299{
2300 if (kvm_is_reserved_pfn(pfn))
2301 return 1;
2302 return get_page_unless_zero(pfn_to_page(pfn));
2303}
2304
2305static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2306 unsigned long addr, bool *async,
2307 bool write_fault, bool *writable,
2308 kvm_pfn_t *p_pfn)
2309{
2310 kvm_pfn_t pfn;
2311 pte_t *ptep;
2312 spinlock_t *ptl;
2313 int r;
2314
2315 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2316 if (r) {
2317
2318
2319
2320
2321 bool unlocked = false;
2322 r = fixup_user_fault(current->mm, addr,
2323 (write_fault ? FAULT_FLAG_WRITE : 0),
2324 &unlocked);
2325 if (unlocked)
2326 return -EAGAIN;
2327 if (r)
2328 return r;
2329
2330 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2331 if (r)
2332 return r;
2333 }
2334
2335 if (write_fault && !pte_write(*ptep)) {
2336 pfn = KVM_PFN_ERR_RO_FAULT;
2337 goto out;
2338 }
2339
2340 if (writable)
2341 *writable = pte_write(*ptep);
2342 pfn = pte_pfn(*ptep);

	/*
	 * Take a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean() on the
	 * returned pfn.  kvm_try_get_pfn() leaves reserved pfns alone; if the
	 * reference cannot be taken, fail the translation.
	 */
	if (!kvm_try_get_pfn(pfn))
2362 r = -EFAULT;
2363
2364out:
2365 pte_unmap_unlock(ptep, ptl);
2366 *p_pfn = pfn;
2367
2368 return r;
2369}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether this function can sleep
 * @async: whether this function need to wait IO complete if the
 *         host page is not in the memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 */
2385static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2386 bool write_fault, bool *writable)
2387{
2388 struct vm_area_struct *vma;
2389 kvm_pfn_t pfn = 0;
2390 int npages, r;
2391
2392
2393 BUG_ON(atomic && async);
2394
2395 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2396 return pfn;
2397
2398 if (atomic)
2399 return KVM_PFN_ERR_FAULT;
2400
2401 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2402 if (npages == 1)
2403 return pfn;
2404
2405 mmap_read_lock(current->mm);
2406 if (npages == -EHWPOISON ||
2407 (!async && check_user_page_hwpoison(addr))) {
2408 pfn = KVM_PFN_ERR_HWPOISON;
2409 goto exit;
2410 }
2411
2412retry:
2413 vma = vma_lookup(current->mm, addr);
2414
2415 if (vma == NULL)
2416 pfn = KVM_PFN_ERR_FAULT;
2417 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2418 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2419 if (r == -EAGAIN)
2420 goto retry;
2421 if (r < 0)
2422 pfn = KVM_PFN_ERR_FAULT;
2423 } else {
2424 if (async && vma_is_valid(vma, write_fault))
2425 *async = true;
2426 pfn = KVM_PFN_ERR_FAULT;
2427 }
2428exit:
2429 mmap_read_unlock(current->mm);
2430 return pfn;
2431}
2432
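/*
 * Translate a gfn within @slot to a host pfn.  A write fault against a
 * read-only memslot yields KVM_PFN_ERR_RO_FAULT; @hva and @writable, when
 * non-NULL, report the backing host address and whether a writable mapping
 * was set up.
 */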
2433kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2434 bool atomic, bool *async, bool write_fault,
2435 bool *writable, hva_t *hva)
2436{
2437 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2438
2439 if (hva)
2440 *hva = addr;
2441
2442 if (addr == KVM_HVA_ERR_RO_BAD) {
2443 if (writable)
2444 *writable = false;
2445 return KVM_PFN_ERR_RO_FAULT;
2446 }
2447
2448 if (kvm_is_error_hva(addr)) {
2449 if (writable)
2450 *writable = false;
2451 return KVM_PFN_NOSLOT;
2452 }
2453
	/* Do not map writable pfn in the readonly memslot. */
2455 if (writable && memslot_is_readonly(slot)) {
2456 *writable = false;
2457 writable = NULL;
2458 }
2459
2460 return hva_to_pfn(addr, atomic, async, write_fault,
2461 writable);
2462}
2463EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2464
2465kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2466 bool *writable)
2467{
2468 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2469 write_fault, writable, NULL);
2470}
2471EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2472
2473kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2474{
2475 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2476}
2477EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2478
2479kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2480{
2481 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2482}
2483EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2484
2485kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2486{
2487 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2488}
2489EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2490
2491kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2492{
2493 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2494}
2495EXPORT_SYMBOL_GPL(gfn_to_pfn);
2496
2497kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2498{
2499 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2500}
2501EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2502
2503int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2504 struct page **pages, int nr_pages)
2505{
2506 unsigned long addr;
2507 gfn_t entry = 0;
2508
2509 addr = gfn_to_hva_many(slot, gfn, &entry);
2510 if (kvm_is_error_hva(addr))
2511 return -1;
2512
2513 if (entry < nr_pages)
2514 return 0;
2515
2516 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2517}
2518EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2519
2520static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2521{
2522 if (is_error_noslot_pfn(pfn))
2523 return KVM_ERR_PTR_BAD_PAGE;
2524
2525 if (kvm_is_reserved_pfn(pfn)) {
2526 WARN_ON(1);
2527 return KVM_ERR_PTR_BAD_PAGE;
2528 }
2529
2530 return pfn_to_page(pfn);
2531}
2532
2533struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2534{
2535 kvm_pfn_t pfn;
2536
2537 pfn = gfn_to_pfn(kvm, gfn);
2538
2539 return kvm_pfn_to_page(pfn);
2540}
2541EXPORT_SYMBOL_GPL(gfn_to_page);
2542
2543void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
2544{
2545 if (pfn == 0)
2546 return;
2547
2548 if (cache)
2549 cache->pfn = cache->gfn = 0;
2550
2551 if (dirty)
2552 kvm_release_pfn_dirty(pfn);
2553 else
2554 kvm_release_pfn_clean(pfn);
2555}
2556
2557static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
2558 struct gfn_to_pfn_cache *cache, u64 gen)
2559{
2560 kvm_release_pfn(cache->pfn, cache->dirty, cache);
2561
2562 cache->pfn = gfn_to_pfn_memslot(slot, gfn);
2563 cache->gfn = gfn;
2564 cache->dirty = false;
2565 cache->generation = gen;
2566}
2567
2568static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
2569 struct kvm_host_map *map,
2570 struct gfn_to_pfn_cache *cache,
2571 bool atomic)
2572{
2573 kvm_pfn_t pfn;
2574 void *hva = NULL;
2575 struct page *page = KVM_UNMAPPED_PAGE;
2576 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
2577 u64 gen = slots->generation;
2578
2579 if (!map)
2580 return -EINVAL;
2581
2582 if (cache) {
2583 if (!cache->pfn || cache->gfn != gfn ||
2584 cache->generation != gen) {
2585 if (atomic)
2586 return -EAGAIN;
2587 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
2588 }
2589 pfn = cache->pfn;
2590 } else {
2591 if (atomic)
2592 return -EAGAIN;
2593 pfn = gfn_to_pfn_memslot(slot, gfn);
2594 }
2595 if (is_error_noslot_pfn(pfn))
2596 return -EINVAL;
2597
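/*
 * Memory backed by a struct page is mapped with kmap()/kmap_atomic();
 * pfns without a struct page (e.g. MMIO) are mapped with memremap(),
 * which is only possible outside atomic context.
 */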
2598 if (pfn_valid(pfn)) {
2599 page = pfn_to_page(pfn);
2600 if (atomic)
2601 hva = kmap_atomic(page);
2602 else
2603 hva = kmap(page);
2604#ifdef CONFIG_HAS_IOMEM
2605 } else if (!atomic) {
2606 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2607 } else {
2608 return -EINVAL;
2609#endif
2610 }
2611
2612 if (!hva)
2613 return -EFAULT;
2614
2615 map->page = page;
2616 map->hva = hva;
2617 map->pfn = pfn;
2618 map->gfn = gfn;
2619
2620 return 0;
2621}
2622
2623int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
2624 struct gfn_to_pfn_cache *cache, bool atomic)
2625{
2626 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
2627 cache, atomic);
2628}
2629EXPORT_SYMBOL_GPL(kvm_map_gfn);
2630
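/*
 * Illustrative usage of the map/unmap API (hypothetical caller): map a
 * guest page into the kernel, modify it through the host mapping, then
 * unmap it and mark the gfn dirty:
 *
 *	struct kvm_host_map map;
 *
 *	if (kvm_vcpu_map(vcpu, gfn, &map))
 *		return -EFAULT;
 *	memset(map.hva, 0, PAGE_SIZE);
 *	kvm_vcpu_unmap(vcpu, &map, true);
 */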
2631int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2632{
2633 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
2634 NULL, false);
2635}
2636EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2637
2638static void __kvm_unmap_gfn(struct kvm *kvm,
2639 struct kvm_memory_slot *memslot,
2640 struct kvm_host_map *map,
2641 struct gfn_to_pfn_cache *cache,
2642 bool dirty, bool atomic)
2643{
2644 if (!map)
2645 return;
2646
2647 if (!map->hva)
2648 return;
2649
2650 if (map->page != KVM_UNMAPPED_PAGE) {
2651 if (atomic)
2652 kunmap_atomic(map->hva);
2653 else
2654 kunmap(map->page);
2655 }
2656#ifdef CONFIG_HAS_IOMEM
2657 else if (!atomic)
2658 memunmap(map->hva);
2659 else
2660 WARN_ONCE(1, "Unexpected unmapping in atomic context");
2661#endif
2662
2663 if (dirty)
2664 mark_page_dirty_in_slot(kvm, memslot, map->gfn);
2665
2666 if (cache)
2667 cache->dirty |= dirty;
2668 else
2669 kvm_release_pfn(map->pfn, dirty, NULL);
2670
2671 map->hva = NULL;
2672 map->page = NULL;
2673}
2674
2675int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
2676 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
2677{
2678 __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
2679 cache, dirty, atomic);
2680 return 0;
2681}
2682EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2683
2684void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2685{
2686 __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
2687 map, NULL, dirty, false);
2688}
2689EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2690
2691struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2692{
2693 kvm_pfn_t pfn;
2694
2695 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2696
2697 return kvm_pfn_to_page(pfn);
2698}
2699EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2700
2701void kvm_release_page_clean(struct page *page)
2702{
2703 WARN_ON(is_error_page(page));
2704
2705 kvm_release_pfn_clean(page_to_pfn(page));
2706}
2707EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2708
2709void kvm_release_pfn_clean(kvm_pfn_t pfn)
2710{
2711 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2712 put_page(pfn_to_page(pfn));
2713}
2714EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2715
2716void kvm_release_page_dirty(struct page *page)
2717{
2718 WARN_ON(is_error_page(page));
2719
2720 kvm_release_pfn_dirty(page_to_pfn(page));
2721}
2722EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2723
2724void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2725{
2726 kvm_set_pfn_dirty(pfn);
2727 kvm_release_pfn_clean(pfn);
2728}
2729EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2730
2731void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2732{
2733 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2734 SetPageDirty(pfn_to_page(pfn));
2735}
2736EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2737
2738void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2739{
2740 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2741 mark_page_accessed(pfn_to_page(pfn));
2742}
2743EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2744
2745static int next_segment(unsigned long len, int offset)
2746{
2747 if (len > PAGE_SIZE - offset)
2748 return PAGE_SIZE - offset;
2749 else
2750 return len;
2751}
2752
2753static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2754 void *data, int offset, int len)
2755{
2756 int r;
2757 unsigned long addr;
2758
2759 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2760 if (kvm_is_error_hva(addr))
2761 return -EFAULT;
2762 r = __copy_from_user(data, (void __user *)addr + offset, len);
2763 if (r)
2764 return -EFAULT;
2765 return 0;
2766}
2767
2768int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2769 int len)
2770{
2771 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2772
2773 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2774}
2775EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2776
2777int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2778 int offset, int len)
2779{
2780 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2781
2782 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2783}
2784EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2785
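/*
 * Read @len bytes starting at guest physical address @gpa, crossing page
 * and memslot boundaries as needed.  Illustrative usage (the structure
 * name is hypothetical):
 *
 *	struct guest_desc desc;
 *
 *	if (kvm_read_guest(kvm, gpa, &desc, sizeof(desc)))
 *		return -EFAULT;
 */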
2786int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2787{
2788 gfn_t gfn = gpa >> PAGE_SHIFT;
2789 int seg;
2790 int offset = offset_in_page(gpa);
2791 int ret;
2792
2793 while ((seg = next_segment(len, offset)) != 0) {
2794 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2795 if (ret < 0)
2796 return ret;
2797 offset = 0;
2798 len -= seg;
2799 data += seg;
2800 ++gfn;
2801 }
2802 return 0;
2803}
2804EXPORT_SYMBOL_GPL(kvm_read_guest);
2805
2806int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2807{
2808 gfn_t gfn = gpa >> PAGE_SHIFT;
2809 int seg;
2810 int offset = offset_in_page(gpa);
2811 int ret;
2812
2813 while ((seg = next_segment(len, offset)) != 0) {
2814 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2815 if (ret < 0)
2816 return ret;
2817 offset = 0;
2818 len -= seg;
2819 data += seg;
2820 ++gfn;
2821 }
2822 return 0;
2823}
2824EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2825
2826static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2827 void *data, int offset, unsigned long len)
2828{
2829 int r;
2830 unsigned long addr;
2831
2832 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2833 if (kvm_is_error_hva(addr))
2834 return -EFAULT;
2835 pagefault_disable();
2836 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2837 pagefault_enable();
2838 if (r)
2839 return -EFAULT;
2840 return 0;
2841}
2842
2843int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2844 void *data, unsigned long len)
2845{
2846 gfn_t gfn = gpa >> PAGE_SHIFT;
2847 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2848 int offset = offset_in_page(gpa);
2849
2850 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2851}
2852EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2853
2854static int __kvm_write_guest_page(struct kvm *kvm,
2855 struct kvm_memory_slot *memslot, gfn_t gfn,
2856 const void *data, int offset, int len)
2857{
2858 int r;
2859 unsigned long addr;
2860
2861 addr = gfn_to_hva_memslot(memslot, gfn);
2862 if (kvm_is_error_hva(addr))
2863 return -EFAULT;
2864 r = __copy_to_user((void __user *)addr + offset, data, len);
2865 if (r)
2866 return -EFAULT;
2867 mark_page_dirty_in_slot(kvm, memslot, gfn);
2868 return 0;
2869}
2870
2871int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2872 const void *data, int offset, int len)
2873{
2874 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2875
2876 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2877}
2878EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2879
2880int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2881 const void *data, int offset, int len)
2882{
2883 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2884
2885 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2886}
2887EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2888
2889int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2890 unsigned long len)
2891{
2892 gfn_t gfn = gpa >> PAGE_SHIFT;
2893 int seg;
2894 int offset = offset_in_page(gpa);
2895 int ret;
2896
2897 while ((seg = next_segment(len, offset)) != 0) {
2898 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2899 if (ret < 0)
2900 return ret;
2901 offset = 0;
2902 len -= seg;
2903 data += seg;
2904 ++gfn;
2905 }
2906 return 0;
2907}
2908EXPORT_SYMBOL_GPL(kvm_write_guest);
2909
2910int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2911 unsigned long len)
2912{
2913 gfn_t gfn = gpa >> PAGE_SHIFT;
2914 int seg;
2915 int offset = offset_in_page(gpa);
2916 int ret;
2917
2918 while ((seg = next_segment(len, offset)) != 0) {
2919 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2920 if (ret < 0)
2921 return ret;
2922 offset = 0;
2923 len -= seg;
2924 data += seg;
2925 ++gfn;
2926 }
2927 return 0;
2928}
2929EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2930
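/*
 * Precompute the hva for @gpa/@len so that kvm_read_guest_cached() and
 * kvm_write_guest_cached() can skip the memslot lookup while the memslot
 * generation is unchanged.  Illustrative usage (hypothetical caller):
 *
 *	struct gfn_to_hva_cache ghc;
 *	u64 val = 0;
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		return -EFAULT;
 *	if (kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val)))
 *		return -EFAULT;
 */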
2931static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2932 struct gfn_to_hva_cache *ghc,
2933 gpa_t gpa, unsigned long len)
2934{
2935 int offset = offset_in_page(gpa);
2936 gfn_t start_gfn = gpa >> PAGE_SHIFT;
2937 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2938 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2939 gfn_t nr_pages_avail;
2940
2941
2942 ghc->generation = slots->generation;
2943
2944 if (start_gfn > end_gfn) {
2945 ghc->hva = KVM_HVA_ERR_BAD;
2946 return -EINVAL;
2947 }
2948
2949
2950
2951
2952
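/*
 * If the requested region crosses memslots, still verify that the entire
 * region is valid here; gfn_to_hva_many() fails on the first hole.
 */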
2953 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2954 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2955 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2956 &nr_pages_avail);
2957 if (kvm_is_error_hva(ghc->hva))
2958 return -EFAULT;
2959 }
2960
2961
2962 if (nr_pages_needed == 1)
2963 ghc->hva += offset;
2964 else
2965 ghc->memslot = NULL;
2966
2967 ghc->gpa = gpa;
2968 ghc->len = len;
2969 return 0;
2970}
2971
2972int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2973 gpa_t gpa, unsigned long len)
2974{
2975 struct kvm_memslots *slots = kvm_memslots(kvm);
2976 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2977}
2978EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2979
2980int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2981 void *data, unsigned int offset,
2982 unsigned long len)
2983{
2984 struct kvm_memslots *slots = kvm_memslots(kvm);
2985 int r;
2986 gpa_t gpa = ghc->gpa + offset;
2987
2988 BUG_ON(len + offset > ghc->len);
2989
2990 if (slots->generation != ghc->generation) {
2991 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2992 return -EFAULT;
2993 }
2994
2995 if (kvm_is_error_hva(ghc->hva))
2996 return -EFAULT;
2997
2998 if (unlikely(!ghc->memslot))
2999 return kvm_write_guest(kvm, gpa, data, len);
3000
3001 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3002 if (r)
3003 return -EFAULT;
3004 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3005
3006 return 0;
3007}
3008EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3009
3010int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3011 void *data, unsigned long len)
3012{
3013 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3014}
3015EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3016
3017int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3018 void *data, unsigned int offset,
3019 unsigned long len)
3020{
3021 struct kvm_memslots *slots = kvm_memslots(kvm);
3022 int r;
3023 gpa_t gpa = ghc->gpa + offset;
3024
3025 BUG_ON(len + offset > ghc->len);
3026
3027 if (slots->generation != ghc->generation) {
3028 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3029 return -EFAULT;
3030 }
3031
3032 if (kvm_is_error_hva(ghc->hva))
3033 return -EFAULT;
3034
3035 if (unlikely(!ghc->memslot))
3036 return kvm_read_guest(kvm, gpa, data, len);
3037
3038 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3039 if (r)
3040 return -EFAULT;
3041
3042 return 0;
3043}
3044EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3045
3046int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3047 void *data, unsigned long len)
3048{
3049 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3050}
3051EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3052
3053int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3054{
3055 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3056 gfn_t gfn = gpa >> PAGE_SHIFT;
3057 int seg;
3058 int offset = offset_in_page(gpa);
3059 int ret;
3060
3061 while ((seg = next_segment(len, offset)) != 0) {
3062 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);

3063 if (ret < 0)
3064 return ret;
3065 offset = 0;
3066 len -= seg;
3067 ++gfn;
3068 }
3069 return 0;
3070}
3071EXPORT_SYMBOL_GPL(kvm_clear_guest);
3072
3073void mark_page_dirty_in_slot(struct kvm *kvm,
3074 struct kvm_memory_slot *memslot,
3075 gfn_t gfn)
3076{
3077 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3078 unsigned long rel_gfn = gfn - memslot->base_gfn;
3079 u32 slot = (memslot->as_id << 16) | memslot->id;
3080
3081 if (kvm->dirty_ring_size)
3082 kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
3083 slot, rel_gfn);
3084 else
3085 set_bit_le(rel_gfn, memslot->dirty_bitmap);
3086 }
3087}
3088EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3089
3090void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3091{
3092 struct kvm_memory_slot *memslot;
3093
3094 memslot = gfn_to_memslot(kvm, gfn);
3095 mark_page_dirty_in_slot(kvm, memslot, gfn);
3096}
3097EXPORT_SYMBOL_GPL(mark_page_dirty);
3098
3099void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3100{
3101 struct kvm_memory_slot *memslot;
3102
3103 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3104 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3105}
3106EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3107
3108void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3109{
3110 if (!vcpu->sigset_active)
3111 return;
3112
3113
3114
3115
3116
3117
3118
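/*
 * This does a lockless modification of ->real_blocked, which is fine
 * because only the current task ever changes it, and the saved mask is
 * restored by kvm_sigset_deactivate() before returning to userspace.
 */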
3119 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3120}
3121
3122void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3123{
3124 if (!vcpu->sigset_active)
3125 return;
3126
3127 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3128 sigemptyset(&current->real_blocked);
3129}
3130
3131static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3132{
3133 unsigned int old, val, grow, grow_start;
3134
3135 old = val = vcpu->halt_poll_ns;
3136 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3137 grow = READ_ONCE(halt_poll_ns_grow);
3138 if (!grow)
3139 goto out;
3140
3141 val *= grow;
3142 if (val < grow_start)
3143 val = grow_start;
3144
3145 if (val > vcpu->kvm->max_halt_poll_ns)
3146 val = vcpu->kvm->max_halt_poll_ns;
3147
3148 vcpu->halt_poll_ns = val;
3149out:
3150 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3151}
3152
3153static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3154{
3155 unsigned int old, val, shrink, grow_start;
3156
3157 old = val = vcpu->halt_poll_ns;
3158 shrink = READ_ONCE(halt_poll_ns_shrink);
3159 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3160 if (shrink == 0)
3161 val = 0;
3162 else
3163 val /= shrink;
3164
3165 if (val < grow_start)
3166 val = 0;
3167
3168 vcpu->halt_poll_ns = val;
3169 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3170}
3171
3172static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3173{
3174 int ret = -EINTR;
3175 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3176
3177 if (kvm_arch_vcpu_runnable(vcpu)) {
3178 kvm_make_request(KVM_REQ_UNHALT, vcpu);
3179 goto out;
3180 }
3181 if (kvm_cpu_has_pending_timer(vcpu))
3182 goto out;
3183 if (signal_pending(current))
3184 goto out;
3185 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3186 goto out;
3187
3188 ret = 0;
3189out:
3190 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3191 return ret;
3192}
3193
3194static inline void
3195update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3196{
3197 if (waited)
3198 vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
3199 else
3200 vcpu->stat.generic.halt_poll_success_ns += poll_ns;
3201}
3202
3203
3204
3205
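/*
 * The vCPU has executed a halt instruction (or equivalent) with in-kernel
 * halting enabled: poll for a wakeup for up to vcpu->halt_poll_ns, then
 * actually block until the vCPU becomes runnable or is woken up.
 */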
3206void kvm_vcpu_block(struct kvm_vcpu *vcpu)
3207{
3208 ktime_t start, cur, poll_end;
3209 bool waited = false;
3210 u64 block_ns;
3211
3212 kvm_arch_vcpu_blocking(vcpu);
3213
3214 start = cur = poll_end = ktime_get();
3215 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
3216 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
3217
3218 ++vcpu->stat.generic.halt_attempted_poll;
3219 do {
3220
3221
3222
3223
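/*
 * kvm_vcpu_check_block() returns < 0 as soon as the vCPU is runnable
 * (also setting KVM_REQ_UNHALT), has a pending timer or signal, or a
 * KVM_REQ_UNBLOCK request is outstanding.
 */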
3224 if (kvm_vcpu_check_block(vcpu) < 0) {
3225 ++vcpu->stat.generic.halt_successful_poll;
3226 if (!vcpu_valid_wakeup(vcpu))
3227 ++vcpu->stat.generic.halt_poll_invalid;
3228
3229 KVM_STATS_LOG_HIST_UPDATE(
3230 vcpu->stat.generic.halt_poll_success_hist,
3231 ktime_to_ns(ktime_get()) -
3232 ktime_to_ns(start));
3233 goto out;
3234 }
3235 cpu_relax();
3236 poll_end = cur = ktime_get();
3237 } while (kvm_vcpu_can_poll(cur, stop));
3238
3239 KVM_STATS_LOG_HIST_UPDATE(
3240 vcpu->stat.generic.halt_poll_fail_hist,
3241 ktime_to_ns(ktime_get()) - ktime_to_ns(start));
3242 }
3243
3244
3245 prepare_to_rcuwait(&vcpu->wait);
3246 for (;;) {
3247 set_current_state(TASK_INTERRUPTIBLE);
3248
3249 if (kvm_vcpu_check_block(vcpu) < 0)
3250 break;
3251
3252 waited = true;
3253 schedule();
3254 }
3255 finish_rcuwait(&vcpu->wait);
3256 cur = ktime_get();
3257 if (waited) {
3258 vcpu->stat.generic.halt_wait_ns +=
3259 ktime_to_ns(cur) - ktime_to_ns(poll_end);
3260 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3261 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3262 }
3263out:
3264 kvm_arch_vcpu_unblocking(vcpu);
3265 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3266
3267 update_halt_poll_stats(
3268 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3269
3270 if (!kvm_arch_no_poll(vcpu)) {
3271 if (!vcpu_valid_wakeup(vcpu)) {
3272 shrink_halt_poll_ns(vcpu);
3273 } else if (vcpu->kvm->max_halt_poll_ns) {
3274 if (block_ns <= vcpu->halt_poll_ns)
3275 ;
3276
3277 else if (vcpu->halt_poll_ns &&
3278 block_ns > vcpu->kvm->max_halt_poll_ns)
3279 shrink_halt_poll_ns(vcpu);
3280
3281 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3282 block_ns < vcpu->kvm->max_halt_poll_ns)
3283 grow_halt_poll_ns(vcpu);
3284 } else {
3285 vcpu->halt_poll_ns = 0;
3286 }
3287 }
3288
3289 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3290 kvm_arch_vcpu_block_finish(vcpu);
3291}
3292EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3293
3294bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3295{
3296 struct rcuwait *waitp;
3297
3298 waitp = kvm_arch_vcpu_get_wait(vcpu);
3299 if (rcuwait_wake_up(waitp)) {
3300 WRITE_ONCE(vcpu->ready, true);
3301 ++vcpu->stat.generic.halt_wakeup;
3302 return true;
3303 }
3304
3305 return false;
3306}
3307EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3308
3309#ifndef CONFIG_S390
3310
3311
3312
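/*
 * Kick a sleeping vCPU, or force a vCPU running in guest mode to exit,
 * so that it (re)evaluates pending requests.
 */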
3313void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3314{
3315 int me, cpu;
3316
3317 if (kvm_vcpu_wake_up(vcpu))
3318 return;
3319
3320
3321
3322
3323
3324
3325
3326
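/*
 * The vCPU may migrate to another pCPU after kvm_arch_vcpu_should_kick();
 * a stale IPI is harmless, since migrating the vCPU also forces it out of
 * guest mode.
 */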
3327 me = get_cpu();
3328 if (kvm_arch_vcpu_should_kick(vcpu)) {
3329 cpu = READ_ONCE(vcpu->cpu);
3330 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3331 smp_send_reschedule(cpu);
3332 }
3333 put_cpu();
3334}
3335EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3336#endif
3337
3338int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3339{
3340 struct pid *pid;
3341 struct task_struct *task = NULL;
3342 int ret = 0;
3343
3344 rcu_read_lock();
3345 pid = rcu_dereference(target->pid);
3346 if (pid)
3347 task = get_pid_task(pid, PIDTYPE_PID);
3348 rcu_read_unlock();
3349 if (!task)
3350 return ret;
3351 ret = yield_to(task, 1);
3352 put_task_struct(task);
3353
3354 return ret;
3355}
3356EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
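/*
 * Directed-yield eligibility heuristic: prefer vCPUs that have not
 * recently pause-loop-exited (likely preempted lock holders).  A vCPU
 * that did exit but was skipped last round becomes eligible this round;
 * dy_eligible is toggled on every check so that spinning vCPUs take
 * turns being boosted.  The lockless accesses are tolerated because the
 * worst case is merely a suboptimal yield target.
 */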
3380static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3381{
3382#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3383 bool eligible;
3384
3385 eligible = !vcpu->spin_loop.in_spin_loop ||
3386 vcpu->spin_loop.dy_eligible;
3387
3388 if (vcpu->spin_loop.in_spin_loop)
3389 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3390
3391 return eligible;
3392#else
3393 return true;
3394#endif
3395}
3396
3397
3398
3399
3400
3401
3402bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3403{
3404 return kvm_arch_vcpu_runnable(vcpu);
3405}
3406
3407static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3408{
3409 if (kvm_arch_dy_runnable(vcpu))
3410 return true;
3411
3412#ifdef CONFIG_KVM_ASYNC_PF
3413 if (!list_empty_careful(&vcpu->async_pf.done))
3414 return true;
3415#endif
3416
3417 return false;
3418}
3419
3420bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3421{
3422 return false;
3423}
3424
3425void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3426{
3427 struct kvm *kvm = me->kvm;
3428 struct kvm_vcpu *vcpu;
3429 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3430 int yielded = 0;
3431 int try = 3;
3432 int pass;
3433 int i;
3434
3435 kvm_vcpu_set_in_spin_loop(me, true);
3436
3437
3438
3439
3440
3441
3442
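/*
 * Boost a vCPU that is runnable but not currently running: it was likely
 * preempted while holding a lock this vCPU is spinning on.  Scanning
 * starts just after the last boosted vCPU and wraps around on the second
 * pass, so the yields are spread across all vCPUs.
 */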
3443 for (pass = 0; pass < 2 && !yielded && try; pass++) {
3444 kvm_for_each_vcpu(i, vcpu, kvm) {
3445 if (!pass && i <= last_boosted_vcpu) {
3446 i = last_boosted_vcpu;
3447 continue;
3448 } else if (pass && i > last_boosted_vcpu)
3449 break;
3450 if (!READ_ONCE(vcpu->ready))
3451 continue;
3452 if (vcpu == me)
3453 continue;
3454 if (rcuwait_active(&vcpu->wait) &&
3455 !vcpu_dy_runnable(vcpu))
3456 continue;
3457 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3458 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3459 !kvm_arch_vcpu_in_kernel(vcpu))
3460 continue;
3461 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3462 continue;
3463
3464 yielded = kvm_vcpu_yield_to(vcpu);
3465 if (yielded > 0) {
3466 kvm->last_boosted_vcpu = i;
3467 break;
3468 } else if (yielded < 0) {
3469 try--;
3470 if (!try)
3471 break;
3472 }
3473 }
3474 }
3475 kvm_vcpu_set_in_spin_loop(me, false);
3476
3477
3478 kvm_vcpu_set_dy_eligible(me, false);
3479}
3480EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3481
3482static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3483{
3484#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3485 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3486 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3487 kvm->dirty_ring_size / PAGE_SIZE);
3488#else
3489 return false;
3490#endif
3491}
3492
3493static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3494{
3495 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3496 struct page *page;
3497
3498 if (vmf->pgoff == 0)
3499 page = virt_to_page(vcpu->run);
3500#ifdef CONFIG_X86
3501 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3502 page = virt_to_page(vcpu->arch.pio_data);
3503#endif
3504#ifdef CONFIG_KVM_MMIO
3505 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3506 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3507#endif
3508 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3509 page = kvm_dirty_ring_get_page(
3510 &vcpu->dirty_ring,
3511 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3512 else
3513 return kvm_arch_vcpu_fault(vcpu, vmf);
3514 get_page(page);
3515 vmf->page = page;
3516 return 0;
3517}
3518
3519static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3520 .fault = kvm_vcpu_fault,
3521};
3522
3523static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3524{
3525 struct kvm_vcpu *vcpu = file->private_data;
3526 unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3527
3528 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3529 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3530 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3531 return -EINVAL;
3532
3533 vma->vm_ops = &kvm_vcpu_vm_ops;
3534 return 0;
3535}
3536
3537static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3538{
3539 struct kvm_vcpu *vcpu = filp->private_data;
3540
3541 kvm_put_kvm(vcpu->kvm);
3542 return 0;
3543}
3544
3545static struct file_operations kvm_vcpu_fops = {
3546 .release = kvm_vcpu_release,
3547 .unlocked_ioctl = kvm_vcpu_ioctl,
3548 .mmap = kvm_vcpu_mmap,
3549 .llseek = noop_llseek,
3550 KVM_COMPAT(kvm_vcpu_compat_ioctl),
3551};
3552
3553
3554
3555
3556static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3557{
3558 char name[8 + 1 + ITOA_MAX_LEN + 1];
3559
3560 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3561 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3562}
3563
3564static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3565{
3566#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3567 struct dentry *debugfs_dentry;
3568 char dir_name[ITOA_MAX_LEN * 2];
3569
3570 if (!debugfs_initialized())
3571 return;
3572
3573 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3574 debugfs_dentry = debugfs_create_dir(dir_name,
3575 vcpu->kvm->debugfs_dentry);
3576
3577 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3578#endif
3579}
3580
3581
3582
3583
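/*
 * Create a vCPU with the given @id.  Fails if the id is out of range,
 * already in use, or the VM has reached KVM_MAX_VCPUS.
 */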
3584static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3585{
3586 int r;
3587 struct kvm_vcpu *vcpu;
3588 struct page *page;
3589
3590 if (id >= KVM_MAX_VCPU_ID)
3591 return -EINVAL;
3592
3593 mutex_lock(&kvm->lock);
3594 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3595 mutex_unlock(&kvm->lock);
3596 return -EINVAL;
3597 }
3598
3599 kvm->created_vcpus++;
3600 mutex_unlock(&kvm->lock);
3601
3602 r = kvm_arch_vcpu_precreate(kvm, id);
3603 if (r)
3604 goto vcpu_decrement;
3605
3606 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3607 if (!vcpu) {
3608 r = -ENOMEM;
3609 goto vcpu_decrement;
3610 }
3611
3612 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3613 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3614 if (!page) {
3615 r = -ENOMEM;
3616 goto vcpu_free;
3617 }
3618 vcpu->run = page_address(page);
3619
3620 kvm_vcpu_init(vcpu, kvm, id);
3621
3622 r = kvm_arch_vcpu_create(vcpu);
3623 if (r)
3624 goto vcpu_free_run_page;
3625
3626 if (kvm->dirty_ring_size) {
3627 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3628 id, kvm->dirty_ring_size);
3629 if (r)
3630 goto arch_vcpu_destroy;
3631 }
3632
3633 mutex_lock(&kvm->lock);
3634 if (kvm_get_vcpu_by_id(kvm, id)) {
3635 r = -EEXIST;
3636 goto unlock_vcpu_destroy;
3637 }
3638
3639 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3640 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3641
3642
3643 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3644 task_pid_nr(current), id);
3645
3646
3647 kvm_get_kvm(kvm);
3648 r = create_vcpu_fd(vcpu);
3649 if (r < 0) {
3650 kvm_put_kvm_no_destroy(kvm);
3651 goto unlock_vcpu_destroy;
3652 }
3653
3654 kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3655
3656
3657
3658
3659
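/*
 * Pairs with smp_rmb() in kvm_get_vcpu(): the store to kvm->vcpus[] above
 * must be visible before online_vcpus is incremented below.
 */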
3660 smp_wmb();
3661 atomic_inc(&kvm->online_vcpus);
3662
3663 mutex_unlock(&kvm->lock);
3664 kvm_arch_vcpu_postcreate(vcpu);
3665 kvm_create_vcpu_debugfs(vcpu);
3666 return r;
3667
3668unlock_vcpu_destroy:
3669 mutex_unlock(&kvm->lock);
3670 kvm_dirty_ring_free(&vcpu->dirty_ring);
3671arch_vcpu_destroy:
3672 kvm_arch_vcpu_destroy(vcpu);
3673vcpu_free_run_page:
3674 free_page((unsigned long)vcpu->run);
3675vcpu_free:
3676 kmem_cache_free(kvm_vcpu_cache, vcpu);
3677vcpu_decrement:
3678 mutex_lock(&kvm->lock);
3679 kvm->created_vcpus--;
3680 mutex_unlock(&kvm->lock);
3681 return r;
3682}
3683
3684static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3685{
3686 if (sigset) {
3687 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3688 vcpu->sigset_active = 1;
3689 vcpu->sigset = *sigset;
3690 } else
3691 vcpu->sigset_active = 0;
3692 return 0;
3693}
3694
3695static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3696 size_t size, loff_t *offset)
3697{
3698 struct kvm_vcpu *vcpu = file->private_data;
3699
3700 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3701 &kvm_vcpu_stats_desc[0], &vcpu->stat,
3702 sizeof(vcpu->stat), user_buffer, size, offset);
3703}
3704
3705static const struct file_operations kvm_vcpu_stats_fops = {
3706 .read = kvm_vcpu_stats_read,
3707 .llseek = noop_llseek,
3708};
3709
3710static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3711{
3712 int fd;
3713 struct file *file;
3714 char name[15 + ITOA_MAX_LEN + 1];
3715
3716 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3717
3718 fd = get_unused_fd_flags(O_CLOEXEC);
3719 if (fd < 0)
3720 return fd;
3721
3722 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3723 if (IS_ERR(file)) {
3724 put_unused_fd(fd);
3725 return PTR_ERR(file);
3726 }
3727 file->f_mode |= FMODE_PREAD;
3728 fd_install(fd, file);
3729
3730 return fd;
3731}
3732
3733static long kvm_vcpu_ioctl(struct file *filp,
3734 unsigned int ioctl, unsigned long arg)
3735{
3736 struct kvm_vcpu *vcpu = filp->private_data;
3737 void __user *argp = (void __user *)arg;
3738 int r;
3739 struct kvm_fpu *fpu = NULL;
3740 struct kvm_sregs *kvm_sregs = NULL;
3741
3742 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3743 return -EIO;
3744
3745 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3746 return -EINVAL;
3747
3748
3749
3750
3751
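/*
 * Some architecture-specific ioctls are asynchronous to vCPU execution
 * and must not take the vCPU mutex; give the architecture a chance to
 * handle them before the mutex is acquired below.
 */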
3752 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3753 if (r != -ENOIOCTLCMD)
3754 return r;
3755
3756 if (mutex_lock_killable(&vcpu->mutex))
3757 return -EINTR;
3758 switch (ioctl) {
3759 case KVM_RUN: {
3760 struct pid *oldpid;
3761 r = -EINVAL;
3762 if (arg)
3763 goto out;
3764 oldpid = rcu_access_pointer(vcpu->pid);
3765 if (unlikely(oldpid != task_pid(current))) {
3766
3767 struct pid *newpid;
3768
3769 r = kvm_arch_vcpu_run_pid_change(vcpu);
3770 if (r)
3771 break;
3772
3773 newpid = get_task_pid(current, PIDTYPE_PID);
3774 rcu_assign_pointer(vcpu->pid, newpid);
3775 if (oldpid)
3776 synchronize_rcu();
3777 put_pid(oldpid);
3778 }
3779 r = kvm_arch_vcpu_ioctl_run(vcpu);
3780 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3781 break;
3782 }
3783 case KVM_GET_REGS: {
3784 struct kvm_regs *kvm_regs;
3785
3786 r = -ENOMEM;
3787 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3788 if (!kvm_regs)
3789 goto out;
3790 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3791 if (r)
3792 goto out_free1;
3793 r = -EFAULT;
3794 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3795 goto out_free1;
3796 r = 0;
3797out_free1:
3798 kfree(kvm_regs);
3799 break;
3800 }
3801 case KVM_SET_REGS: {
3802 struct kvm_regs *kvm_regs;
3803
3804 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3805 if (IS_ERR(kvm_regs)) {
3806 r = PTR_ERR(kvm_regs);
3807 goto out;
3808 }
3809 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3810 kfree(kvm_regs);
3811 break;
3812 }
3813 case KVM_GET_SREGS: {
3814 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3815 GFP_KERNEL_ACCOUNT);
3816 r = -ENOMEM;
3817 if (!kvm_sregs)
3818 goto out;
3819 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3820 if (r)
3821 goto out;
3822 r = -EFAULT;
3823 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3824 goto out;
3825 r = 0;
3826 break;
3827 }
3828 case KVM_SET_SREGS: {
3829 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3830 if (IS_ERR(kvm_sregs)) {
3831 r = PTR_ERR(kvm_sregs);
3832 kvm_sregs = NULL;
3833 goto out;
3834 }
3835 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3836 break;
3837 }
3838 case KVM_GET_MP_STATE: {
3839 struct kvm_mp_state mp_state;
3840
3841 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3842 if (r)
3843 goto out;
3844 r = -EFAULT;
3845 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3846 goto out;
3847 r = 0;
3848 break;
3849 }
3850 case KVM_SET_MP_STATE: {
3851 struct kvm_mp_state mp_state;
3852
3853 r = -EFAULT;
3854 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3855 goto out;
3856 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3857 break;
3858 }
3859 case KVM_TRANSLATE: {
3860 struct kvm_translation tr;
3861
3862 r = -EFAULT;
3863 if (copy_from_user(&tr, argp, sizeof(tr)))
3864 goto out;
3865 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3866 if (r)
3867 goto out;
3868 r = -EFAULT;
3869 if (copy_to_user(argp, &tr, sizeof(tr)))
3870 goto out;
3871 r = 0;
3872 break;
3873 }
3874 case KVM_SET_GUEST_DEBUG: {
3875 struct kvm_guest_debug dbg;
3876
3877 r = -EFAULT;
3878 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3879 goto out;
3880 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3881 break;
3882 }
3883 case KVM_SET_SIGNAL_MASK: {
3884 struct kvm_signal_mask __user *sigmask_arg = argp;
3885 struct kvm_signal_mask kvm_sigmask;
3886 sigset_t sigset, *p;
3887
3888 p = NULL;
3889 if (argp) {
3890 r = -EFAULT;
3891 if (copy_from_user(&kvm_sigmask, argp,
3892 sizeof(kvm_sigmask)))
3893 goto out;
3894 r = -EINVAL;
3895 if (kvm_sigmask.len != sizeof(sigset))
3896 goto out;
3897 r = -EFAULT;
3898 if (copy_from_user(&sigset, sigmask_arg->sigset,
3899 sizeof(sigset)))
3900 goto out;
3901 p = &sigset;
3902 }
3903 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3904 break;
3905 }
3906 case KVM_GET_FPU: {
3907 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3908 r = -ENOMEM;
3909 if (!fpu)
3910 goto out;
3911 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3912 if (r)
3913 goto out;
3914 r = -EFAULT;
3915 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3916 goto out;
3917 r = 0;
3918 break;
3919 }
3920 case KVM_SET_FPU: {
3921 fpu = memdup_user(argp, sizeof(*fpu));
3922 if (IS_ERR(fpu)) {
3923 r = PTR_ERR(fpu);
3924 fpu = NULL;
3925 goto out;
3926 }
3927 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3928 break;
3929 }
3930 case KVM_GET_STATS_FD: {
3931 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3932 break;
3933 }
3934 default:
3935 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3936 }
3937out:
3938 mutex_unlock(&vcpu->mutex);
3939 kfree(fpu);
3940 kfree(kvm_sregs);
3941 return r;
3942}
3943
3944#ifdef CONFIG_KVM_COMPAT
3945static long kvm_vcpu_compat_ioctl(struct file *filp,
3946 unsigned int ioctl, unsigned long arg)
3947{
3948 struct kvm_vcpu *vcpu = filp->private_data;
3949 void __user *argp = compat_ptr(arg);
3950 int r;
3951
3952 if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3953 return -EIO;
3954
3955 switch (ioctl) {
3956 case KVM_SET_SIGNAL_MASK: {
3957 struct kvm_signal_mask __user *sigmask_arg = argp;
3958 struct kvm_signal_mask kvm_sigmask;
3959 sigset_t sigset;
3960
3961 if (argp) {
3962 r = -EFAULT;
3963 if (copy_from_user(&kvm_sigmask, argp,
3964 sizeof(kvm_sigmask)))
3965 goto out;
3966 r = -EINVAL;
3967 if (kvm_sigmask.len != sizeof(compat_sigset_t))
3968 goto out;
3969 r = -EFAULT;
3970 if (get_compat_sigset(&sigset,
3971 (compat_sigset_t __user *)sigmask_arg->sigset))
3972 goto out;
3973 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3974 } else
3975 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3976 break;
3977 }
3978 default:
3979 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3980 }
3981
3982out:
3983 return r;
3984}
3985#endif
3986
3987static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3988{
3989 struct kvm_device *dev = filp->private_data;
3990
3991 if (dev->ops->mmap)
3992 return dev->ops->mmap(dev, vma);
3993
3994 return -ENODEV;
3995}
3996
3997static int kvm_device_ioctl_attr(struct kvm_device *dev,
3998 int (*accessor)(struct kvm_device *dev,
3999 struct kvm_device_attr *attr),
4000 unsigned long arg)
4001{
4002 struct kvm_device_attr attr;
4003
4004 if (!accessor)
4005 return -EPERM;
4006
4007 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4008 return -EFAULT;
4009
4010 return accessor(dev, &attr);
4011}
4012
4013static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4014 unsigned long arg)
4015{
4016 struct kvm_device *dev = filp->private_data;
4017
4018 if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
4019 return -EIO;
4020
4021 switch (ioctl) {
4022 case KVM_SET_DEVICE_ATTR:
4023 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4024 case KVM_GET_DEVICE_ATTR:
4025 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4026 case KVM_HAS_DEVICE_ATTR:
4027 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4028 default:
4029 if (dev->ops->ioctl)
4030 return dev->ops->ioctl(dev, ioctl, arg);
4031
4032 return -ENOTTY;
4033 }
4034}
4035
4036static int kvm_device_release(struct inode *inode, struct file *filp)
4037{
4038 struct kvm_device *dev = filp->private_data;
4039 struct kvm *kvm = dev->kvm;
4040
4041 if (dev->ops->release) {
4042 mutex_lock(&kvm->lock);
4043 list_del(&dev->vm_node);
4044 dev->ops->release(dev);
4045 mutex_unlock(&kvm->lock);
4046 }
4047
4048 kvm_put_kvm(kvm);
4049 return 0;
4050}
4051
4052static const struct file_operations kvm_device_fops = {
4053 .unlocked_ioctl = kvm_device_ioctl,
4054 .release = kvm_device_release,
4055 KVM_COMPAT(kvm_device_ioctl),
4056 .mmap = kvm_device_mmap,
4057};
4058
4059struct kvm_device *kvm_device_from_filp(struct file *filp)
4060{
4061 if (filp->f_op != &kvm_device_fops)
4062 return NULL;
4063
4064 return filp->private_data;
4065}
4066
4067static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4068#ifdef CONFIG_KVM_MPIC
4069 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4070 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4071#endif
4072};
4073
4074int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4075{
4076 if (type >= ARRAY_SIZE(kvm_device_ops_table))
4077 return -ENOSPC;
4078
4079 if (kvm_device_ops_table[type] != NULL)
4080 return -EEXIST;
4081
4082 kvm_device_ops_table[type] = ops;
4083 return 0;
4084}
4085
4086void kvm_unregister_device_ops(u32 type)
4087{
4088 if (kvm_device_ops_table[type] != NULL)
4089 kvm_device_ops_table[type] = NULL;
4090}
4091
4092static int kvm_ioctl_create_device(struct kvm *kvm,
4093 struct kvm_create_device *cd)
4094{
4095 const struct kvm_device_ops *ops = NULL;
4096 struct kvm_device *dev;
4097 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4098 int type;
4099 int ret;
4100
4101 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4102 return -ENODEV;
4103
4104 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4105 ops = kvm_device_ops_table[type];
4106 if (ops == NULL)
4107 return -ENODEV;
4108
4109 if (test)
4110 return 0;
4111
4112 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4113 if (!dev)
4114 return -ENOMEM;
4115
4116 dev->ops = ops;
4117 dev->kvm = kvm;
4118
4119 mutex_lock(&kvm->lock);
4120 ret = ops->create(dev, type);
4121 if (ret < 0) {
4122 mutex_unlock(&kvm->lock);
4123 kfree(dev);
4124 return ret;
4125 }
4126 list_add(&dev->vm_node, &kvm->devices);
4127 mutex_unlock(&kvm->lock);
4128
4129 if (ops->init)
4130 ops->init(dev);
4131
4132 kvm_get_kvm(kvm);
4133 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4134 if (ret < 0) {
4135 kvm_put_kvm_no_destroy(kvm);
4136 mutex_lock(&kvm->lock);
4137 list_del(&dev->vm_node);
4138 mutex_unlock(&kvm->lock);
4139 ops->destroy(dev);
4140 return ret;
4141 }
4142
4143 cd->fd = ret;
4144 return 0;
4145}
4146
4147static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4148{
4149 switch (arg) {
4150 case KVM_CAP_USER_MEMORY:
4151 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4152 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4153 case KVM_CAP_INTERNAL_ERROR_DATA:
4154#ifdef CONFIG_HAVE_KVM_MSI
4155 case KVM_CAP_SIGNAL_MSI:
4156#endif
4157#ifdef CONFIG_HAVE_KVM_IRQFD
4158 case KVM_CAP_IRQFD:
4159 case KVM_CAP_IRQFD_RESAMPLE:
4160#endif
4161 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4162 case KVM_CAP_CHECK_EXTENSION_VM:
4163 case KVM_CAP_ENABLE_CAP_VM:
4164 case KVM_CAP_HALT_POLL:
4165 return 1;
4166#ifdef CONFIG_KVM_MMIO
4167 case KVM_CAP_COALESCED_MMIO:
4168 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4169 case KVM_CAP_COALESCED_PIO:
4170 return 1;
4171#endif
4172#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4173 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4174 return KVM_DIRTY_LOG_MANUAL_CAPS;
4175#endif
4176#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4177 case KVM_CAP_IRQ_ROUTING:
4178 return KVM_MAX_IRQ_ROUTES;
4179#endif
4180#if KVM_ADDRESS_SPACE_NUM > 1
4181 case KVM_CAP_MULTI_ADDRESS_SPACE:
4182 return KVM_ADDRESS_SPACE_NUM;
4183#endif
4184 case KVM_CAP_NR_MEMSLOTS:
4185 return KVM_USER_MEM_SLOTS;
4186 case KVM_CAP_DIRTY_LOG_RING:
4187#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4188 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4189#else
4190 return 0;
4191#endif
4192 case KVM_CAP_BINARY_STATS_FD:
4193 return 1;
4194 default:
4195 break;
4196 }
4197 return kvm_vm_ioctl_check_extension(kvm, arg);
4198}
4199
4200static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4201{
4202 int r;
4203
4204 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4205 return -EINVAL;
4206
4207
4208 if (!size || (size & (size - 1)))
4209 return -EINVAL;
4210
4211
4212 if (size < kvm_dirty_ring_get_rsvd_entries() *
4213 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4214 return -EINVAL;
4215
4216 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4217 sizeof(struct kvm_dirty_gfn))
4218 return -E2BIG;
4219
4220
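/* The dirty ring can only be enabled once, before any vCPU is created. */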
4221 if (kvm->dirty_ring_size)
4222 return -EINVAL;
4223
4224 mutex_lock(&kvm->lock);
4225
4226 if (kvm->created_vcpus) {
4227
4228 r = -EINVAL;
4229 } else {
4230 kvm->dirty_ring_size = size;
4231 r = 0;
4232 }
4233
4234 mutex_unlock(&kvm->lock);
4235 return r;
4236}
4237
4238static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4239{
4240 int i;
4241 struct kvm_vcpu *vcpu;
4242 int cleared = 0;
4243
4244 if (!kvm->dirty_ring_size)
4245 return -EINVAL;
4246
4247 mutex_lock(&kvm->slots_lock);
4248
4249 kvm_for_each_vcpu(i, vcpu, kvm)
4250 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4251
4252 mutex_unlock(&kvm->slots_lock);
4253
4254 if (cleared)
4255 kvm_flush_remote_tlbs(kvm);
4256
4257 return cleared;
4258}
4259
4260int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4261 struct kvm_enable_cap *cap)
4262{
4263 return -EINVAL;
4264}
4265
4266static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4267 struct kvm_enable_cap *cap)
4268{
4269 switch (cap->cap) {
4270#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4271 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4272 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4273
4274 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4275 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4276
4277 if (cap->flags || (cap->args[0] & ~allowed_options))
4278 return -EINVAL;
4279 kvm->manual_dirty_log_protect = cap->args[0];
4280 return 0;
4281 }
4282#endif
4283 case KVM_CAP_HALT_POLL: {
4284 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4285 return -EINVAL;
4286
4287 kvm->max_halt_poll_ns = cap->args[0];
4288 return 0;
4289 }
4290 case KVM_CAP_DIRTY_LOG_RING:
4291 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4292 default:
4293 return kvm_vm_ioctl_enable_cap(kvm, cap);
4294 }
4295}
4296
4297static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4298 size_t size, loff_t *offset)
4299{
4300 struct kvm *kvm = file->private_data;
4301
4302 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4303 &kvm_vm_stats_desc[0], &kvm->stat,
4304 sizeof(kvm->stat), user_buffer, size, offset);
4305}
4306
4307static const struct file_operations kvm_vm_stats_fops = {
4308 .read = kvm_vm_stats_read,
4309 .llseek = noop_llseek,
4310};
4311
4312static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4313{
4314 int fd;
4315 struct file *file;
4316
4317 fd = get_unused_fd_flags(O_CLOEXEC);
4318 if (fd < 0)
4319 return fd;
4320
4321 file = anon_inode_getfile("kvm-vm-stats",
4322 &kvm_vm_stats_fops, kvm, O_RDONLY);
4323 if (IS_ERR(file)) {
4324 put_unused_fd(fd);
4325 return PTR_ERR(file);
4326 }
4327 file->f_mode |= FMODE_PREAD;
4328 fd_install(fd, file);
4329
4330 return fd;
4331}
4332
4333static long kvm_vm_ioctl(struct file *filp,
4334 unsigned int ioctl, unsigned long arg)
4335{
4336 struct kvm *kvm = filp->private_data;
4337 void __user *argp = (void __user *)arg;
4338 int r;
4339
4340 if (kvm->mm != current->mm || kvm->vm_bugged)
4341 return -EIO;
4342 switch (ioctl) {
4343 case KVM_CREATE_VCPU:
4344 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4345 break;
4346 case KVM_ENABLE_CAP: {
4347 struct kvm_enable_cap cap;
4348
4349 r = -EFAULT;
4350 if (copy_from_user(&cap, argp, sizeof(cap)))
4351 goto out;
4352 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4353 break;
4354 }
4355 case KVM_SET_USER_MEMORY_REGION: {
4356 struct kvm_userspace_memory_region kvm_userspace_mem;
4357
4358 r = -EFAULT;
4359 if (copy_from_user(&kvm_userspace_mem, argp,
4360 sizeof(kvm_userspace_mem)))
4361 goto out;
4362
4363 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4364 break;
4365 }
4366 case KVM_GET_DIRTY_LOG: {
4367 struct kvm_dirty_log log;
4368
4369 r = -EFAULT;
4370 if (copy_from_user(&log, argp, sizeof(log)))
4371 goto out;
4372 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4373 break;
4374 }
4375#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4376 case KVM_CLEAR_DIRTY_LOG: {
4377 struct kvm_clear_dirty_log log;
4378
4379 r = -EFAULT;
4380 if (copy_from_user(&log, argp, sizeof(log)))
4381 goto out;
4382 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4383 break;
4384 }
4385#endif
4386#ifdef CONFIG_KVM_MMIO
4387 case KVM_REGISTER_COALESCED_MMIO: {
4388 struct kvm_coalesced_mmio_zone zone;
4389
4390 r = -EFAULT;
4391 if (copy_from_user(&zone, argp, sizeof(zone)))
4392 goto out;
4393 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4394 break;
4395 }
4396 case KVM_UNREGISTER_COALESCED_MMIO: {
4397 struct kvm_coalesced_mmio_zone zone;
4398
4399 r = -EFAULT;
4400 if (copy_from_user(&zone, argp, sizeof(zone)))
4401 goto out;
4402 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4403 break;
4404 }
4405#endif
4406 case KVM_IRQFD: {
4407 struct kvm_irqfd data;
4408
4409 r = -EFAULT;
4410 if (copy_from_user(&data, argp, sizeof(data)))
4411 goto out;
4412 r = kvm_irqfd(kvm, &data);
4413 break;
4414 }
4415 case KVM_IOEVENTFD: {
4416 struct kvm_ioeventfd data;
4417
4418 r = -EFAULT;
4419 if (copy_from_user(&data, argp, sizeof(data)))
4420 goto out;
4421 r = kvm_ioeventfd(kvm, &data);
4422 break;
4423 }
4424#ifdef CONFIG_HAVE_KVM_MSI
4425 case KVM_SIGNAL_MSI: {
4426 struct kvm_msi msi;
4427
4428 r = -EFAULT;
4429 if (copy_from_user(&msi, argp, sizeof(msi)))
4430 goto out;
4431 r = kvm_send_userspace_msi(kvm, &msi);
4432 break;
4433 }
4434#endif
4435#ifdef __KVM_HAVE_IRQ_LINE
4436 case KVM_IRQ_LINE_STATUS:
4437 case KVM_IRQ_LINE: {
4438 struct kvm_irq_level irq_event;
4439
4440 r = -EFAULT;
4441 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4442 goto out;
4443
4444 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4445 ioctl == KVM_IRQ_LINE_STATUS);
4446 if (r)
4447 goto out;
4448
4449 r = -EFAULT;
4450 if (ioctl == KVM_IRQ_LINE_STATUS) {
4451 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4452 goto out;
4453 }
4454
4455 r = 0;
4456 break;
4457 }
4458#endif
4459#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4460 case KVM_SET_GSI_ROUTING: {
4461 struct kvm_irq_routing routing;
4462 struct kvm_irq_routing __user *urouting;
4463 struct kvm_irq_routing_entry *entries = NULL;
4464
4465 r = -EFAULT;
4466 if (copy_from_user(&routing, argp, sizeof(routing)))
4467 goto out;
4468 r = -EINVAL;
4469 if (!kvm_arch_can_set_irq_routing(kvm))
4470 goto out;
4471 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4472 goto out;
4473 if (routing.flags)
4474 goto out;
4475 if (routing.nr) {
4476 urouting = argp;
4477 entries = vmemdup_user(urouting->entries,
4478 array_size(sizeof(*entries),
4479 routing.nr));
4480 if (IS_ERR(entries)) {
4481 r = PTR_ERR(entries);
4482 goto out;
4483 }
4484 }
4485 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4486 routing.flags);
4487 kvfree(entries);
4488 break;
4489 }
4490#endif
4491 case KVM_CREATE_DEVICE: {
4492 struct kvm_create_device cd;
4493
4494 r = -EFAULT;
4495 if (copy_from_user(&cd, argp, sizeof(cd)))
4496 goto out;
4497
4498 r = kvm_ioctl_create_device(kvm, &cd);
4499 if (r)
4500 goto out;
4501
4502 r = -EFAULT;
4503 if (copy_to_user(argp, &cd, sizeof(cd)))
4504 goto out;
4505
4506 r = 0;
4507 break;
4508 }
4509 case KVM_CHECK_EXTENSION:
4510 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4511 break;
4512 case KVM_RESET_DIRTY_RINGS:
4513 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4514 break;
4515 case KVM_GET_STATS_FD:
4516 r = kvm_vm_ioctl_get_stats_fd(kvm);
4517 break;
4518 default:
4519 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4520 }
4521out:
4522 return r;
4523}
4524
4525#ifdef CONFIG_KVM_COMPAT
4526struct compat_kvm_dirty_log {
4527 __u32 slot;
4528 __u32 padding1;
4529 union {
4530 compat_uptr_t dirty_bitmap;
4531 __u64 padding2;
4532 };
4533};
4534
4535struct compat_kvm_clear_dirty_log {
4536 __u32 slot;
4537 __u32 num_pages;
4538 __u64 first_page;
4539 union {
4540 compat_uptr_t dirty_bitmap;
4541 __u64 padding2;
4542 };
4543};
4544
4545static long kvm_vm_compat_ioctl(struct file *filp,
4546 unsigned int ioctl, unsigned long arg)
4547{
4548 struct kvm *kvm = filp->private_data;
4549 int r;
4550
4551 if (kvm->mm != current->mm || kvm->vm_bugged)
4552 return -EIO;
4553 switch (ioctl) {
4554#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4555 case KVM_CLEAR_DIRTY_LOG: {
4556 struct compat_kvm_clear_dirty_log compat_log;
4557 struct kvm_clear_dirty_log log;
4558
4559 if (copy_from_user(&compat_log, (void __user *)arg,
4560 sizeof(compat_log)))
4561 return -EFAULT;
4562 log.slot = compat_log.slot;
4563 log.num_pages = compat_log.num_pages;
4564 log.first_page = compat_log.first_page;
4565 log.padding2 = compat_log.padding2;
4566 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4567
4568 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4569 break;
4570 }
4571#endif
4572 case KVM_GET_DIRTY_LOG: {
4573 struct compat_kvm_dirty_log compat_log;
4574 struct kvm_dirty_log log;
4575
4576 if (copy_from_user(&compat_log, (void __user *)arg,
4577 sizeof(compat_log)))
4578 return -EFAULT;
4579 log.slot = compat_log.slot;
4580 log.padding1 = compat_log.padding1;
4581 log.padding2 = compat_log.padding2;
4582 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4583
4584 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4585 break;
4586 }
4587 default:
4588 r = kvm_vm_ioctl(filp, ioctl, arg);
4589 }
4590 return r;
4591}
4592#endif
4593
4594static struct file_operations kvm_vm_fops = {
4595 .release = kvm_vm_release,
4596 .unlocked_ioctl = kvm_vm_ioctl,
4597 .llseek = noop_llseek,
4598 KVM_COMPAT(kvm_vm_compat_ioctl),
4599};
4600
4601bool file_is_kvm(struct file *file)
4602{
4603 return file && file->f_op == &kvm_vm_fops;
4604}
4605EXPORT_SYMBOL_GPL(file_is_kvm);
4606
4607static int kvm_dev_ioctl_create_vm(unsigned long type)
4608{
4609 int r;
4610 struct kvm *kvm;
4611 struct file *file;
4612
4613 kvm = kvm_create_vm(type);
4614 if (IS_ERR(kvm))
4615 return PTR_ERR(kvm);
4616#ifdef CONFIG_KVM_MMIO
4617 r = kvm_coalesced_mmio_init(kvm);
4618 if (r < 0)
4619 goto put_kvm;
4620#endif
4621 r = get_unused_fd_flags(O_CLOEXEC);
4622 if (r < 0)
4623 goto put_kvm;
4624
4625 snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4626 "kvm-%d", task_pid_nr(current));
4627
4628 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4629 if (IS_ERR(file)) {
4630 put_unused_fd(r);
4631 r = PTR_ERR(file);
4632 goto put_kvm;
4633 }
4634
4635
4636
4637
4638
4639
4640
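/*
 * From this point on the file owns the reference on @kvm: do not call
 * kvm_put_kvm() here; kvm_vm_release() drops the reference when the last
 * file reference goes away (hence the bare fput() on failure below).
 */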
4641 if (kvm_create_vm_debugfs(kvm, r) < 0) {
4642 put_unused_fd(r);
4643 fput(file);
4644 return -ENOMEM;
4645 }
4646 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4647
4648 fd_install(r, file);
4649 return r;
4650
4651put_kvm:
4652 kvm_put_kvm(kvm);
4653 return r;
4654}
4655
4656static long kvm_dev_ioctl(struct file *filp,
4657 unsigned int ioctl, unsigned long arg)
4658{
4659 long r = -EINVAL;
4660
4661 switch (ioctl) {
4662 case KVM_GET_API_VERSION:
4663 if (arg)
4664 goto out;
4665 r = KVM_API_VERSION;
4666 break;
4667 case KVM_CREATE_VM:
4668 r = kvm_dev_ioctl_create_vm(arg);
4669 break;
4670 case KVM_CHECK_EXTENSION:
4671 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4672 break;
4673 case KVM_GET_VCPU_MMAP_SIZE:
4674 if (arg)
4675 goto out;
4676 r = PAGE_SIZE;
4677#ifdef CONFIG_X86
4678 r += PAGE_SIZE;
4679#endif
4680#ifdef CONFIG_KVM_MMIO
4681 r += PAGE_SIZE;
4682#endif
4683 break;
4684 case KVM_TRACE_ENABLE:
4685 case KVM_TRACE_PAUSE:
4686 case KVM_TRACE_DISABLE:
4687 r = -EOPNOTSUPP;
4688 break;
4689 default:
4690 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4691 }
4692out:
4693 return r;
4694}
4695
4696static struct file_operations kvm_chardev_ops = {
4697 .unlocked_ioctl = kvm_dev_ioctl,
4698 .llseek = noop_llseek,
4699 KVM_COMPAT(kvm_dev_ioctl),
4700};
4701
4702static struct miscdevice kvm_dev = {
4703 KVM_MINOR,
4704 "kvm",
4705 &kvm_chardev_ops,
4706};
4707
4708static void hardware_enable_nolock(void *junk)
4709{
4710 int cpu = raw_smp_processor_id();
4711 int r;
4712
4713 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4714 return;
4715
4716 cpumask_set_cpu(cpu, cpus_hardware_enabled);
4717
4718 r = kvm_arch_hardware_enable();
4719
4720 if (r) {
4721 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4722 atomic_inc(&hardware_enable_failed);
4723 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4724 }
4725}
4726
4727static int kvm_starting_cpu(unsigned int cpu)
4728{
4729 raw_spin_lock(&kvm_count_lock);
4730 if (kvm_usage_count)
4731 hardware_enable_nolock(NULL);
4732 raw_spin_unlock(&kvm_count_lock);
4733 return 0;
4734}
4735
4736static void hardware_disable_nolock(void *junk)
4737{
4738 int cpu = raw_smp_processor_id();
4739
4740 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4741 return;
4742 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4743 kvm_arch_hardware_disable();
4744}
4745
4746static int kvm_dying_cpu(unsigned int cpu)
4747{
4748 raw_spin_lock(&kvm_count_lock);
4749 if (kvm_usage_count)
4750 hardware_disable_nolock(NULL);
4751 raw_spin_unlock(&kvm_count_lock);
4752 return 0;
4753}
4754
4755static void hardware_disable_all_nolock(void)
4756{
4757 BUG_ON(!kvm_usage_count);
4758
4759 kvm_usage_count--;
4760 if (!kvm_usage_count)
4761 on_each_cpu(hardware_disable_nolock, NULL, 1);
4762}
4763
4764static void hardware_disable_all(void)
4765{
4766 raw_spin_lock(&kvm_count_lock);
4767 hardware_disable_all_nolock();
4768 raw_spin_unlock(&kvm_count_lock);
4769}
4770
4771static int hardware_enable_all(void)
4772{
4773 int r = 0;
4774
4775 raw_spin_lock(&kvm_count_lock);
4776
4777 kvm_usage_count++;
4778 if (kvm_usage_count == 1) {
4779 atomic_set(&hardware_enable_failed, 0);
4780 on_each_cpu(hardware_enable_nolock, NULL, 1);
4781
4782 if (atomic_read(&hardware_enable_failed)) {
4783 hardware_disable_all_nolock();
4784 r = -EBUSY;
4785 }
4786 }
4787
4788 raw_spin_unlock(&kvm_count_lock);
4789
4790 return r;
4791}
4792
4793static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4794 void *v)
4795{
4796
4797
4798
4799
4800
4801
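/*
 * Disable hardware virtualization on every CPU before rebooting: some
 * firmware is known to hang on reboot if a CPU is left in VMX root mode.
 * kvm_rebooting tells the rest of KVM that the disable is deliberate.
 */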
4802 pr_info("kvm: exiting hardware virtualization\n");
4803 kvm_rebooting = true;
4804 on_each_cpu(hardware_disable_nolock, NULL, 1);
4805 return NOTIFY_OK;
4806}
4807
4808static struct notifier_block kvm_reboot_notifier = {
4809 .notifier_call = kvm_reboot,
4810 .priority = 0,
4811};
4812
4813static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4814{
4815 int i;
4816
4817 for (i = 0; i < bus->dev_count; i++) {
4818 struct kvm_io_device *pos = bus->range[i].dev;
4819
4820 kvm_iodevice_destructor(pos);
4821 }
4822 kfree(bus);
4823}
4824
4825static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4826 const struct kvm_io_range *r2)
4827{
4828 gpa_t addr1 = r1->addr;
4829 gpa_t addr2 = r2->addr;
4830
4831 if (addr1 < addr2)
4832 return -1;
4833
4834
4835
4836
4837
4838
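/*
 * Here @r1 is the access being looked up and @r2 a registered range: a
 * zero-length @r2 must match the address exactly, otherwise the access
 * compares equal when it is fully contained in the registered range.
 * kvm_io_bus_get_first_dev() rewinds to the first equal entry so that
 * every matching device gets visited.
 */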
4839 if (r2->len) {
4840 addr1 += r1->len;
4841 addr2 += r2->len;
4842 }
4843
4844 if (addr1 > addr2)
4845 return 1;
4846
4847 return 0;
4848}
4849
4850static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4851{
4852 return kvm_io_bus_cmp(p1, p2);
4853}
4854
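/*
 * Return the index of the first device on the bus whose range matches
 * [addr, addr+len).  bus->range is kept sorted, so bsearch() finds some
 * matching entry and we then walk backwards to the first one, allowing
 * callers to iterate over all overlapping devices.  Returns -ENOENT if
 * nothing matches.
 */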
4855static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4856 gpa_t addr, int len)
4857{
4858 struct kvm_io_range *range, key;
4859 int off;
4860
4861 key = (struct kvm_io_range) {
4862 .addr = addr,
4863 .len = len,
4864 };
4865
4866 range = bsearch(&key, bus->range, bus->dev_count,
4867 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4868 if (range == NULL)
4869 return -ENOENT;
4870
4871 off = range - bus->range;
4872
4873 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4874 off--;
4875
4876 return off;
4877}
4878
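/*
 * Offer the write to the matching devices in address order, stopping at
 * the first one that accepts it (kvm_iodevice_write() returning 0).
 * Returns that device's index, or -EOPNOTSUPP if nobody claims the access.
 */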
4879static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4880 struct kvm_io_range *range, const void *val)
4881{
4882 int idx;
4883
4884 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4885 if (idx < 0)
4886 return -EOPNOTSUPP;
4887
4888 while (idx < bus->dev_count &&
4889 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4890 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4891 range->len, val))
4892 return idx;
4893 idx++;
4894 }
4895
4896 return -EOPNOTSUPP;
4897}
4898
4899 /* kvm_io_bus_write - dispatch a write to @bus_idx; caller must hold the kvm->srcu read lock. */
4900int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4901 int len, const void *val)
4902{
4903 struct kvm_io_bus *bus;
4904 struct kvm_io_range range;
4905 int r;
4906
4907 range = (struct kvm_io_range) {
4908 .addr = addr,
4909 .len = len,
4910 };
4911
4912 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4913 if (!bus)
4914 return -ENOMEM;
4915 r = __kvm_io_bus_write(vcpu, bus, &range, val);
4916 return r < 0 ? r : 0;
4917}
4918EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4919
4920 /* As kvm_io_bus_write(), but try the device identified by @cookie first. */
4921int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4922 gpa_t addr, int len, const void *val, long cookie)
4923{
4924 struct kvm_io_bus *bus;
4925 struct kvm_io_range range;
4926
4927 range = (struct kvm_io_range) {
4928 .addr = addr,
4929 .len = len,
4930 };
4931
4932 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4933 if (!bus)
4934 return -ENOMEM;
4935
4936 /* First try the device referenced by cookie. */
4937 if ((cookie >= 0) && (cookie < bus->dev_count) &&
4938 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4939 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4940 val))
4941 return cookie;
4942
4943 /*
4944  * The cookie did not match (or contained garbage); fall back to a
4945  * full search and return the first device that accepts the write.
4946  */
4947 return __kvm_io_bus_write(vcpu, bus, &range, val);
4948}
4949
4950static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4951 struct kvm_io_range *range, void *val)
4952{
4953 int idx;
4954
4955 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4956 if (idx < 0)
4957 return -EOPNOTSUPP;
4958
4959 while (idx < bus->dev_count &&
4960 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4961 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4962 range->len, val))
4963 return idx;
4964 idx++;
4965 }
4966
4967 return -EOPNOTSUPP;
4968}
4969
4970 /* kvm_io_bus_read - dispatch a read to @bus_idx; caller must hold the kvm->srcu read lock. */
4971int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4972 int len, void *val)
4973{
4974 struct kvm_io_bus *bus;
4975 struct kvm_io_range range;
4976 int r;
4977
4978 range = (struct kvm_io_range) {
4979 .addr = addr,
4980 .len = len,
4981 };
4982
4983 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4984 if (!bus)
4985 return -ENOMEM;
4986 r = __kvm_io_bus_read(vcpu, bus, &range, val);
4987 return r < 0 ? r : 0;
4988}
4989
4990 /* Register @dev for [addr, addr+len) on @bus_idx; caller must hold kvm->slots_lock. */
4991int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4992 int len, struct kvm_io_device *dev)
4993{
4994 int i;
4995 struct kvm_io_bus *new_bus, *bus;
4996 struct kvm_io_range range;
4997
4998 bus = kvm_get_bus(kvm, bus_idx);
4999 if (!bus)
5000 return -ENOMEM;
5001
5002 /* Only non-ioeventfd devices count against NR_IOBUS_DEVS; ioeventfds are already bounded by the fd limit. */
5003 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5004 return -ENOSPC;
5005
5006 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5007 GFP_KERNEL_ACCOUNT);
5008 if (!new_bus)
5009 return -ENOMEM;
5010
5011 range = (struct kvm_io_range) {
5012 .addr = addr,
5013 .len = len,
5014 .dev = dev,
5015 };
5016
5017 for (i = 0; i < bus->dev_count; i++)
5018 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5019 break;
5020
5021 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5022 new_bus->dev_count++;
5023 new_bus->range[i] = range;
5024 memcpy(new_bus->range + i + 1, bus->range + i,
5025 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5026 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5027 synchronize_srcu_expedited(&kvm->srcu);
5028 kfree(bus);
5029
5030 return 0;
5031}
5032
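/*
 * Remove @dev by publishing a copy of the bus without it, then waiting for
 * SRCU readers to drain before freeing the old bus.  If the smaller copy
 * cannot be allocated, a NULL bus is published instead, the remaining
 * devices are destroyed, and -ENOMEM is returned.
 */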
5033int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5034 struct kvm_io_device *dev)
5035{
5036 int i, j;
5037 struct kvm_io_bus *new_bus, *bus;
5038
5039 lockdep_assert_held(&kvm->slots_lock);
5040
5041 bus = kvm_get_bus(kvm, bus_idx);
5042 if (!bus)
5043 return 0;
5044
5045 for (i = 0; i < bus->dev_count; i++) {
5046 if (bus->range[i].dev == dev) {
5047 break;
5048 }
5049 }
5050
5051 if (i == bus->dev_count)
5052 return 0;
5053
5054 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5055 GFP_KERNEL_ACCOUNT);
5056 if (new_bus) {
5057 memcpy(new_bus, bus, struct_size(bus, range, i));
5058 new_bus->dev_count--;
5059 memcpy(new_bus->range + i, bus->range + i + 1,
5060 flex_array_size(new_bus, range, new_bus->dev_count - i));
5061 }
5062
5063 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5064 synchronize_srcu_expedited(&kvm->srcu);
5065
5066 /* If the shrunk copy could not be allocated, a NULL bus was just installed: destroy all remaining devices. */
5067 if (!new_bus) {
5068 pr_err("kvm: failed to shrink bus, removing it completely\n");
5069 for (j = 0; j < bus->dev_count; j++) {
5070 if (j == i)
5071 continue;
5072 kvm_iodevice_destructor(bus->range[j].dev);
5073 }
5074 }
5075
5076 kfree(bus);
5077 return new_bus ? 0 : -ENOMEM;
5078}
5079
5080struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5081 gpa_t addr)
5082{
5083 struct kvm_io_bus *bus;
5084 int dev_idx, srcu_idx;
5085 struct kvm_io_device *iodev = NULL;
5086
5087 srcu_idx = srcu_read_lock(&kvm->srcu);
5088
5089 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5090 if (!bus)
5091 goto out_unlock;
5092
5093 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5094 if (dev_idx < 0)
5095 goto out_unlock;
5096
5097 iodev = bus->range[dev_idx].dev;
5098
5099out_unlock:
5100 srcu_read_unlock(&kvm->srcu, srcu_idx);
5101
5102 return iodev;
5103}
5104EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5105
5106static int kvm_debugfs_open(struct inode *inode, struct file *file,
5107 int (*get)(void *, u64 *), int (*set)(void *, u64),
5108 const char *fmt)
5109{
5110 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5111 inode->i_private;
5112
5113 /*
5114  * Each open stats file holds a reference on the kvm struct;
5115  * kvm_get_kvm_safe() fails if the VM is already being destroyed,
5116  * closing the race with removal of the debugfs directory.
5117  */
5118 if (!kvm_get_kvm_safe(stat_data->kvm))
5119 return -ENOENT;
5120
5121 if (simple_attr_open(inode, file, get,
5122 kvm_stats_debugfs_mode(stat_data->desc) & 0222
5123 ? set : NULL,
5124 fmt)) {
5125 kvm_put_kvm(stat_data->kvm);
5126 return -ENOMEM;
5127 }
5128
5129 return 0;
5130}
5131
5132static int kvm_debugfs_release(struct inode *inode, struct file *file)
5133{
5134 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5135 inode->i_private;
5136
5137 simple_attr_release(inode, file);
5138 kvm_put_kvm(stat_data->kvm);
5139
5140 return 0;
5141}
5142
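/*
 * Per-VM and per-vCPU stat accessors.  Each statistic is identified by its
 * byte offset into struct kvm::stat or struct kvm_vcpu::stat; the vCPU
 * variants aggregate (or clear) the value across every vCPU of the VM.
 */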
5143static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5144{
5145 *val = *(u64 *)((void *)(&kvm->stat) + offset);
5146
5147 return 0;
5148}
5149
5150static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5151{
5152 *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5153
5154 return 0;
5155}
5156
5157static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5158{
5159 int i;
5160 struct kvm_vcpu *vcpu;
5161
5162 *val = 0;
5163
5164 kvm_for_each_vcpu(i, vcpu, kvm)
5165 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5166
5167 return 0;
5168}
5169
5170static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5171{
5172 int i;
5173 struct kvm_vcpu *vcpu;
5174
5175 kvm_for_each_vcpu(i, vcpu, kvm)
5176 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5177
5178 return 0;
5179}
5180
5181static int kvm_stat_data_get(void *data, u64 *val)
5182{
5183 int r = -EFAULT;
5184 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5185
5186 switch (stat_data->kind) {
5187 case KVM_STAT_VM:
5188 r = kvm_get_stat_per_vm(stat_data->kvm,
5189 stat_data->desc->desc.offset, val);
5190 break;
5191 case KVM_STAT_VCPU:
5192 r = kvm_get_stat_per_vcpu(stat_data->kvm,
5193 stat_data->desc->desc.offset, val);
5194 break;
5195 }
5196
5197 return r;
5198}
5199
5200static int kvm_stat_data_clear(void *data, u64 val)
5201{
5202 int r = -EFAULT;
5203 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5204
5205 if (val)
5206 return -EINVAL;
5207
5208 switch (stat_data->kind) {
5209 case KVM_STAT_VM:
5210 r = kvm_clear_stat_per_vm(stat_data->kvm,
5211 stat_data->desc->desc.offset);
5212 break;
5213 case KVM_STAT_VCPU:
5214 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5215 stat_data->desc->desc.offset);
5216 break;
5217 }
5218
5219 return r;
5220}
5221
5222static int kvm_stat_data_open(struct inode *inode, struct file *file)
5223{
5224 __simple_attr_check_format("%llu\n", 0ull);
5225 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5226 kvm_stat_data_clear, "%llu\n");
5227}
5228
5229static const struct file_operations stat_fops_per_vm = {
5230 .owner = THIS_MODULE,
5231 .open = kvm_stat_data_open,
5232 .release = kvm_debugfs_release,
5233 .read = simple_attr_read,
5234 .write = simple_attr_write,
5235 .llseek = no_llseek,
5236};
5237
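/*
 * Global debugfs stats: sum (or clear) the per-VM/per-vCPU counters across
 * all VMs on vm_list, under kvm_lock.
 */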
5238static int vm_stat_get(void *_offset, u64 *val)
5239{
5240 unsigned offset = (long)_offset;
5241 struct kvm *kvm;
5242 u64 tmp_val;
5243
5244 *val = 0;
5245 mutex_lock(&kvm_lock);
5246 list_for_each_entry(kvm, &vm_list, vm_list) {
5247 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5248 *val += tmp_val;
5249 }
5250 mutex_unlock(&kvm_lock);
5251 return 0;
5252}
5253
5254static int vm_stat_clear(void *_offset, u64 val)
5255{
5256 unsigned offset = (long)_offset;
5257 struct kvm *kvm;
5258
5259 if (val)
5260 return -EINVAL;
5261
5262 mutex_lock(&kvm_lock);
5263 list_for_each_entry(kvm, &vm_list, vm_list) {
5264 kvm_clear_stat_per_vm(kvm, offset);
5265 }
5266 mutex_unlock(&kvm_lock);
5267
5268 return 0;
5269}
5270
5271DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5272DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5273
5274static int vcpu_stat_get(void *_offset, u64 *val)
5275{
5276 unsigned offset = (long)_offset;
5277 struct kvm *kvm;
5278 u64 tmp_val;
5279
5280 *val = 0;
5281 mutex_lock(&kvm_lock);
5282 list_for_each_entry(kvm, &vm_list, vm_list) {
5283 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5284 *val += tmp_val;
5285 }
5286 mutex_unlock(&kvm_lock);
5287 return 0;
5288}
5289
5290static int vcpu_stat_clear(void *_offset, u64 val)
5291{
5292 unsigned offset = (long)_offset;
5293 struct kvm *kvm;
5294
5295 if (val)
5296 return -EINVAL;
5297
5298 mutex_lock(&kvm_lock);
5299 list_for_each_entry(kvm, &vm_list, vm_list) {
5300 kvm_clear_stat_per_vcpu(kvm, offset);
5301 }
5302 mutex_unlock(&kvm_lock);
5303
5304 return 0;
5305}
5306
5307DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5308 "%llu\n");
5309DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5310
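/*
 * Send a KOBJ_CHANGE uevent on the /dev/kvm device when a VM is created or
 * destroyed, reporting how many VMs were ever created (CREATED), how many
 * are currently active (COUNT), the owning PID, and the VM's debugfs stats
 * path (STATS_PATH) when available.
 */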
5311static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5312{
5313 struct kobj_uevent_env *env;
5314 unsigned long long created, active;
5315
5316 if (!kvm_dev.this_device || !kvm)
5317 return;
5318
5319 mutex_lock(&kvm_lock);
5320 if (type == KVM_EVENT_CREATE_VM) {
5321 kvm_createvm_count++;
5322 kvm_active_vms++;
5323 } else if (type == KVM_EVENT_DESTROY_VM) {
5324 kvm_active_vms--;
5325 }
5326 created = kvm_createvm_count;
5327 active = kvm_active_vms;
5328 mutex_unlock(&kvm_lock);
5329
5330 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5331 if (!env)
5332 return;
5333
5334 add_uevent_var(env, "CREATED=%llu", created);
5335 add_uevent_var(env, "COUNT=%llu", active);
5336
5337 if (type == KVM_EVENT_CREATE_VM) {
5338 add_uevent_var(env, "EVENT=create");
5339 kvm->userspace_pid = task_pid_nr(current);
5340 } else if (type == KVM_EVENT_DESTROY_VM) {
5341 add_uevent_var(env, "EVENT=destroy");
5342 }
5343 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5344
5345 if (kvm->debugfs_dentry) {
5346 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5347
5348 if (p) {
5349 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5350 if (!IS_ERR(tmp))
5351 add_uevent_var(env, "STATS_PATH=%s", tmp);
5352 kfree(p);
5353 }
5354 }
5355
5356 env->envp[env->envp_idx++] = NULL;
5357 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5358 kfree(env);
5359}
5360
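/*
 * Create the "kvm" debugfs directory with one file per VM statistic and one
 * per vCPU statistic.  Stats whose mode includes a write bit get the
 * clearable fops, the rest are read-only.
 */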
5361static void kvm_init_debug(void)
5362{
5363 const struct file_operations *fops;
5364 const struct _kvm_stats_desc *pdesc;
5365 int i;
5366
5367 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5368
5369 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5370 pdesc = &kvm_vm_stats_desc[i];
5371 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5372 fops = &vm_stat_fops;
5373 else
5374 fops = &vm_stat_readonly_fops;
5375 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5376 kvm_debugfs_dir,
5377 (void *)(long)pdesc->desc.offset, fops);
5378 }
5379
5380 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5381 pdesc = &kvm_vcpu_stats_desc[i];
5382 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5383 fops = &vcpu_stat_fops;
5384 else
5385 fops = &vcpu_stat_readonly_fops;
5386 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5387 kvm_debugfs_dir,
5388 (void *)(long)pdesc->desc.offset, fops);
5389 }
5390}
5391
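/*
 * Syscore hooks: disable hardware virtualization across suspend and
 * re-enable it on resume, but only while VMs exist.  Syscore ops run on a
 * single CPU with interrupts disabled, hence the nolock variants.
 */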
5392static int kvm_suspend(void)
5393{
5394 if (kvm_usage_count)
5395 hardware_disable_nolock(NULL);
5396 return 0;
5397}
5398
5399static void kvm_resume(void)
5400{
5401 if (kvm_usage_count) {
5402#ifdef CONFIG_LOCKDEP
5403 WARN_ON(lockdep_is_held(&kvm_count_lock));
5404#endif
5405 hardware_enable_nolock(NULL);
5406 }
5407}
5408
5409static struct syscore_ops kvm_syscore_ops = {
5410 .suspend = kvm_suspend,
5411 .resume = kvm_resume,
5412};
5413
5414static inline
5415struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5416{
5417 return container_of(pn, struct kvm_vcpu, preempt_notifier);
5418}
5419
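/*
 * Preempt notifier hooks: when the task backing a vCPU is scheduled in or
 * out, load/put the vCPU's architecture state and keep the per-CPU
 * kvm_running_vcpu pointer current.  A vCPU preempted while still runnable
 * is marked both preempted and ready.
 */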
5420static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5421{
5422 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5423
5424 WRITE_ONCE(vcpu->preempted, false);
5425 WRITE_ONCE(vcpu->ready, false);
5426
5427 __this_cpu_write(kvm_running_vcpu, vcpu);
5428 kvm_arch_sched_in(vcpu, cpu);
5429 kvm_arch_vcpu_load(vcpu, cpu);
5430}
5431
5432static void kvm_sched_out(struct preempt_notifier *pn,
5433 struct task_struct *next)
5434{
5435 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5436
5437 if (current->on_rq) {
5438 WRITE_ONCE(vcpu->preempted, true);
5439 WRITE_ONCE(vcpu->ready, true);
5440 }
5441 kvm_arch_vcpu_put(vcpu);
5442 __this_cpu_write(kvm_running_vcpu, NULL);
5443}
5444
5445 /**
5446  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5447  *
5448  * The per-CPU variable is read with preemption disabled, but the result
5449  * may be used after preemption is re-enabled: even if the task migrates
5450  * to another CPU, re-reading the variable there returns the same vcpu,
5451  * because kvm_running_vcpu is only updated by the preempt notifiers of
5452  * the task that owns the vcpu.
5453  */
5454struct kvm_vcpu *kvm_get_running_vcpu(void)
5455{
5456 struct kvm_vcpu *vcpu;
5457
5458 preempt_disable();
5459 vcpu = __this_cpu_read(kvm_running_vcpu);
5460 preempt_enable();
5461
5462 return vcpu;
5463}
5464EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5465
5466 /**
5467  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5468  */
5469struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5470{
5471 return &kvm_running_vcpu;
5472}
5473
5474struct kvm_cpu_compat_check {
5475 void *opaque;
5476 int *ret;
5477};
5478
5479static void check_processor_compat(void *data)
5480{
5481 struct kvm_cpu_compat_check *c = data;
5482
5483 *c->ret = kvm_arch_check_processor_compat(c->opaque);
5484}
5485
5486int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5487 struct module *module)
5488{
5489 struct kvm_cpu_compat_check c;
5490 int r;
5491 int cpu;
5492
5493 r = kvm_arch_init(opaque);
5494 if (r)
5495 goto out_fail;
5496
5497 /*
5498  * kvm_arch_init() makes sure there is at most one caller for
5499  * architectures that support multiple implementations, such as Intel
5500  * and AMD on x86.  It must run before kvm_irqfd_init() to avoid
5501  * creating conflicts in case KVM is already set up for another
5502  * implementation.
5503  */
5504 r = kvm_irqfd_init();
5505 if (r)
5506 goto out_irqfd;
5507
5508 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5509 r = -ENOMEM;
5510 goto out_free_0;
5511 }
5512
5513 r = kvm_arch_hardware_setup(opaque);
5514 if (r < 0)
5515 goto out_free_1;
5516
5517 c.ret = &r;
5518 c.opaque = opaque;
5519 for_each_online_cpu(cpu) {
5520 smp_call_function_single(cpu, check_processor_compat, &c, 1);
5521 if (r < 0)
5522 goto out_free_2;
5523 }
5524
5525 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5526 kvm_starting_cpu, kvm_dying_cpu);
5527 if (r)
5528 goto out_free_2;
5529 register_reboot_notifier(&kvm_reboot_notifier);
5530
5531 /* A dedicated kmem cache lets us honour the arch's vcpu alignment requirement. */
5532 if (!vcpu_align)
5533 vcpu_align = __alignof__(struct kvm_vcpu);
5534 kvm_vcpu_cache =
5535 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5536 SLAB_ACCOUNT,
5537 offsetof(struct kvm_vcpu, arch),
5538 offsetofend(struct kvm_vcpu, stats_id)
5539 - offsetof(struct kvm_vcpu, arch),
5540 NULL);
5541 if (!kvm_vcpu_cache) {
5542 r = -ENOMEM;
5543 goto out_free_3;
5544 }
5545
5546 r = kvm_async_pf_init();
5547 if (r)
5548 goto out_free;
5549
5550 kvm_chardev_ops.owner = module;
5551 kvm_vm_fops.owner = module;
5552 kvm_vcpu_fops.owner = module;
5553
5554 r = misc_register(&kvm_dev);
5555 if (r) {
5556 pr_err("kvm: misc device register failed\n");
5557 goto out_unreg;
5558 }
5559
5560 register_syscore_ops(&kvm_syscore_ops);
5561
5562 kvm_preempt_ops.sched_in = kvm_sched_in;
5563 kvm_preempt_ops.sched_out = kvm_sched_out;
5564
5565 kvm_init_debug();
5566
5567 r = kvm_vfio_ops_init();
5568 WARN_ON(r);
5569
5570 return 0;
5571
5572out_unreg:
5573 kvm_async_pf_deinit();
5574out_free:
5575 kmem_cache_destroy(kvm_vcpu_cache);
5576out_free_3:
5577 unregister_reboot_notifier(&kvm_reboot_notifier);
5578 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5579out_free_2:
5580 kvm_arch_hardware_unsetup();
5581out_free_1:
5582 free_cpumask_var(cpus_hardware_enabled);
5583out_free_0:
5584 kvm_irqfd_exit();
5585out_irqfd:
5586 kvm_arch_exit();
5587out_fail:
5588 return r;
5589}
5590EXPORT_SYMBOL_GPL(kvm_init);
5591
5592void kvm_exit(void)
5593{
5594 debugfs_remove_recursive(kvm_debugfs_dir);
5595 misc_deregister(&kvm_dev);
5596 kmem_cache_destroy(kvm_vcpu_cache);
5597 kvm_async_pf_deinit();
5598 unregister_syscore_ops(&kvm_syscore_ops);
5599 unregister_reboot_notifier(&kvm_reboot_notifier);
5600 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5601 on_each_cpu(hardware_disable_nolock, NULL, 1);
5602 kvm_arch_hardware_unsetup();
5603 kvm_arch_exit();
5604 kvm_irqfd_exit();
5605 free_cpumask_var(cpus_hardware_enabled);
5606 kvm_vfio_ops_exit();
5607}
5608EXPORT_SYMBOL_GPL(kvm_exit);
5609
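/*
 * Helpers for creating a kthread that does work on behalf of a VM.  The
 * context below is handed to the new thread, which attaches itself to the
 * parent's cgroups and nice level before completing init_done; the caller
 * only receives the task_struct if that setup succeeded.
 */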
5610struct kvm_vm_worker_thread_context {
5611 struct kvm *kvm;
5612 struct task_struct *parent;
5613 struct completion init_done;
5614 kvm_vm_thread_fn_t thread_fn;
5615 uintptr_t data;
5616 int err;
5617};
5618
5619static int kvm_vm_worker_thread(void *context)
5620{
5621 /*
5622  * init_context lives on the parent thread's stack, so locally copy
5623  * everything needed beyond initialization before completing init_done.
5624  */
5625 struct kvm_vm_worker_thread_context *init_context = context;
5626 struct kvm *kvm = init_context->kvm;
5627 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5628 uintptr_t data = init_context->data;
5629 int err;
5630
5631 err = kthread_park(current);
5632 /* kthread_park(current) is never supposed to return an error. */
5633 WARN_ON(err != 0);
5634 if (err)
5635 goto init_complete;
5636
5637 err = cgroup_attach_task_all(init_context->parent, current);
5638 if (err) {
5639 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5640 __func__, err);
5641 goto init_complete;
5642 }
5643
5644 set_user_nice(current, task_nice(init_context->parent));
5645
5646init_complete:
5647 init_context->err = err;
5648 complete(&init_context->init_done);
5649 init_context = NULL;
5650
5651 if (err)
5652 return err;
5653
5654 /* Wait to be woken up by the caller before starting to run. */
5655 kthread_parkme();
5656
5657 if (!kthread_should_stop())
5658 err = thread_fn(kvm, data);
5659
5660 return err;
5661}
5662
5663int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5664 uintptr_t data, const char *name,
5665 struct task_struct **thread_ptr)
5666{
5667 struct kvm_vm_worker_thread_context init_context = {};
5668 struct task_struct *thread;
5669
5670 *thread_ptr = NULL;
5671 init_context.kvm = kvm;
5672 init_context.parent = current;
5673 init_context.thread_fn = thread_fn;
5674 init_context.data = data;
5675 init_completion(&init_context.init_done);
5676
5677 thread = kthread_run(kvm_vm_worker_thread, &init_context,
5678 "%s-%d", name, task_pid_nr(current));
5679 if (IS_ERR(thread))
5680 return PTR_ERR(thread);
5681
5682 /* kthread_run() is never supposed to return NULL. */
5683 WARN_ON(thread == NULL);
5684
5685 wait_for_completion(&init_context.init_done);
5686
5687 if (!init_context.err)
5688 *thread_ptr = thread;
5689
5690 return init_context.err;
5691}
5692