// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "mmu_lock.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   opened has been passed to a sibling 32bit task, they will
 *   receive a -EINVAL if they try to use the compat ioctl.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}
162
163bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
171 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
172 return false;
173
174 return is_zone_device_page(pfn_to_page(pfn));
175}
176
177bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * MM states.
	 */
184 if (pfn_valid(pfn))
185 return PageReserved(pfn_to_page(pfn)) &&
186 !is_zero_pfn(pfn) &&
187 !kvm_is_zone_device_pfn(pfn);
188
189 return true;
190}
191
192bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
193{
194 struct page *page = pfn_to_page(pfn);
195
196 if (!PageTransCompoundMap(page))
197 return false;
198
199 return is_transparent_hugepage(compound_head(page));
200}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
205void vcpu_load(struct kvm_vcpu *vcpu)
206{
207 int cpu = get_cpu();
208
209 __this_cpu_write(kvm_running_vcpu, vcpu);
210 preempt_notifier_register(&vcpu->preempt_notifier);
211 kvm_arch_vcpu_load(vcpu, cpu);
212 put_cpu();
213}
214EXPORT_SYMBOL_GPL(vcpu_load);
215
216void vcpu_put(struct kvm_vcpu *vcpu)
217{
218 preempt_disable();
219 kvm_arch_vcpu_put(vcpu);
220 preempt_notifier_unregister(&vcpu->preempt_notifier);
221 __this_cpu_write(kvm_running_vcpu, NULL);
222 preempt_enable();
223}
224EXPORT_SYMBOL_GPL(vcpu_put);
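/*
 * A minimal usage sketch: callers bracket work that needs the vCPU state
 * resident on the current physical CPU with vcpu_load()/vcpu_put(), e.g. an
 * ioctl handler might do
 *
 *	vcpu_load(vcpu);
 *	r = kvm_arch_vcpu_ioctl_get_regs(vcpu, regs);
 *	vcpu_put(vcpu);
 *
 * The preempt notifier registered by vcpu_load() reloads the state whenever
 * the task is preempted and later rescheduled in between.
 */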

/* TODO: merge with kvm_arch_vcpu_should_kick */
227static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
228{
229 int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
235 if (req & KVM_REQUEST_WAIT)
236 return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
241 return mode == IN_GUEST_MODE;
242}
243
244static void ack_flush(void *_completed)
245{
246}
247
248static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
249{
250 if (unlikely(!cpus))
251 cpus = cpu_online_mask;
252
253 if (cpumask_empty(cpus))
254 return false;
255
256 smp_call_function_many(cpus, ack_flush, NULL, wait);
257 return true;
258}
259
260bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
261 struct kvm_vcpu *except,
262 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
263{
264 int i, cpu, me;
265 struct kvm_vcpu *vcpu;
266 bool called;
267
268 me = get_cpu();
269
270 kvm_for_each_vcpu(i, vcpu, kvm) {
271 if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
272 vcpu == except)
273 continue;
274
275 kvm_make_request(req, vcpu);
276 cpu = vcpu->cpu;
277
278 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
279 continue;
280
281 if (tmp != NULL && cpu != -1 && cpu != me &&
282 kvm_request_needs_ipi(vcpu, req))
283 __cpumask_set_cpu(cpu, tmp);
284 }
285
286 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
287 put_cpu();
288
289 return called;
290}
291
292bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
293 struct kvm_vcpu *except)
294{
295 cpumask_var_t cpus;
296 bool called;
297
298 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
299
300 called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
301
302 free_cpumask_var(cpus);
303 return called;
304}
305
306bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
307{
308 return kvm_make_all_cpus_request_except(kvm, req, NULL);
309}
310EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
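/*
 * A minimal usage sketch: a caller that needs every vCPU to act sets a
 * request and, where required, kicks the vCPUs out of guest mode, e.g.
 *
 *	kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH);
 *
 * Each vCPU then notices the request via kvm_check_request() on its next
 * entry to the guest and performs the flush there.
 */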
311
312#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
313void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	/*
	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
	 * kvm_make_all_cpus_request.
	 */
319 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode.  Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode.  We reuse that
	 * barrier here.
	 */
332 if (!kvm_arch_flush_remote_tlb(kvm)
333 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
334 ++kvm->stat.generic.remote_tlb_flush;
335 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
336}
337EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
338#endif
339
340void kvm_reload_remote_mmus(struct kvm *kvm)
341{
342 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
343}
344
345#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
346static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
347 gfp_t gfp_flags)
348{
349 gfp_flags |= mc->gfp_zero;
350
351 if (mc->kmem_cache)
352 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
353 else
354 return (void *)__get_free_page(gfp_flags);
355}
356
357int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
358{
359 void *obj;
360
361 if (mc->nobjs >= min)
362 return 0;
363 while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
364 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
365 if (!obj)
366 return mc->nobjs >= min ? 0 : -ENOMEM;
367 mc->objects[mc->nobjs++] = obj;
368 }
369 return 0;
370}
371
372int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
373{
374 return mc->nobjs;
375}
376
377void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
378{
379 while (mc->nobjs) {
380 if (mc->kmem_cache)
381 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
382 else
383 free_page((unsigned long)mc->objects[--mc->nobjs]);
384 }
385}
386
387void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
388{
389 void *p;
390
391 if (WARN_ON(!mc->nobjs))
392 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
393 else
394 p = mc->objects[--mc->nobjs];
395 BUG_ON(!p);
396 return p;
397}
398#endif
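/*
 * A minimal usage sketch (the cache field named below is the x86 instance and
 * is only an example; other architectures keep their own kvm_mmu_memory_cache
 * objects): the page-fault path tops up the cache while sleeping is still
 * allowed, then draws objects once the MMU lock is held, e.g.
 *
 *	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 4);
 *	if (r)
 *		return r;
 *	...
 *	desc = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 */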
399
400static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
401{
402 mutex_init(&vcpu->mutex);
403 vcpu->cpu = -1;
404 vcpu->kvm = kvm;
405 vcpu->vcpu_id = id;
406 vcpu->pid = NULL;
407 rcuwait_init(&vcpu->wait);
408 kvm_async_pf_vcpu_init(vcpu);
409
410 vcpu->pre_pcpu = -1;
411 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
412
413 kvm_vcpu_set_in_spin_loop(vcpu, false);
414 kvm_vcpu_set_dy_eligible(vcpu, false);
415 vcpu->preempted = false;
416 vcpu->ready = false;
417 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
418}
419
420void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
421{
422 kvm_dirty_ring_free(&vcpu->dirty_ring);
423 kvm_arch_vcpu_destroy(vcpu);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
430 put_pid(rcu_dereference_protected(vcpu->pid, 1));
431
432 free_page((unsigned long)vcpu->run);
433 kmem_cache_free(kvm_vcpu_cache, vcpu);
434}
435EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
436
437#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
438static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
439{
440 return container_of(mn, struct kvm, mmu_notifier);
441}
442
443static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
444 struct mm_struct *mm,
445 unsigned long start, unsigned long end)
446{
447 struct kvm *kvm = mmu_notifier_to_kvm(mn);
448 int idx;
449
450 idx = srcu_read_lock(&kvm->srcu);
451 kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
452 srcu_read_unlock(&kvm->srcu, idx);
453}
454
455typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
456
457typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
458 unsigned long end);
459
460struct kvm_hva_range {
461 unsigned long start;
462 unsigned long end;
463 pte_t pte;
464 hva_handler_t handler;
465 on_lock_fn_t on_lock;
466 bool flush_on_ret;
467 bool may_block;
468};

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
477static void kvm_null_fn(void)
478{
479
480}
481#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
482
483static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
484 const struct kvm_hva_range *range)
485{
486 bool ret = false, locked = false;
487 struct kvm_gfn_range gfn_range;
488 struct kvm_memory_slot *slot;
489 struct kvm_memslots *slots;
490 int i, idx;

	/* A null handler is allowed if and only if on_lock() is provided. */
493 if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
494 IS_KVM_NULL_FN(range->handler)))
495 return 0;
496
497 idx = srcu_read_lock(&kvm->srcu);
498
499
500 if (!IS_KVM_NULL_FN(range->on_lock)) {
501 locked = true;
502 KVM_MMU_LOCK(kvm);
503
504 range->on_lock(kvm, range->start, range->end);
505
506 if (IS_KVM_NULL_FN(range->handler))
507 goto out_unlock;
508 }
509
510 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
511 slots = __kvm_memslots(kvm, i);
512 kvm_for_each_memslot(slot, slots) {
513 unsigned long hva_start, hva_end;
514
515 hva_start = max(range->start, slot->userspace_addr);
516 hva_end = min(range->end, slot->userspace_addr +
517 (slot->npages << PAGE_SHIFT));
518 if (hva_start >= hva_end)
519 continue;

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
527 gfn_range.pte = range->pte;
528 gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
534 gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
535 gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
536 gfn_range.slot = slot;
537
538 if (!locked) {
539 locked = true;
540 KVM_MMU_LOCK(kvm);
541 }
542 ret |= range->handler(kvm, &gfn_range);
543 }
544 }
545
546 if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
547 kvm_flush_remote_tlbs(kvm);
548
549out_unlock:
550 if (locked)
551 KVM_MMU_UNLOCK(kvm);
552
553 srcu_read_unlock(&kvm->srcu, idx);

	/* The notifiers are averse to booleans. :-( */
556 return (int)ret;
557}
558
559static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
560 unsigned long start,
561 unsigned long end,
562 pte_t pte,
563 hva_handler_t handler)
564{
565 struct kvm *kvm = mmu_notifier_to_kvm(mn);
566 const struct kvm_hva_range range = {
567 .start = start,
568 .end = end,
569 .pte = pte,
570 .handler = handler,
571 .on_lock = (void *)kvm_null_fn,
572 .flush_on_ret = true,
573 .may_block = false,
574 };
575
576 return __kvm_handle_hva_range(kvm, &range);
577}
578
579static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
580 unsigned long start,
581 unsigned long end,
582 hva_handler_t handler)
583{
584 struct kvm *kvm = mmu_notifier_to_kvm(mn);
585 const struct kvm_hva_range range = {
586 .start = start,
587 .end = end,
588 .pte = __pte(0),
589 .handler = handler,
590 .on_lock = (void *)kvm_null_fn,
591 .flush_on_ret = false,
592 .may_block = false,
593 };
594
595 return __kvm_handle_hva_range(kvm, &range);
596}
597static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
598 struct mm_struct *mm,
599 unsigned long address,
600 pte_t pte)
601{
602 struct kvm *kvm = mmu_notifier_to_kvm(mn);
603
604 trace_kvm_set_spte_hva(address);

	/*
	 * .change_pte() must be surrounded by .invalidate_range_{start,end}()
	 * and thus always runs with an elevated mmu_notifier_count; WARN if
	 * that invariant is ever broken.
	 */
611 WARN_ON_ONCE(!kvm->mmu_notifier_count);
612
613 kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
614}
615
616static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
617 unsigned long end)
{
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
624 kvm->mmu_notifier_count++;
625 if (likely(kvm->mmu_notifier_count == 1)) {
626 kvm->mmu_notifier_range_start = start;
627 kvm->mmu_notifier_range_end = end;
628 } else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns.  Keep things simple and just find the minimal range
		 * which includes the current and new ranges.  As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
638 kvm->mmu_notifier_range_start =
639 min(kvm->mmu_notifier_range_start, start);
640 kvm->mmu_notifier_range_end =
641 max(kvm->mmu_notifier_range_end, end);
642 }
643}
644
645static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
646 const struct mmu_notifier_range *range)
647{
648 struct kvm *kvm = mmu_notifier_to_kvm(mn);
649 const struct kvm_hva_range hva_range = {
650 .start = range->start,
651 .end = range->end,
652 .pte = __pte(0),
653 .handler = kvm_unmap_gfn_range,
654 .on_lock = kvm_inc_notifier_count,
655 .flush_on_ret = true,
656 .may_block = mmu_notifier_range_blockable(range),
657 };
658
659 trace_kvm_unmap_hva_range(range->start, range->end);
660
661 __kvm_handle_hva_range(kvm, &hva_range);
662
663 return 0;
664}
665
666static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
667 unsigned long end)
{
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
674 kvm->mmu_notifier_seq++;
675 smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
681 kvm->mmu_notifier_count--;
682}
683
684static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
685 const struct mmu_notifier_range *range)
686{
687 struct kvm *kvm = mmu_notifier_to_kvm(mn);
688 const struct kvm_hva_range hva_range = {
689 .start = range->start,
690 .end = range->end,
691 .pte = __pte(0),
692 .handler = (void *)kvm_null_fn,
693 .on_lock = kvm_dec_notifier_count,
694 .flush_on_ret = false,
695 .may_block = mmu_notifier_range_blockable(range),
696 };
697
698 __kvm_handle_hva_range(kvm, &hva_range);
699
700 BUG_ON(kvm->mmu_notifier_count < 0);
701}
702
703static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
704 struct mm_struct *mm,
705 unsigned long start,
706 unsigned long end)
707{
708 trace_kvm_age_hva(start, end);
709
710 return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
711}
712
713static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
714 struct mm_struct *mm,
715 unsigned long start,
716 unsigned long end)
717{
718 trace_kvm_age_hva(start, end);
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733 return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
734}
735
736static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
737 struct mm_struct *mm,
738 unsigned long address)
739{
740 trace_kvm_test_age_hva(address);
741
742 return kvm_handle_hva_range_no_flush(mn, address, address + 1,
743 kvm_test_age_gfn);
744}
745
746static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
747 struct mm_struct *mm)
748{
749 struct kvm *kvm = mmu_notifier_to_kvm(mn);
750 int idx;
751
752 idx = srcu_read_lock(&kvm->srcu);
753 kvm_arch_flush_shadow_all(kvm);
754 srcu_read_unlock(&kvm->srcu, idx);
755}
756
757static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
758 .invalidate_range = kvm_mmu_notifier_invalidate_range,
759 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
760 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
761 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
762 .clear_young = kvm_mmu_notifier_clear_young,
763 .test_young = kvm_mmu_notifier_test_young,
764 .change_pte = kvm_mmu_notifier_change_pte,
765 .release = kvm_mmu_notifier_release,
766};
767
768static int kvm_init_mmu_notifier(struct kvm *kvm)
769{
770 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
771 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
772}
773
774#else
775
776static int kvm_init_mmu_notifier(struct kvm *kvm)
777{
778 return 0;
779}
780
781#endif
782
783#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
784static int kvm_pm_notifier_call(struct notifier_block *bl,
785 unsigned long state,
786 void *unused)
787{
788 struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
789
790 return kvm_arch_pm_notifier(kvm, state);
791}
792
793static void kvm_init_pm_notifier(struct kvm *kvm)
794{
795 kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
796
797 kvm->pm_notifier.priority = INT_MAX;
798 register_pm_notifier(&kvm->pm_notifier);
799}
800
801static void kvm_destroy_pm_notifier(struct kvm *kvm)
802{
803 unregister_pm_notifier(&kvm->pm_notifier);
804}
805#else
806static void kvm_init_pm_notifier(struct kvm *kvm)
807{
808}
809
810static void kvm_destroy_pm_notifier(struct kvm *kvm)
811{
812}
813#endif
814
815static struct kvm_memslots *kvm_alloc_memslots(void)
816{
817 int i;
818 struct kvm_memslots *slots;
819
820 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
821 if (!slots)
822 return NULL;
823
824 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
825 slots->id_to_index[i] = -1;
826
827 return slots;
828}
829
830static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
831{
832 if (!memslot->dirty_bitmap)
833 return;
834
835 kvfree(memslot->dirty_bitmap);
836 memslot->dirty_bitmap = NULL;
837}
838
839static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
840{
841 kvm_destroy_dirty_bitmap(slot);
842
843 kvm_arch_free_memslot(kvm, slot);
844
845 slot->flags = 0;
846 slot->npages = 0;
847}
848
849static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
850{
851 struct kvm_memory_slot *memslot;
852
853 if (!slots)
854 return;
855
856 kvm_for_each_memslot(memslot, slots)
857 kvm_free_memslot(kvm, memslot);
858
859 kvfree(slots);
860}
861
862static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
863{
864 switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
865 case KVM_STATS_TYPE_INSTANT:
866 return 0444;
867 case KVM_STATS_TYPE_CUMULATIVE:
868 case KVM_STATS_TYPE_PEAK:
869 default:
870 return 0644;
871 }
872}
873
874
875static void kvm_destroy_vm_debugfs(struct kvm *kvm)
876{
877 int i;
878 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
879 kvm_vcpu_stats_header.num_desc;
880
881 if (!kvm->debugfs_dentry)
882 return;
883
884 debugfs_remove_recursive(kvm->debugfs_dentry);
885
886 if (kvm->debugfs_stat_data) {
887 for (i = 0; i < kvm_debugfs_num_entries; i++)
888 kfree(kvm->debugfs_stat_data[i]);
889 kfree(kvm->debugfs_stat_data);
890 }
891}
892
893static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
894{
895 static DEFINE_MUTEX(kvm_debugfs_lock);
896 struct dentry *dent;
897 char dir_name[ITOA_MAX_LEN * 2];
898 struct kvm_stat_data *stat_data;
899 const struct _kvm_stats_desc *pdesc;
900 int i;
901 int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
902 kvm_vcpu_stats_header.num_desc;
903
904 if (!debugfs_initialized())
905 return 0;
906
907 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
908 mutex_lock(&kvm_debugfs_lock);
909 dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
910 if (dent) {
911 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
912 dput(dent);
913 mutex_unlock(&kvm_debugfs_lock);
914 return 0;
915 }
916 dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
917 mutex_unlock(&kvm_debugfs_lock);
918 if (IS_ERR(dent))
919 return 0;
920
921 kvm->debugfs_dentry = dent;
922 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
923 sizeof(*kvm->debugfs_stat_data),
924 GFP_KERNEL_ACCOUNT);
925 if (!kvm->debugfs_stat_data)
926 return -ENOMEM;
927
928 for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
929 pdesc = &kvm_vm_stats_desc[i];
930 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
931 if (!stat_data)
932 return -ENOMEM;
933
934 stat_data->kvm = kvm;
935 stat_data->desc = pdesc;
936 stat_data->kind = KVM_STAT_VM;
937 kvm->debugfs_stat_data[i] = stat_data;
938 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
939 kvm->debugfs_dentry, stat_data,
940 &stat_fops_per_vm);
941 }
942
943 for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
944 pdesc = &kvm_vcpu_stats_desc[i];
945 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
946 if (!stat_data)
947 return -ENOMEM;
948
949 stat_data->kvm = kvm;
950 stat_data->desc = pdesc;
951 stat_data->kind = KVM_STAT_VCPU;
952 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
953 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
954 kvm->debugfs_dentry, stat_data,
955 &stat_fops_per_vm);
956 }
957 return 0;
958}

/*
 * Called after the VM is otherwise initialized, but only just before
 * adding it to the vm_list.
 */
964int __weak kvm_arch_post_init_vm(struct kvm *kvm)
965{
966 return 0;
967}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
973void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
974{
975}
976
977static struct kvm *kvm_create_vm(unsigned long type)
978{
979 struct kvm *kvm = kvm_arch_alloc_vm();
980 int r = -ENOMEM;
981 int i;
982
983 if (!kvm)
984 return ERR_PTR(-ENOMEM);
985
986 KVM_MMU_LOCK_INIT(kvm);
987 mmgrab(current->mm);
988 kvm->mm = current->mm;
989 kvm_eventfd_init(kvm);
990 mutex_init(&kvm->lock);
991 mutex_init(&kvm->irq_lock);
992 mutex_init(&kvm->slots_lock);
993 mutex_init(&kvm->slots_arch_lock);
994 INIT_LIST_HEAD(&kvm->devices);
995
996 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
997
998 if (init_srcu_struct(&kvm->srcu))
999 goto out_err_no_srcu;
1000 if (init_srcu_struct(&kvm->irq_srcu))
1001 goto out_err_no_irq_srcu;
1002
1003 refcount_set(&kvm->users_count, 1);
1004 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1005 struct kvm_memslots *slots = kvm_alloc_memslots();
1006
1007 if (!slots)
1008 goto out_err_no_arch_destroy_vm;
1009
1010 slots->generation = i;
1011 rcu_assign_pointer(kvm->memslots[i], slots);
1012 }
1013
1014 for (i = 0; i < KVM_NR_BUSES; i++) {
1015 rcu_assign_pointer(kvm->buses[i],
1016 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1017 if (!kvm->buses[i])
1018 goto out_err_no_arch_destroy_vm;
1019 }
1020
1021 kvm->max_halt_poll_ns = halt_poll_ns;
1022
1023 r = kvm_arch_init_vm(kvm, type);
1024 if (r)
1025 goto out_err_no_arch_destroy_vm;
1026
1027 r = hardware_enable_all();
1028 if (r)
1029 goto out_err_no_disable;
1030
1031#ifdef CONFIG_HAVE_KVM_IRQFD
1032 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1033#endif
1034
1035 r = kvm_init_mmu_notifier(kvm);
1036 if (r)
1037 goto out_err_no_mmu_notifier;
1038
1039 r = kvm_arch_post_init_vm(kvm);
1040 if (r)
1041 goto out_err;
1042
1043 mutex_lock(&kvm_lock);
1044 list_add(&kvm->vm_list, &vm_list);
1045 mutex_unlock(&kvm_lock);
1046
1047 preempt_notifier_inc();
1048 kvm_init_pm_notifier(kvm);
1049
1050 return kvm;
1051
1052out_err:
1053#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1054 if (kvm->mmu_notifier.ops)
1055 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1056#endif
1057out_err_no_mmu_notifier:
1058 hardware_disable_all();
1059out_err_no_disable:
1060 kvm_arch_destroy_vm(kvm);
1061out_err_no_arch_destroy_vm:
1062 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1063 for (i = 0; i < KVM_NR_BUSES; i++)
1064 kfree(kvm_get_bus(kvm, i));
1065 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1066 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1067 cleanup_srcu_struct(&kvm->irq_srcu);
1068out_err_no_irq_srcu:
1069 cleanup_srcu_struct(&kvm->srcu);
1070out_err_no_srcu:
1071 kvm_arch_free_vm(kvm);
1072 mmdrop(current->mm);
1073 return ERR_PTR(r);
1074}
1075
1076static void kvm_destroy_devices(struct kvm *kvm)
1077{
1078 struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
1085 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1086 list_del(&dev->vm_node);
1087 dev->ops->destroy(dev);
1088 }
1089}
1090
1091static void kvm_destroy_vm(struct kvm *kvm)
1092{
1093 int i;
1094 struct mm_struct *mm = kvm->mm;
1095
1096 kvm_destroy_pm_notifier(kvm);
1097 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1098 kvm_destroy_vm_debugfs(kvm);
1099 kvm_arch_sync_events(kvm);
1100 mutex_lock(&kvm_lock);
1101 list_del(&kvm->vm_list);
1102 mutex_unlock(&kvm_lock);
1103 kvm_arch_pre_destroy_vm(kvm);
1104
1105 kvm_free_irq_routing(kvm);
1106 for (i = 0; i < KVM_NR_BUSES; i++) {
1107 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1108
1109 if (bus)
1110 kvm_io_bus_destroy(bus);
1111 kvm->buses[i] = NULL;
1112 }
1113 kvm_coalesced_mmio_free(kvm);
1114#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1115 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1116#else
1117 kvm_arch_flush_shadow_all(kvm);
1118#endif
1119 kvm_arch_destroy_vm(kvm);
1120 kvm_destroy_devices(kvm);
1121 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1122 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1123 cleanup_srcu_struct(&kvm->irq_srcu);
1124 cleanup_srcu_struct(&kvm->srcu);
1125 kvm_arch_free_vm(kvm);
1126 preempt_notifier_dec();
1127 hardware_disable_all();
1128 mmdrop(mm);
1129}
1130
1131void kvm_get_kvm(struct kvm *kvm)
1132{
1133 refcount_inc(&kvm->users_count);
1134}
1135EXPORT_SYMBOL_GPL(kvm_get_kvm);
1136
1137void kvm_put_kvm(struct kvm *kvm)
1138{
1139 if (refcount_dec_and_test(&kvm->users_count))
1140 kvm_destroy_vm(kvm);
1141}
1142EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Drop a reference that was taken on behalf of an object associated with a
 * user-visible file descriptor, e.g. a vfio device or an irqfd.  Such a put
 * must never be the last reference to the VM, as the VM's own file descriptor
 * holds a reference until it is released; hence the WARN if users_count would
 * reach zero here.
 */
1151void kvm_put_kvm_no_destroy(struct kvm *kvm)
1152{
1153 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1154}
1155EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1156
1157static int kvm_vm_release(struct inode *inode, struct file *filp)
1158{
1159 struct kvm *kvm = filp->private_data;
1160
1161 kvm_irqfd_release(kvm);
1162
1163 kvm_put_kvm(kvm);
1164 return 0;
1165}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
 */
1171static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1172{
1173 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1174
1175 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1176 if (!memslot->dirty_bitmap)
1177 return -ENOMEM;
1178
1179 return 0;
1180}

/*
 * Delete a memslot by decrementing the number of used slots and shifting all
 * other entries in the array forward one spot.
 */
1186static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1187 struct kvm_memory_slot *memslot)
1188{
1189 struct kvm_memory_slot *mslots = slots->memslots;
1190 int i;
1191
1192 if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1193 return;
1194
1195 slots->used_slots--;
1196
1197 if (atomic_read(&slots->lru_slot) >= slots->used_slots)
1198 atomic_set(&slots->lru_slot, 0);
1199
1200 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1201 mslots[i] = mslots[i + 1];
1202 slots->id_to_index[mslots[i].id] = i;
1203 }
1204 mslots[i] = *memslot;
1205 slots->id_to_index[memslot->id] = -1;
1206}

/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslots array.
 */
1212static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1213{
1214 return slots->used_slots++;
1215}
1216
1217
1218
1219
1220
1221
1222
1223
1224static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1225 struct kvm_memory_slot *memslot)
1226{
1227 struct kvm_memory_slot *mslots = slots->memslots;
1228 int i;
1229
1230 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1231 WARN_ON_ONCE(!slots->used_slots))
1232 return -1;
1233
1234
1235
1236
1237
1238
1239 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1240 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1241 break;
1242
1243 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1244
1245
1246 mslots[i] = mslots[i + 1];
1247 slots->id_to_index[mslots[i].id] = i;
1248 }
1249 return i;
1250}
1251
1252
1253
1254
1255
1256
1257
1258
1259static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1260 struct kvm_memory_slot *memslot,
1261 int start)
1262{
1263 struct kvm_memory_slot *mslots = slots->memslots;
1264 int i;
1265
1266 for (i = start; i > 0; i--) {
1267 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1268 break;
1269
1270 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1271
1272
1273 mslots[i] = mslots[i - 1];
1274 slots->id_to_index[mslots[i].id] = i;
1275 }
1276 return i;
1277}

/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows using a binary search during
 * memslot lookup.
 *
 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  The entry at
 * memslots[0] has the highest GFN.
 *
 * The helpers above take advantage of the array being initially sorted and of
 * knowing the changed memslot's old position: instead of a full re-sort, the
 * changed memslot is shifted backward (kvm_memslot_move_backward()) and/or
 * forward (kvm_memslot_move_forward()) until its GFN-sorted position is
 * found, and only then is it copied into the array:
 *
 *  - DELETE: the memslot is moved to the end of the array and the number of
 *    used slots is decremented.
 *
 *  - CREATE: the memslot is appended after the last used slot and then
 *    shifted forward to its sorted position.
 *
 *  - MOVE: the memslot may need to shift in either direction, as its base
 *    GFN may have increased or decreased relative to the other slots.
 */
1320static void update_memslots(struct kvm_memslots *slots,
1321 struct kvm_memory_slot *memslot,
1322 enum kvm_mr_change change)
1323{
1324 int i;
1325
1326 if (change == KVM_MR_DELETE) {
1327 kvm_memslot_delete(slots, memslot);
1328 } else {
1329 if (change == KVM_MR_CREATE)
1330 i = kvm_memslot_insert_back(slots);
1331 else
1332 i = kvm_memslot_move_backward(slots, memslot);
1333 i = kvm_memslot_move_forward(slots, memslot, i);
1334
1335
1336
1337
1338
1339 slots->memslots[i] = *memslot;
1340 slots->id_to_index[memslot->id] = i;
1341 }
1342}
1343
1344static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1345{
1346 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1347
1348#ifdef __KVM_HAVE_READONLY_MEM
1349 valid_flags |= KVM_MEM_READONLY;
1350#endif
1351
1352 if (mem->flags & ~valid_flags)
1353 return -EINVAL;
1354
1355 return 0;
1356}
1357
1358static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1359 int as_id, struct kvm_memslots *slots)
1360{
1361 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1362 u64 gen = old_memslots->generation;
1363
1364 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1365 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1366
1367 rcu_assign_pointer(kvm->memslots[as_id], slots);

	/*
	 * Acquired in kvm_set_memslot.  Must be released before the
	 * synchronize SRCU below in order to avoid deadlock with another
	 * thread acquiring the slots_arch_lock in an srcu critical section.
	 */
1374 mutex_unlock(&kvm->slots_arch_lock);
1375
1376 synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
1384 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1385
1386
1387
1388
1389
1390
1391
1392
1393 gen += KVM_ADDRESS_SPACE_NUM;
1394
1395 kvm_arch_memslots_updated(kvm, gen);
1396
1397 slots->generation = gen;
1398
1399 return old_memslots;
1400}
1401
1402static size_t kvm_memslots_size(int slots)
1403{
1404 return sizeof(struct kvm_memslots) +
1405 (sizeof(struct kvm_memory_slot) * slots);
1406}
1407
1408static void kvm_copy_memslots(struct kvm_memslots *to,
1409 struct kvm_memslots *from)
1410{
1411 memcpy(to, from, kvm_memslots_size(from->used_slots));
1412}
1413
1414
1415
1416
1417
1418
1419static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1420 enum kvm_mr_change change)
1421{
1422 struct kvm_memslots *slots;
1423 size_t new_size;
1424
1425 if (change == KVM_MR_CREATE)
1426 new_size = kvm_memslots_size(old->used_slots + 1);
1427 else
1428 new_size = kvm_memslots_size(old->used_slots);
1429
1430 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1431 if (likely(slots))
1432 kvm_copy_memslots(slots, old);
1433
1434 return slots;
1435}
1436
1437static int kvm_set_memslot(struct kvm *kvm,
1438 const struct kvm_userspace_memory_region *mem,
1439 struct kvm_memory_slot *old,
1440 struct kvm_memory_slot *new, int as_id,
1441 enum kvm_mr_change change)
1442{
1443 struct kvm_memory_slot *slot;
1444 struct kvm_memslots *slots;
1445 int r;

	/*
	 * Released in install_new_memslots.
	 *
	 * Must be held from before the current memslots are copied until
	 * after the new memslots are installed with rcu_assign_pointer,
	 * then released before the synchronize srcu in install_new_memslots.
	 *
	 * When modifying memslots outside of the slots_lock, must be held
	 * before reading the pointer to the current memslots until after all
	 * changes to those memslots are complete.
	 *
	 * These rules ensure that installing new memslots does not lose
	 * changes made to the previous memslots.
	 */
1461 mutex_lock(&kvm->slots_arch_lock);
1462
1463 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1464 if (!slots) {
1465 mutex_unlock(&kvm->slots_arch_lock);
1466 return -ENOMEM;
1467 }
1468
1469 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1470
1471
1472
1473
1474 slot = id_to_memslot(slots, old->id);
1475 slot->flags |= KVM_MEMSLOT_INVALID;
1476
1477
1478
1479
1480
1481
1482 slots = install_new_memslots(kvm, as_id, slots);
1483
1484
1485
1486
1487
1488
1489
1490
1491 kvm_arch_flush_shadow_memslot(kvm, slot);
1492
1493
1494 mutex_lock(&kvm->slots_arch_lock);
1495
1496
1497
1498
1499
1500
1501
1502 kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
1503 }
1504
1505 r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1506 if (r)
1507 goto out_slots;
1508
1509 update_memslots(slots, new, change);
1510 slots = install_new_memslots(kvm, as_id, slots);
1511
1512 kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1513
1514 kvfree(slots);
1515 return 0;
1516
1517out_slots:
1518 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1519 slot = id_to_memslot(slots, old->id);
1520 slot->flags &= ~KVM_MEMSLOT_INVALID;
1521 slots = install_new_memslots(kvm, as_id, slots);
1522 } else {
1523 mutex_unlock(&kvm->slots_arch_lock);
1524 }
1525 kvfree(slots);
1526 return r;
1527}
1528
1529static int kvm_delete_memslot(struct kvm *kvm,
1530 const struct kvm_userspace_memory_region *mem,
1531 struct kvm_memory_slot *old, int as_id)
1532{
1533 struct kvm_memory_slot new;
1534 int r;
1535
1536 if (!old->npages)
1537 return -EINVAL;
1538
1539 memset(&new, 0, sizeof(new));
1540 new.id = old->id;
1541
	/*
	 * This is only for debugging purpose; it should never be referenced
	 * for a removed memslot.
	 */
1545 new.as_id = as_id;
1546
1547 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1548 if (r)
1549 return r;
1550
1551 kvm_free_memslot(kvm, old);
1552 return 0;
1553}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
1563int __kvm_set_memory_region(struct kvm *kvm,
1564 const struct kvm_userspace_memory_region *mem)
1565{
1566 struct kvm_memory_slot old, new;
1567 struct kvm_memory_slot *tmp;
1568 enum kvm_mr_change change;
1569 int as_id, id;
1570 int r;
1571
1572 r = check_memory_region_flags(mem);
1573 if (r)
1574 return r;
1575
1576 as_id = mem->slot >> 16;
1577 id = (u16)mem->slot;
1578
	/* General sanity checks */
1580 if (mem->memory_size & (PAGE_SIZE - 1))
1581 return -EINVAL;
1582 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1583 return -EINVAL;
1584
1585 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1586 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1587 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1588 mem->memory_size))
1589 return -EINVAL;
1590 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1591 return -EINVAL;
1592 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1593 return -EINVAL;

	/*
	 * Make a full copy of the old memslot, the pointer will become stale
	 * when the memslots are re-sorted by update_memslots(), and the old
	 * memslot needs to be referenced after calling update_memslots(), e.g.
	 * to free its resources and for arch specific behavior.
	 */
1601 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1602 if (tmp) {
1603 old = *tmp;
1604 tmp = NULL;
1605 } else {
1606 memset(&old, 0, sizeof(old));
1607 old.id = id;
1608 }
1609
1610 if (!mem->memory_size)
1611 return kvm_delete_memslot(kvm, mem, &old, as_id);
1612
1613 new.as_id = as_id;
1614 new.id = id;
1615 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1616 new.npages = mem->memory_size >> PAGE_SHIFT;
1617 new.flags = mem->flags;
1618 new.userspace_addr = mem->userspace_addr;
1619
1620 if (new.npages > KVM_MEM_MAX_NR_PAGES)
1621 return -EINVAL;
1622
1623 if (!old.npages) {
1624 change = KVM_MR_CREATE;
1625 new.dirty_bitmap = NULL;
1626 memset(&new.arch, 0, sizeof(new.arch));
1627 } else {
1628 if ((new.userspace_addr != old.userspace_addr) ||
1629 (new.npages != old.npages) ||
1630 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1631 return -EINVAL;
1632
1633 if (new.base_gfn != old.base_gfn)
1634 change = KVM_MR_MOVE;
1635 else if (new.flags != old.flags)
1636 change = KVM_MR_FLAGS_ONLY;
1637 else
1638 return 0;
1639
		/* Copy dirty_bitmap and arch from the current memslot. */
1641 new.dirty_bitmap = old.dirty_bitmap;
1642 memcpy(&new.arch, &old.arch, sizeof(new.arch));
1643 }
1644
1645 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
1647 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1648 if (tmp->id == id)
1649 continue;
1650 if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1651 (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1652 return -EEXIST;
1653 }
1654 }
1655
	/* Allocate/free page dirty bitmap as needed */
1657 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1658 new.dirty_bitmap = NULL;
1659 else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
1660 r = kvm_alloc_dirty_bitmap(&new);
1661 if (r)
1662 return r;
1663
1664 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1665 bitmap_set(new.dirty_bitmap, 0, new.npages);
1666 }
1667
1668 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
1669 if (r)
1670 goto out_bitmap;
1671
1672 if (old.dirty_bitmap && !new.dirty_bitmap)
1673 kvm_destroy_dirty_bitmap(&old);
1674 return 0;
1675
1676out_bitmap:
1677 if (new.dirty_bitmap && !old.dirty_bitmap)
1678 kvm_destroy_dirty_bitmap(&new);
1679 return r;
1680}
1681EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1682
1683int kvm_set_memory_region(struct kvm *kvm,
1684 const struct kvm_userspace_memory_region *mem)
1685{
1686 int r;
1687
1688 mutex_lock(&kvm->slots_lock);
1689 r = __kvm_set_memory_region(kvm, mem);
1690 mutex_unlock(&kvm->slots_lock);
1691 return r;
1692}
1693EXPORT_SYMBOL_GPL(kvm_set_memory_region);
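/*
 * A minimal usage sketch: userspace reaches this path through the
 * KVM_SET_USER_MEMORY_REGION ioctl on the VM file descriptor, e.g.
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot = 0,
 *		.flags = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = gpa,
 *		.memory_size = size,
 *		.userspace_addr = (__u64)(unsigned long)host_mem,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 *
 * Passing memory_size == 0 for an existing slot deletes it (see
 * kvm_delete_memslot() above).
 */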
1694
1695static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1696 struct kvm_userspace_memory_region *mem)
1697{
1698 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1699 return -EINVAL;
1700
1701 return kvm_set_memory_region(kvm, mem);
1702}
1703
1704#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
1712int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1713 int *is_dirty, struct kvm_memory_slot **memslot)
1714{
1715 struct kvm_memslots *slots;
1716 int i, as_id, id;
1717 unsigned long n;
1718 unsigned long any = 0;
1719
	/* Dirty ring tracking is exclusive to dirty log tracking */
1721 if (kvm->dirty_ring_size)
1722 return -ENXIO;
1723
1724 *memslot = NULL;
1725 *is_dirty = 0;
1726
1727 as_id = log->slot >> 16;
1728 id = (u16)log->slot;
1729 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1730 return -EINVAL;
1731
1732 slots = __kvm_memslots(kvm, as_id);
1733 *memslot = id_to_memslot(slots, id);
1734 if (!(*memslot) || !(*memslot)->dirty_bitmap)
1735 return -ENOENT;
1736
1737 kvm_arch_sync_dirty_log(kvm, *memslot);
1738
1739 n = kvm_dirty_bitmap_bytes(*memslot);
1740
1741 for (i = 0; !any && i < n/sizeof(long); ++i)
1742 any = (*memslot)->dirty_bitmap[i];
1743
1744 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1745 return -EFAULT;
1746
1747 if (any)
1748 *is_dirty = 1;
1749 return 0;
1750}
1751EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1752
1753#else
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently.  So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Upon return, the caller flushes the TLBs if needed.
 *
 * Between 2 and 4, the guest may write to the page using the remaining TLB
 * entry.  This is not a problem because the page is reported dirty using
 * the snapshot taken before and step 4 ensures that writes done after
 * exiting to userspace will be logged for the next call.
 */
1775static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1776{
1777 struct kvm_memslots *slots;
1778 struct kvm_memory_slot *memslot;
1779 int i, as_id, id;
1780 unsigned long n;
1781 unsigned long *dirty_bitmap;
1782 unsigned long *dirty_bitmap_buffer;
1783 bool flush;
1784
	/* Dirty ring tracking is exclusive to dirty log tracking */
1786 if (kvm->dirty_ring_size)
1787 return -ENXIO;
1788
1789 as_id = log->slot >> 16;
1790 id = (u16)log->slot;
1791 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1792 return -EINVAL;
1793
1794 slots = __kvm_memslots(kvm, as_id);
1795 memslot = id_to_memslot(slots, id);
1796 if (!memslot || !memslot->dirty_bitmap)
1797 return -ENOENT;
1798
1799 dirty_bitmap = memslot->dirty_bitmap;
1800
1801 kvm_arch_sync_dirty_log(kvm, memslot);
1802
1803 n = kvm_dirty_bitmap_bytes(memslot);
1804 flush = false;
1805 if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike kvm_get_dirty_log, we always return false in *flush,
		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
		 * is some code duplication between this function and
		 * kvm_get_dirty_log, but hopefully all architectures
		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
		 * can be eliminated.
		 */
1814 dirty_bitmap_buffer = dirty_bitmap;
1815 } else {
1816 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1817 memset(dirty_bitmap_buffer, 0, n);
1818
1819 KVM_MMU_LOCK(kvm);
1820 for (i = 0; i < n / sizeof(long); i++) {
1821 unsigned long mask;
1822 gfn_t offset;
1823
1824 if (!dirty_bitmap[i])
1825 continue;
1826
1827 flush = true;
1828 mask = xchg(&dirty_bitmap[i], 0);
1829 dirty_bitmap_buffer[i] = mask;
1830
1831 offset = i * BITS_PER_LONG;
1832 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1833 offset, mask);
1834 }
1835 KVM_MMU_UNLOCK(kvm);
1836 }
1837
1838 if (flush)
1839 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1840
1841 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1842 return -EFAULT;
1843 return 0;
1844}
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1867 struct kvm_dirty_log *log)
1868{
1869 int r;
1870
1871 mutex_lock(&kvm->slots_lock);
1872
1873 r = kvm_get_dirty_log_protect(kvm, log);
1874
1875 mutex_unlock(&kvm->slots_lock);
1876 return r;
1877}
1878

/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
1885static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1886 struct kvm_clear_dirty_log *log)
1887{
1888 struct kvm_memslots *slots;
1889 struct kvm_memory_slot *memslot;
1890 int as_id, id;
1891 gfn_t offset;
1892 unsigned long i, n;
1893 unsigned long *dirty_bitmap;
1894 unsigned long *dirty_bitmap_buffer;
1895 bool flush;
1896
	/* Dirty ring tracking is exclusive to dirty log tracking */
1898 if (kvm->dirty_ring_size)
1899 return -ENXIO;
1900
1901 as_id = log->slot >> 16;
1902 id = (u16)log->slot;
1903 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1904 return -EINVAL;
1905
1906 if (log->first_page & 63)
1907 return -EINVAL;
1908
1909 slots = __kvm_memslots(kvm, as_id);
1910 memslot = id_to_memslot(slots, id);
1911 if (!memslot || !memslot->dirty_bitmap)
1912 return -ENOENT;
1913
1914 dirty_bitmap = memslot->dirty_bitmap;
1915
1916 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
1917
1918 if (log->first_page > memslot->npages ||
1919 log->num_pages > memslot->npages - log->first_page ||
1920 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
1921 return -EINVAL;
1922
1923 kvm_arch_sync_dirty_log(kvm, memslot);
1924
1925 flush = false;
1926 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1927 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
1928 return -EFAULT;
1929
1930 KVM_MMU_LOCK(kvm);
1931 for (offset = log->first_page, i = offset / BITS_PER_LONG,
1932 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
1933 i++, offset += BITS_PER_LONG) {
1934 unsigned long mask = *dirty_bitmap_buffer++;
1935 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
1936 if (!mask)
1937 continue;
1938
1939 mask &= atomic_long_fetch_andnot(mask, p);
1940
1941
1942
1943
1944
1945
1946
1947 if (mask) {
1948 flush = true;
1949 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1950 offset, mask);
1951 }
1952 }
1953 KVM_MMU_UNLOCK(kvm);
1954
1955 if (flush)
1956 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1957
1958 return 0;
1959}
1960
1961static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
1962 struct kvm_clear_dirty_log *log)
1963{
1964 int r;
1965
1966 mutex_lock(&kvm->slots_lock);
1967
1968 r = kvm_clear_dirty_log_protect(kvm, log);
1969
1970 mutex_unlock(&kvm->slots_lock);
1971 return r;
1972}
1973#endif
1974
1975struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1976{
1977 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
1978}
1979EXPORT_SYMBOL_GPL(gfn_to_memslot);
1980
1981struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
1982{
1983 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
1984}
1985EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
1986
1987bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
1988{
1989 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
1990
1991 return kvm_is_visible_memslot(memslot);
1992}
1993EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1994
1995bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1996{
1997 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1998
1999 return kvm_is_visible_memslot(memslot);
2000}
2001EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2002
2003unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2004{
2005 struct vm_area_struct *vma;
2006 unsigned long addr, size;
2007
2008 size = PAGE_SIZE;
2009
2010 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2011 if (kvm_is_error_hva(addr))
2012 return PAGE_SIZE;
2013
2014 mmap_read_lock(current->mm);
2015 vma = find_vma(current->mm, addr);
2016 if (!vma)
2017 goto out;
2018
2019 size = vma_kernel_pagesize(vma);
2020
2021out:
2022 mmap_read_unlock(current->mm);
2023
2024 return size;
2025}
2026
2027static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2028{
2029 return slot->flags & KVM_MEM_READONLY;
2030}
2031
2032static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2033 gfn_t *nr_pages, bool write)
2034{
2035 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2036 return KVM_HVA_ERR_BAD;
2037
2038 if (memslot_is_readonly(slot) && write)
2039 return KVM_HVA_ERR_RO_BAD;
2040
2041 if (nr_pages)
2042 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2043
2044 return __gfn_to_hva_memslot(slot, gfn);
2045}
2046
2047static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2048 gfn_t *nr_pages)
2049{
2050 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2051}
2052
2053unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2054 gfn_t gfn)
2055{
2056 return gfn_to_hva_many(slot, gfn, NULL);
2057}
2058EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2059
2060unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2061{
2062 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2063}
2064EXPORT_SYMBOL_GPL(gfn_to_hva);
2065
2066unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2067{
2068 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2069}
2070EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Return the hva of a @gfn and the R/W attribute if possible.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
2080unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2081 gfn_t gfn, bool *writable)
2082{
2083 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2084
2085 if (!kvm_is_error_hva(hva) && writable)
2086 *writable = !memslot_is_readonly(slot);
2087
2088 return hva;
2089}
2090
2091unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2092{
2093 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2094
2095 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2096}
2097
2098unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2099{
2100 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2101
2102 return gfn_to_hva_memslot_prot(slot, gfn, writable);
2103}
2104
2105static inline int check_user_page_hwpoison(unsigned long addr)
2106{
2107 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2108
2109 rc = get_user_pages(addr, 1, flags, NULL, NULL);
2110 return rc == -EHWPOISON;
2111}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that runs if we can in atomic context.
 */
2118static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2119 bool *writable, kvm_pfn_t *pfn)
2120{
2121 struct page *page[1];

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
2128 if (!(write_fault || writable))
2129 return false;
2130
2131 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2132 *pfn = page_to_pfn(page[0]);
2133
2134 if (writable)
2135 *writable = true;
2136 return true;
2137 }
2138
2139 return false;
2140}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
2146static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2147 bool *writable, kvm_pfn_t *pfn)
2148{
2149 unsigned int flags = FOLL_HWPOISON;
2150 struct page *page;
2151 int npages = 0;
2152
2153 might_sleep();
2154
2155 if (writable)
2156 *writable = write_fault;
2157
2158 if (write_fault)
2159 flags |= FOLL_WRITE;
2160 if (async)
2161 flags |= FOLL_NOWAIT;
2162
2163 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2164 if (npages != 1)
2165 return npages;

	/* map read fault as writable if possible */
2168 if (unlikely(!write_fault) && writable) {
2169 struct page *wpage;
2170
2171 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2172 *writable = true;
2173 put_page(page);
2174 page = wpage;
2175 }
2176 }
2177 *pfn = page_to_pfn(page);
2178 return npages;
2179}
2180
2181static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2182{
2183 if (unlikely(!(vma->vm_flags & VM_READ)))
2184 return false;
2185
2186 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2187 return false;
2188
2189 return true;
2190}
2191
2192static int kvm_try_get_pfn(kvm_pfn_t pfn)
2193{
2194 if (kvm_is_reserved_pfn(pfn))
2195 return 1;
2196 return get_page_unless_zero(pfn_to_page(pfn));
2197}
2198
2199static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2200 unsigned long addr, bool *async,
2201 bool write_fault, bool *writable,
2202 kvm_pfn_t *p_pfn)
2203{
2204 kvm_pfn_t pfn;
2205 pte_t *ptep;
2206 spinlock_t *ptl;
2207 int r;
2208
2209 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2210 if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
2215 bool unlocked = false;
2216 r = fixup_user_fault(current->mm, addr,
2217 (write_fault ? FAULT_FLAG_WRITE : 0),
2218 &unlocked);
2219 if (unlocked)
2220 return -EAGAIN;
2221 if (r)
2222 return r;
2223
2224 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2225 if (r)
2226 return r;
2227 }
2228
2229 if (write_fault && !pte_write(*ptep)) {
2230 pfn = KVM_PFN_ERR_RO_FAULT;
2231 goto out;
2232 }
2233
2234 if (writable)
2235 *writable = pte_write(*ptep);
2236 pfn = pte_pfn(*ptep);
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255 if (!kvm_try_get_pfn(pfn))
2256 r = -EFAULT;
2257
2258out:
2259 pte_unmap_unlock(ptep, ptl);
2260 *p_pfn = pfn;
2261
2262 return r;
2263}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether this function can sleep
 * @async: whether this function need to wait IO complete if the
 *         host page is not in the memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 */
2279static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2280 bool write_fault, bool *writable)
2281{
2282 struct vm_area_struct *vma;
2283 kvm_pfn_t pfn = 0;
2284 int npages, r;

	/* we can do it either atomically or asynchronously, not both */
2287 BUG_ON(atomic && async);
2288
2289 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2290 return pfn;
2291
2292 if (atomic)
2293 return KVM_PFN_ERR_FAULT;
2294
2295 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2296 if (npages == 1)
2297 return pfn;
2298
2299 mmap_read_lock(current->mm);
2300 if (npages == -EHWPOISON ||
2301 (!async && check_user_page_hwpoison(addr))) {
2302 pfn = KVM_PFN_ERR_HWPOISON;
2303 goto exit;
2304 }
2305
2306retry:
2307 vma = vma_lookup(current->mm, addr);
2308
2309 if (vma == NULL)
2310 pfn = KVM_PFN_ERR_FAULT;
2311 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2312 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2313 if (r == -EAGAIN)
2314 goto retry;
2315 if (r < 0)
2316 pfn = KVM_PFN_ERR_FAULT;
2317 } else {
2318 if (async && vma_is_valid(vma, write_fault))
2319 *async = true;
2320 pfn = KVM_PFN_ERR_FAULT;
2321 }
2322exit:
2323 mmap_read_unlock(current->mm);
2324 return pfn;
2325}
2326
2327kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2328 bool atomic, bool *async, bool write_fault,
2329 bool *writable, hva_t *hva)
2330{
2331 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2332
2333 if (hva)
2334 *hva = addr;
2335
2336 if (addr == KVM_HVA_ERR_RO_BAD) {
2337 if (writable)
2338 *writable = false;
2339 return KVM_PFN_ERR_RO_FAULT;
2340 }
2341
2342 if (kvm_is_error_hva(addr)) {
2343 if (writable)
2344 *writable = false;
2345 return KVM_PFN_NOSLOT;
2346 }

	/* Do not map writable pfn in the readonly memslot. */
2349 if (writable && memslot_is_readonly(slot)) {
2350 *writable = false;
2351 writable = NULL;
2352 }
2353
2354 return hva_to_pfn(addr, atomic, async, write_fault,
2355 writable);
2356}
2357EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2358
2359kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2360 bool *writable)
2361{
2362 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2363 write_fault, writable, NULL);
2364}
2365EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2366
2367kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2368{
2369 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2370}
2371EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2372
2373kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2374{
2375 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2376}
2377EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2378
2379kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2380{
2381 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2382}
2383EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2384
2385kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2386{
2387 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2388}
2389EXPORT_SYMBOL_GPL(gfn_to_pfn);
2390
2391kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2392{
2393 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2394}
2395EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2396
2397int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2398 struct page **pages, int nr_pages)
2399{
2400 unsigned long addr;
2401 gfn_t entry = 0;
2402
2403 addr = gfn_to_hva_many(slot, gfn, &entry);
2404 if (kvm_is_error_hva(addr))
2405 return -1;
2406
2407 if (entry < nr_pages)
2408 return 0;
2409
2410 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2411}
2412EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2413
2414static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2415{
2416 if (is_error_noslot_pfn(pfn))
2417 return KVM_ERR_PTR_BAD_PAGE;
2418
2419 if (kvm_is_reserved_pfn(pfn)) {
2420 WARN_ON(1);
2421 return KVM_ERR_PTR_BAD_PAGE;
2422 }
2423
2424 return pfn_to_page(pfn);
2425}
2426
2427struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2428{
2429 kvm_pfn_t pfn;
2430
2431 pfn = gfn_to_pfn(kvm, gfn);
2432
2433 return kvm_pfn_to_page(pfn);
2434}
2435EXPORT_SYMBOL_GPL(gfn_to_page);
2436
2437void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
2438{
2439 if (pfn == 0)
2440 return;
2441
2442 if (cache)
2443 cache->pfn = cache->gfn = 0;
2444
2445 if (dirty)
2446 kvm_release_pfn_dirty(pfn);
2447 else
2448 kvm_release_pfn_clean(pfn);
2449}
2450
2451static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
2452 struct gfn_to_pfn_cache *cache, u64 gen)
2453{
2454 kvm_release_pfn(cache->pfn, cache->dirty, cache);
2455
2456 cache->pfn = gfn_to_pfn_memslot(slot, gfn);
2457 cache->gfn = gfn;
2458 cache->dirty = false;
2459 cache->generation = gen;
2460}
2461
2462static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
2463 struct kvm_host_map *map,
2464 struct gfn_to_pfn_cache *cache,
2465 bool atomic)
2466{
2467 kvm_pfn_t pfn;
2468 void *hva = NULL;
2469 struct page *page = KVM_UNMAPPED_PAGE;
2470 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
2471 u64 gen = slots->generation;
2472
2473 if (!map)
2474 return -EINVAL;
2475
2476 if (cache) {
2477 if (!cache->pfn || cache->gfn != gfn ||
2478 cache->generation != gen) {
2479 if (atomic)
2480 return -EAGAIN;
2481 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
2482 }
2483 pfn = cache->pfn;
2484 } else {
2485 if (atomic)
2486 return -EAGAIN;
2487 pfn = gfn_to_pfn_memslot(slot, gfn);
2488 }
2489 if (is_error_noslot_pfn(pfn))
2490 return -EINVAL;
2491
2492 if (pfn_valid(pfn)) {
2493 page = pfn_to_page(pfn);
2494 if (atomic)
2495 hva = kmap_atomic(page);
2496 else
2497 hva = kmap(page);
2498#ifdef CONFIG_HAS_IOMEM
2499 } else if (!atomic) {
2500 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2501 } else {
2502 return -EINVAL;
2503#endif
2504 }
2505
2506 if (!hva)
2507 return -EFAULT;
2508
2509 map->page = page;
2510 map->hva = hva;
2511 map->pfn = pfn;
2512 map->gfn = gfn;
2513
2514 return 0;
2515}
2516
2517int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
2518 struct gfn_to_pfn_cache *cache, bool atomic)
2519{
2520 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
2521 cache, atomic);
2522}
2523EXPORT_SYMBOL_GPL(kvm_map_gfn);
2524
2525int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2526{
2527 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
2528 NULL, false);
2529}
2530EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2531
2532static void __kvm_unmap_gfn(struct kvm *kvm,
2533 struct kvm_memory_slot *memslot,
2534 struct kvm_host_map *map,
2535 struct gfn_to_pfn_cache *cache,
2536 bool dirty, bool atomic)
2537{
2538 if (!map)
2539 return;
2540
2541 if (!map->hva)
2542 return;
2543
2544 if (map->page != KVM_UNMAPPED_PAGE) {
2545 if (atomic)
2546 kunmap_atomic(map->hva);
2547 else
2548 kunmap(map->page);
2549 }
2550#ifdef CONFIG_HAS_IOMEM
2551 else if (!atomic)
2552 memunmap(map->hva);
2553 else
2554 WARN_ONCE(1, "Unexpected unmapping in atomic context");
2555#endif
2556
2557 if (dirty)
2558 mark_page_dirty_in_slot(kvm, memslot, map->gfn);
2559
2560 if (cache)
2561 cache->dirty |= dirty;
2562 else
2563 kvm_release_pfn(map->pfn, dirty, NULL);
2564
2565 map->hva = NULL;
2566 map->page = NULL;
2567}
2568
2569int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
2570 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
2571{
2572 __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
2573 cache, dirty, atomic);
2574 return 0;
2575}
2576EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2577
2578void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2579{
2580 __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
2581 map, NULL, dirty, false);
2582}
2583EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2584
2585struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2586{
2587 kvm_pfn_t pfn;
2588
2589 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2590
2591 return kvm_pfn_to_page(pfn);
2592}
2593EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2594
2595void kvm_release_page_clean(struct page *page)
2596{
2597 WARN_ON(is_error_page(page));
2598
2599 kvm_release_pfn_clean(page_to_pfn(page));
2600}
2601EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2602
2603void kvm_release_pfn_clean(kvm_pfn_t pfn)
2604{
2605 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2606 put_page(pfn_to_page(pfn));
2607}
2608EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2609
2610void kvm_release_page_dirty(struct page *page)
2611{
2612 WARN_ON(is_error_page(page));
2613
2614 kvm_release_pfn_dirty(page_to_pfn(page));
2615}
2616EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2617
2618void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2619{
2620 kvm_set_pfn_dirty(pfn);
2621 kvm_release_pfn_clean(pfn);
2622}
2623EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2624
2625void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2626{
2627 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2628 SetPageDirty(pfn_to_page(pfn));
2629}
2630EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2631
2632void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2633{
2634 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2635 mark_page_accessed(pfn_to_page(pfn));
2636}
2637EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2638
2639void kvm_get_pfn(kvm_pfn_t pfn)
2640{
2641 if (!kvm_is_reserved_pfn(pfn))
2642 get_page(pfn_to_page(pfn));
2643}
2644EXPORT_SYMBOL_GPL(kvm_get_pfn);
2645
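/*
 * Return how many bytes of @len can be copied in this iteration without
 * crossing a page boundary: the remainder of the page at @offset, or all
 * of @len if it fits within the page.
 */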
2646static int next_segment(unsigned long len, int offset)
2647{
2648 if (len > PAGE_SIZE - offset)
2649 return PAGE_SIZE - offset;
2650 else
2651 return len;
2652}
2653
2654static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2655 void *data, int offset, int len)
2656{
2657 int r;
2658 unsigned long addr;
2659
2660 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2661 if (kvm_is_error_hva(addr))
2662 return -EFAULT;
2663 r = __copy_from_user(data, (void __user *)addr + offset, len);
2664 if (r)
2665 return -EFAULT;
2666 return 0;
2667}
2668
2669int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2670 int len)
2671{
2672 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2673
2674 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2675}
2676EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2677
2678int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2679 int offset, int len)
2680{
2681 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2682
2683 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2684}
2685EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2686
2687int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2688{
2689 gfn_t gfn = gpa >> PAGE_SHIFT;
2690 int seg;
2691 int offset = offset_in_page(gpa);
2692 int ret;
2693
2694 while ((seg = next_segment(len, offset)) != 0) {
2695 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2696 if (ret < 0)
2697 return ret;
2698 offset = 0;
2699 len -= seg;
2700 data += seg;
2701 ++gfn;
2702 }
2703 return 0;
2704}
2705EXPORT_SYMBOL_GPL(kvm_read_guest);
2706
2707int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2708{
2709 gfn_t gfn = gpa >> PAGE_SHIFT;
2710 int seg;
2711 int offset = offset_in_page(gpa);
2712 int ret;
2713
2714 while ((seg = next_segment(len, offset)) != 0) {
2715 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2716 if (ret < 0)
2717 return ret;
2718 offset = 0;
2719 len -= seg;
2720 data += seg;
2721 ++gfn;
2722 }
2723 return 0;
2724}
2725EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2726
2727static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2728 void *data, int offset, unsigned long len)
2729{
2730 int r;
2731 unsigned long addr;
2732
2733 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2734 if (kvm_is_error_hva(addr))
2735 return -EFAULT;
2736 pagefault_disable();
2737 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2738 pagefault_enable();
2739 if (r)
2740 return -EFAULT;
2741 return 0;
2742}
2743
2744int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2745 void *data, unsigned long len)
2746{
2747 gfn_t gfn = gpa >> PAGE_SHIFT;
2748 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2749 int offset = offset_in_page(gpa);
2750
2751 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2752}
2753EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2754
2755static int __kvm_write_guest_page(struct kvm *kvm,
2756 struct kvm_memory_slot *memslot, gfn_t gfn,
2757 const void *data, int offset, int len)
2758{
2759 int r;
2760 unsigned long addr;
2761
2762 addr = gfn_to_hva_memslot(memslot, gfn);
2763 if (kvm_is_error_hva(addr))
2764 return -EFAULT;
2765 r = __copy_to_user((void __user *)addr + offset, data, len);
2766 if (r)
2767 return -EFAULT;
2768 mark_page_dirty_in_slot(kvm, memslot, gfn);
2769 return 0;
2770}
2771
2772int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2773 const void *data, int offset, int len)
2774{
2775 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2776
2777 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2778}
2779EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2780
2781int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2782 const void *data, int offset, int len)
2783{
2784 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2785
2786 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2787}
2788EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2789
2790int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2791 unsigned long len)
2792{
2793 gfn_t gfn = gpa >> PAGE_SHIFT;
2794 int seg;
2795 int offset = offset_in_page(gpa);
2796 int ret;
2797
2798 while ((seg = next_segment(len, offset)) != 0) {
2799 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2800 if (ret < 0)
2801 return ret;
2802 offset = 0;
2803 len -= seg;
2804 data += seg;
2805 ++gfn;
2806 }
2807 return 0;
2808}
2809EXPORT_SYMBOL_GPL(kvm_write_guest);
2810
2811int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2812 unsigned long len)
2813{
2814 gfn_t gfn = gpa >> PAGE_SHIFT;
2815 int seg;
2816 int offset = offset_in_page(gpa);
2817 int ret;
2818
2819 while ((seg = next_segment(len, offset)) != 0) {
2820 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2821 if (ret < 0)
2822 return ret;
2823 offset = 0;
2824 len -= seg;
2825 data += seg;
2826 ++gfn;
2827 }
2828 return 0;
2829}
2830EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2831
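/*
 * Initialize a gfn_to_hva_cache for the guest range [@gpa, @gpa + @len).
 * The translated hva and memslot are remembered together with the memslots
 * generation so that cached accessors can detect stale translations.
 */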
2832static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2833 struct gfn_to_hva_cache *ghc,
2834 gpa_t gpa, unsigned long len)
2835{
2836 int offset = offset_in_page(gpa);
2837 gfn_t start_gfn = gpa >> PAGE_SHIFT;
2838 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2839 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2840 gfn_t nr_pages_avail;
2841
	/* Update ghc->generation before performing any error checks. */
2843 ghc->generation = slots->generation;
2844
2845 if (start_gfn > end_gfn) {
2846 ghc->hva = KVM_HVA_ERR_BAD;
2847 return -EINVAL;
2848 }
2849
	/*
	 * If the requested region crosses two memslots, we still
	 * verify that the entire region is valid here.
	 */
2854 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2855 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2856 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2857 &nr_pages_avail);
2858 if (kvm_is_error_hva(ghc->hva))
2859 return -EFAULT;
2860 }
2861
	/* Use the slow path for cross page reads and writes. */
2863 if (nr_pages_needed == 1)
2864 ghc->hva += offset;
2865 else
2866 ghc->memslot = NULL;
2867
2868 ghc->gpa = gpa;
2869 ghc->len = len;
2870 return 0;
2871}
2872
2873int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2874 gpa_t gpa, unsigned long len)
2875{
2876 struct kvm_memslots *slots = kvm_memslots(kvm);
2877 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2878}
2879EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2880
2881int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2882 void *data, unsigned int offset,
2883 unsigned long len)
2884{
2885 struct kvm_memslots *slots = kvm_memslots(kvm);
2886 int r;
2887 gpa_t gpa = ghc->gpa + offset;
2888
2889 BUG_ON(len + offset > ghc->len);
2890
2891 if (slots->generation != ghc->generation) {
2892 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2893 return -EFAULT;
2894 }
2895
2896 if (kvm_is_error_hva(ghc->hva))
2897 return -EFAULT;
2898
2899 if (unlikely(!ghc->memslot))
2900 return kvm_write_guest(kvm, gpa, data, len);
2901
2902 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2903 if (r)
2904 return -EFAULT;
2905 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
2906
2907 return 0;
2908}
2909EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2910
2911int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2912 void *data, unsigned long len)
2913{
2914 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2915}
2916EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
2917
2918int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2919 void *data, unsigned int offset,
2920 unsigned long len)
2921{
2922 struct kvm_memslots *slots = kvm_memslots(kvm);
2923 int r;
2924 gpa_t gpa = ghc->gpa + offset;
2925
2926 BUG_ON(len + offset > ghc->len);
2927
2928 if (slots->generation != ghc->generation) {
2929 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2930 return -EFAULT;
2931 }
2932
2933 if (kvm_is_error_hva(ghc->hva))
2934 return -EFAULT;
2935
2936 if (unlikely(!ghc->memslot))
2937 return kvm_read_guest(kvm, gpa, data, len);
2938
2939 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
2940 if (r)
2941 return -EFAULT;
2942
2943 return 0;
2944}
2945EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2946
2947int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2948 void *data, unsigned long len)
2949{
2950 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
2951}
2952EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
2953
2954int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
2955{
2956 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
2957 gfn_t gfn = gpa >> PAGE_SHIFT;
2958 int seg;
2959 int offset = offset_in_page(gpa);
2960 int ret;
2961
2962 while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
2964 if (ret < 0)
2965 return ret;
2966 offset = 0;
2967 len -= seg;
2968 ++gfn;
2969 }
2970 return 0;
2971}
2972EXPORT_SYMBOL_GPL(kvm_clear_guest);
2973
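/*
 * Record that @gfn in @memslot was written.  If dirty tracking is enabled
 * for the slot, the write is either pushed onto the per-vCPU dirty ring or
 * recorded in the memslot's dirty bitmap.
 */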
2974void mark_page_dirty_in_slot(struct kvm *kvm,
2975 struct kvm_memory_slot *memslot,
2976 gfn_t gfn)
2977{
2978 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
2979 unsigned long rel_gfn = gfn - memslot->base_gfn;
2980 u32 slot = (memslot->as_id << 16) | memslot->id;
2981
2982 if (kvm->dirty_ring_size)
2983 kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
2984 slot, rel_gfn);
2985 else
2986 set_bit_le(rel_gfn, memslot->dirty_bitmap);
2987 }
2988}
2989EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
2990
2991void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
2992{
2993 struct kvm_memory_slot *memslot;
2994
2995 memslot = gfn_to_memslot(kvm, gfn);
2996 mark_page_dirty_in_slot(kvm, memslot, gfn);
2997}
2998EXPORT_SYMBOL_GPL(mark_page_dirty);
2999
3000void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3001{
3002 struct kvm_memory_slot *memslot;
3003
3004 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3005 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3006}
3007EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3008
3009void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3010{
3011 if (!vcpu->sigset_active)
3012 return;
3013
	/*
	 * This does a lockless modification of ->real_blocked, which is
	 * fine because only current can change ->real_blocked and all
	 * readers of ->real_blocked don't care as long as ->real_blocked
	 * is always a subset of ->blocked.
	 */
	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3021}
3022
3023void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3024{
3025 if (!vcpu->sigset_active)
3026 return;
3027
	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
	sigemptyset(&current->real_blocked);
3030}
3031
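/*
 * Adaptive halt polling: scale the vCPU's polling window up by the
 * halt_poll_ns_grow factor, starting from halt_poll_ns_grow_start and
 * capped at the VM's max_halt_poll_ns.
 */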
3032static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3033{
3034 unsigned int old, val, grow, grow_start;
3035
3036 old = val = vcpu->halt_poll_ns;
3037 grow_start = READ_ONCE(halt_poll_ns_grow_start);
3038 grow = READ_ONCE(halt_poll_ns_grow);
3039 if (!grow)
3040 goto out;
3041
3042 val *= grow;
3043 if (val < grow_start)
3044 val = grow_start;
3045
3046 if (val > vcpu->kvm->max_halt_poll_ns)
3047 val = vcpu->kvm->max_halt_poll_ns;
3048
3049 vcpu->halt_poll_ns = val;
3050out:
3051 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3052}
3053
3054static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3055{
3056 unsigned int old, val, shrink;
3057
3058 old = val = vcpu->halt_poll_ns;
3059 shrink = READ_ONCE(halt_poll_ns_shrink);
3060 if (shrink == 0)
3061 val = 0;
3062 else
3063 val /= shrink;
3064
3065 vcpu->halt_poll_ns = val;
3066 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3067}
3068
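/*
 * Check whether a blocked vCPU may keep waiting.  Returns 0 if it should
 * continue to block, or -EINTR if it became runnable, has a pending timer
 * or signal, or a KVM_REQ_UNBLOCK request is outstanding.
 */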
3069static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3070{
3071 int ret = -EINTR;
3072 int idx = srcu_read_lock(&vcpu->kvm->srcu);
3073
3074 if (kvm_arch_vcpu_runnable(vcpu)) {
3075 kvm_make_request(KVM_REQ_UNHALT, vcpu);
3076 goto out;
3077 }
3078 if (kvm_cpu_has_pending_timer(vcpu))
3079 goto out;
3080 if (signal_pending(current))
3081 goto out;
3082 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3083 goto out;
3084
3085 ret = 0;
3086out:
3087 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3088 return ret;
3089}
3090
3091static inline void
3092update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3093{
3094 if (waited)
3095 vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
3096 else
3097 vcpu->stat.generic.halt_poll_success_ns += poll_ns;
3098}
3099
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
3103void kvm_vcpu_block(struct kvm_vcpu *vcpu)
3104{
3105 ktime_t start, cur, poll_end;
3106 bool waited = false;
3107 u64 block_ns;
3108
3109 kvm_arch_vcpu_blocking(vcpu);
3110
3111 start = cur = poll_end = ktime_get();
3112 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
3113 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
3114
3115 ++vcpu->stat.generic.halt_attempted_poll;
3116 do {
			/*
			 * This sets KVM_REQ_UNHALT if an interrupt
			 * arrives.
			 */
3121 if (kvm_vcpu_check_block(vcpu) < 0) {
3122 ++vcpu->stat.generic.halt_successful_poll;
3123 if (!vcpu_valid_wakeup(vcpu))
3124 ++vcpu->stat.generic.halt_poll_invalid;
3125 goto out;
3126 }
3127 cpu_relax();
3128 poll_end = cur = ktime_get();
3129 } while (kvm_vcpu_can_poll(cur, stop));
3130 }
3131
3132 prepare_to_rcuwait(&vcpu->wait);
3133 for (;;) {
3134 set_current_state(TASK_INTERRUPTIBLE);
3135
3136 if (kvm_vcpu_check_block(vcpu) < 0)
3137 break;
3138
3139 waited = true;
3140 schedule();
3141 }
3142 finish_rcuwait(&vcpu->wait);
3143 cur = ktime_get();
3144out:
3145 kvm_arch_vcpu_unblocking(vcpu);
3146 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3147
3148 update_halt_poll_stats(
3149 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3150
3151 if (!kvm_arch_no_poll(vcpu)) {
3152 if (!vcpu_valid_wakeup(vcpu)) {
3153 shrink_halt_poll_ns(vcpu);
3154 } else if (vcpu->kvm->max_halt_poll_ns) {
3155 if (block_ns <= vcpu->halt_poll_ns)
3156 ;
			/* we had a long block, shrink polling */
3158 else if (vcpu->halt_poll_ns &&
3159 block_ns > vcpu->kvm->max_halt_poll_ns)
3160 shrink_halt_poll_ns(vcpu);
			/* we had a short halt and our poll time is too small */
3162 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3163 block_ns < vcpu->kvm->max_halt_poll_ns)
3164 grow_halt_poll_ns(vcpu);
3165 } else {
3166 vcpu->halt_poll_ns = 0;
3167 }
3168 }
3169
3170 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3171 kvm_arch_vcpu_block_finish(vcpu);
3172}
3173EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3174
3175bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3176{
3177 struct rcuwait *waitp;
3178
3179 waitp = kvm_arch_vcpu_get_wait(vcpu);
3180 if (rcuwait_wake_up(waitp)) {
3181 WRITE_ONCE(vcpu->ready, true);
3182 ++vcpu->stat.generic.halt_wakeup;
3183 return true;
3184 }
3185
3186 return false;
3187}
3188EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3189
3190#ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel
 * mode.
 */
3194void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3195{
3196 int me;
3197 int cpu = vcpu->cpu;
3198
3199 if (kvm_vcpu_wake_up(vcpu))
3200 return;
3201
3202 me = get_cpu();
3203 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3204 if (kvm_arch_vcpu_should_kick(vcpu))
3205 smp_send_reschedule(cpu);
3206 put_cpu();
3207}
3208EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3209#endif
3210
3211int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3212{
3213 struct pid *pid;
3214 struct task_struct *task = NULL;
3215 int ret = 0;
3216
3217 rcu_read_lock();
3218 pid = rcu_dereference(target->pid);
3219 if (pid)
3220 task = get_pid_task(pid, PIDTYPE_PID);
3221 rcu_read_unlock();
3222 if (!task)
3223 return ret;
3224 ret = yield_to(task, 1);
3225 put_task_struct(task);
3226
3227 return ret;
3228}
3229EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3230
/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * The most eligible candidate to yield to is decided by the following
 * heuristics:
 *
 *  (a) a VCPU which has not recently done a PLE exit / cpu-relax
 *      intercept (a likely preempted lock holder), indicated by
 *      @in_spin_loop being false;
 *
 *  (b) a VCPU which did exit but was skipped last time, tracked by
 *      toggling @dy_eligible each time it is checked.
 *
 * The heuristics are racy by design: reading another VCPU's data without
 * locking is harmless, at worst the yield target is suboptimal and the
 * next candidate is tried.
 */
3253static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3254{
3255#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3256 bool eligible;
3257
3258 eligible = !vcpu->spin_loop.in_spin_loop ||
3259 vcpu->spin_loop.dy_eligible;
3260
3261 if (vcpu->spin_loop.in_spin_loop)
3262 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3263
3264 return eligible;
3265#else
3266 return true;
3267#endif
3268}
3269
3270
3271
3272
3273
3274
3275bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3276{
3277 return kvm_arch_vcpu_runnable(vcpu);
3278}
3279
3280static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3281{
3282 if (kvm_arch_dy_runnable(vcpu))
3283 return true;
3284
3285#ifdef CONFIG_KVM_ASYNC_PF
3286 if (!list_empty_careful(&vcpu->async_pf.done))
3287 return true;
3288#endif
3289
3290 return false;
3291}
3292
3293bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3294{
3295 return false;
3296}
3297
3298void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3299{
3300 struct kvm *kvm = me->kvm;
3301 struct kvm_vcpu *vcpu;
3302 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3303 int yielded = 0;
3304 int try = 3;
3305 int pass;
3306 int i;
3307
3308 kvm_vcpu_set_in_spin_loop(me, true);
3309
	/*
	 * We boost the priority of a VCPU that is runnable but not
	 * currently running, because it got preempted by something
	 * else and called schedule in __vcpu_run.  Hopefully that
	 * VCPU is holding the lock that we need and will release it.
	 * We approximate round-robin by starting at the last boosted VCPU.
	 */
3316 for (pass = 0; pass < 2 && !yielded && try; pass++) {
3317 kvm_for_each_vcpu(i, vcpu, kvm) {
3318 if (!pass && i <= last_boosted_vcpu) {
3319 i = last_boosted_vcpu;
3320 continue;
3321 } else if (pass && i > last_boosted_vcpu)
3322 break;
3323 if (!READ_ONCE(vcpu->ready))
3324 continue;
3325 if (vcpu == me)
3326 continue;
3327 if (rcuwait_active(&vcpu->wait) &&
3328 !vcpu_dy_runnable(vcpu))
3329 continue;
3330 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3331 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3332 !kvm_arch_vcpu_in_kernel(vcpu))
3333 continue;
3334 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3335 continue;
3336
3337 yielded = kvm_vcpu_yield_to(vcpu);
3338 if (yielded > 0) {
3339 kvm->last_boosted_vcpu = i;
3340 break;
3341 } else if (yielded < 0) {
3342 try--;
3343 if (!try)
3344 break;
3345 }
3346 }
3347 }
3348 kvm_vcpu_set_in_spin_loop(me, false);
3349
	/* Ensure vcpu is not eligible during next spinloop */
3351 kvm_vcpu_set_dy_eligible(me, false);
3352}
3353EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3354
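/*
 * Return true if @pgoff of the vCPU fd lies inside the mmap window that is
 * reserved for the dirty ring (starting at KVM_DIRTY_LOG_PAGE_OFFSET).
 */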
3355static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3356{
3357#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3358 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3359 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3360 kvm->dirty_ring_size / PAGE_SIZE);
3361#else
3362 return false;
3363#endif
3364}
3365
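/*
 * Fault handler for the vCPU fd mapping: hands out the kvm_run page, the
 * x86 PIO page, the coalesced MMIO ring page or a dirty ring page
 * depending on the page offset, and defers anything else to the arch code.
 */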
3366static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3367{
3368 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3369 struct page *page;
3370
3371 if (vmf->pgoff == 0)
3372 page = virt_to_page(vcpu->run);
3373#ifdef CONFIG_X86
3374 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3375 page = virt_to_page(vcpu->arch.pio_data);
3376#endif
3377#ifdef CONFIG_KVM_MMIO
3378 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3379 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3380#endif
3381 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3382 page = kvm_dirty_ring_get_page(
3383 &vcpu->dirty_ring,
3384 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3385 else
3386 return kvm_arch_vcpu_fault(vcpu, vmf);
3387 get_page(page);
3388 vmf->page = page;
3389 return 0;
3390}
3391
3392static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3393 .fault = kvm_vcpu_fault,
3394};
3395
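/*
 * mmap of the vCPU fd.  Dirty ring pages may only be mapped shared and
 * non-executable; all pages are then populated on demand by
 * kvm_vcpu_fault().
 */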
3396static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3397{
3398 struct kvm_vcpu *vcpu = file->private_data;
3399 unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3400
3401 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3402 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3403 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3404 return -EINVAL;
3405
3406 vma->vm_ops = &kvm_vcpu_vm_ops;
3407 return 0;
3408}
3409
3410static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3411{
3412 struct kvm_vcpu *vcpu = filp->private_data;
3413
3414 kvm_put_kvm(vcpu->kvm);
3415 return 0;
3416}
3417
3418static struct file_operations kvm_vcpu_fops = {
3419 .release = kvm_vcpu_release,
3420 .unlocked_ioctl = kvm_vcpu_ioctl,
3421 .mmap = kvm_vcpu_mmap,
3422 .llseek = noop_llseek,
3423 KVM_COMPAT(kvm_vcpu_compat_ioctl),
3424};
3425
/*
 * Allocates an inode for the vcpu.
 */
3429static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3430{
3431 char name[8 + 1 + ITOA_MAX_LEN + 1];
3432
3433 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3434 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3435}
3436
3437static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3438{
3439#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3440 struct dentry *debugfs_dentry;
3441 char dir_name[ITOA_MAX_LEN * 2];
3442
3443 if (!debugfs_initialized())
3444 return;
3445
3446 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3447 debugfs_dentry = debugfs_create_dir(dir_name,
3448 vcpu->kvm->debugfs_dentry);
3449
3450 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3451#endif
3452}
3453
/*
 * Create a vcpu for this VM and expose it to userspace via a new fd.
 */
3457static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3458{
3459 int r;
3460 struct kvm_vcpu *vcpu;
3461 struct page *page;
3462
3463 if (id >= KVM_MAX_VCPU_ID)
3464 return -EINVAL;
3465
3466 mutex_lock(&kvm->lock);
3467 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3468 mutex_unlock(&kvm->lock);
3469 return -EINVAL;
3470 }
3471
3472 kvm->created_vcpus++;
3473 mutex_unlock(&kvm->lock);
3474
3475 r = kvm_arch_vcpu_precreate(kvm, id);
3476 if (r)
3477 goto vcpu_decrement;
3478
3479 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3480 if (!vcpu) {
3481 r = -ENOMEM;
3482 goto vcpu_decrement;
3483 }
3484
3485 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3486 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3487 if (!page) {
3488 r = -ENOMEM;
3489 goto vcpu_free;
3490 }
3491 vcpu->run = page_address(page);
3492
3493 kvm_vcpu_init(vcpu, kvm, id);
3494
3495 r = kvm_arch_vcpu_create(vcpu);
3496 if (r)
3497 goto vcpu_free_run_page;
3498
3499 if (kvm->dirty_ring_size) {
3500 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3501 id, kvm->dirty_ring_size);
3502 if (r)
3503 goto arch_vcpu_destroy;
3504 }
3505
3506 mutex_lock(&kvm->lock);
3507 if (kvm_get_vcpu_by_id(kvm, id)) {
3508 r = -EEXIST;
3509 goto unlock_vcpu_destroy;
3510 }
3511
3512 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3513 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3514
	/* Fill the stats id string for the vcpu */
3516 snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3517 task_pid_nr(current), id);
3518
	/* Now it's all set up, let userspace reach it */
3520 kvm_get_kvm(kvm);
3521 r = create_vcpu_fd(vcpu);
3522 if (r < 0) {
3523 kvm_put_kvm_no_destroy(kvm);
3524 goto unlock_vcpu_destroy;
3525 }
3526
3527 kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3528
	/*
	 * Pairs with smp_rmb() in kvm_get_vcpu(): write kvm->vcpus
	 * before the incremented kvm->online_vcpus can be observed.
	 */
3533 smp_wmb();
3534 atomic_inc(&kvm->online_vcpus);
3535
3536 mutex_unlock(&kvm->lock);
3537 kvm_arch_vcpu_postcreate(vcpu);
3538 kvm_create_vcpu_debugfs(vcpu);
3539 return r;
3540
3541unlock_vcpu_destroy:
3542 mutex_unlock(&kvm->lock);
3543 kvm_dirty_ring_free(&vcpu->dirty_ring);
3544arch_vcpu_destroy:
3545 kvm_arch_vcpu_destroy(vcpu);
3546vcpu_free_run_page:
3547 free_page((unsigned long)vcpu->run);
3548vcpu_free:
3549 kmem_cache_free(kvm_vcpu_cache, vcpu);
3550vcpu_decrement:
3551 mutex_lock(&kvm->lock);
3552 kvm->created_vcpus--;
3553 mutex_unlock(&kvm->lock);
3554 return r;
3555}
3556
3557static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3558{
3559 if (sigset) {
3560 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3561 vcpu->sigset_active = 1;
3562 vcpu->sigset = *sigset;
3563 } else
3564 vcpu->sigset_active = 0;
3565 return 0;
3566}
3567
3568static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3569 size_t size, loff_t *offset)
3570{
3571 struct kvm_vcpu *vcpu = file->private_data;
3572
3573 return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3574 &kvm_vcpu_stats_desc[0], &vcpu->stat,
3575 sizeof(vcpu->stat), user_buffer, size, offset);
3576}
3577
3578static const struct file_operations kvm_vcpu_stats_fops = {
3579 .read = kvm_vcpu_stats_read,
3580 .llseek = noop_llseek,
3581};
3582
3583static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3584{
3585 int fd;
3586 struct file *file;
3587 char name[15 + ITOA_MAX_LEN + 1];
3588
3589 snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3590
3591 fd = get_unused_fd_flags(O_CLOEXEC);
3592 if (fd < 0)
3593 return fd;
3594
3595 file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3596 if (IS_ERR(file)) {
3597 put_unused_fd(fd);
3598 return PTR_ERR(file);
3599 }
3600 file->f_mode |= FMODE_PREAD;
3601 fd_install(fd, file);
3602
3603 return fd;
3604}
3605
3606static long kvm_vcpu_ioctl(struct file *filp,
3607 unsigned int ioctl, unsigned long arg)
3608{
3609 struct kvm_vcpu *vcpu = filp->private_data;
3610 void __user *argp = (void __user *)arg;
3611 int r;
3612 struct kvm_fpu *fpu = NULL;
3613 struct kvm_sregs *kvm_sregs = NULL;
3614
3615 if (vcpu->kvm->mm != current->mm)
3616 return -EIO;
3617
3618 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3619 return -EINVAL;
3620
	/*
	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
	 * execution; mutex_lock() would break them.
	 */
3625 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3626 if (r != -ENOIOCTLCMD)
3627 return r;
3628
3629 if (mutex_lock_killable(&vcpu->mutex))
3630 return -EINTR;
3631 switch (ioctl) {
3632 case KVM_RUN: {
3633 struct pid *oldpid;
3634 r = -EINVAL;
3635 if (arg)
3636 goto out;
3637 oldpid = rcu_access_pointer(vcpu->pid);
3638 if (unlikely(oldpid != task_pid(current))) {
			/* The thread running this VCPU changed. */
3640 struct pid *newpid;
3641
3642 r = kvm_arch_vcpu_run_pid_change(vcpu);
3643 if (r)
3644 break;
3645
3646 newpid = get_task_pid(current, PIDTYPE_PID);
3647 rcu_assign_pointer(vcpu->pid, newpid);
3648 if (oldpid)
3649 synchronize_rcu();
3650 put_pid(oldpid);
3651 }
3652 r = kvm_arch_vcpu_ioctl_run(vcpu);
3653 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3654 break;
3655 }
3656 case KVM_GET_REGS: {
3657 struct kvm_regs *kvm_regs;
3658
3659 r = -ENOMEM;
3660 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3661 if (!kvm_regs)
3662 goto out;
3663 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3664 if (r)
3665 goto out_free1;
3666 r = -EFAULT;
3667 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3668 goto out_free1;
3669 r = 0;
3670out_free1:
3671 kfree(kvm_regs);
3672 break;
3673 }
3674 case KVM_SET_REGS: {
3675 struct kvm_regs *kvm_regs;
3676
3677 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3678 if (IS_ERR(kvm_regs)) {
3679 r = PTR_ERR(kvm_regs);
3680 goto out;
3681 }
3682 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3683 kfree(kvm_regs);
3684 break;
3685 }
3686 case KVM_GET_SREGS: {
3687 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3688 GFP_KERNEL_ACCOUNT);
3689 r = -ENOMEM;
3690 if (!kvm_sregs)
3691 goto out;
3692 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3693 if (r)
3694 goto out;
3695 r = -EFAULT;
3696 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3697 goto out;
3698 r = 0;
3699 break;
3700 }
3701 case KVM_SET_SREGS: {
3702 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3703 if (IS_ERR(kvm_sregs)) {
3704 r = PTR_ERR(kvm_sregs);
3705 kvm_sregs = NULL;
3706 goto out;
3707 }
3708 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3709 break;
3710 }
3711 case KVM_GET_MP_STATE: {
3712 struct kvm_mp_state mp_state;
3713
3714 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3715 if (r)
3716 goto out;
3717 r = -EFAULT;
3718 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3719 goto out;
3720 r = 0;
3721 break;
3722 }
3723 case KVM_SET_MP_STATE: {
3724 struct kvm_mp_state mp_state;
3725
3726 r = -EFAULT;
3727 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3728 goto out;
3729 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3730 break;
3731 }
3732 case KVM_TRANSLATE: {
3733 struct kvm_translation tr;
3734
3735 r = -EFAULT;
3736 if (copy_from_user(&tr, argp, sizeof(tr)))
3737 goto out;
3738 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3739 if (r)
3740 goto out;
3741 r = -EFAULT;
3742 if (copy_to_user(argp, &tr, sizeof(tr)))
3743 goto out;
3744 r = 0;
3745 break;
3746 }
3747 case KVM_SET_GUEST_DEBUG: {
3748 struct kvm_guest_debug dbg;
3749
3750 r = -EFAULT;
3751 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3752 goto out;
3753 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3754 break;
3755 }
3756 case KVM_SET_SIGNAL_MASK: {
3757 struct kvm_signal_mask __user *sigmask_arg = argp;
3758 struct kvm_signal_mask kvm_sigmask;
3759 sigset_t sigset, *p;
3760
3761 p = NULL;
3762 if (argp) {
3763 r = -EFAULT;
3764 if (copy_from_user(&kvm_sigmask, argp,
3765 sizeof(kvm_sigmask)))
3766 goto out;
3767 r = -EINVAL;
3768 if (kvm_sigmask.len != sizeof(sigset))
3769 goto out;
3770 r = -EFAULT;
3771 if (copy_from_user(&sigset, sigmask_arg->sigset,
3772 sizeof(sigset)))
3773 goto out;
3774 p = &sigset;
3775 }
3776 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3777 break;
3778 }
3779 case KVM_GET_FPU: {
3780 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3781 r = -ENOMEM;
3782 if (!fpu)
3783 goto out;
3784 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3785 if (r)
3786 goto out;
3787 r = -EFAULT;
3788 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3789 goto out;
3790 r = 0;
3791 break;
3792 }
3793 case KVM_SET_FPU: {
3794 fpu = memdup_user(argp, sizeof(*fpu));
3795 if (IS_ERR(fpu)) {
3796 r = PTR_ERR(fpu);
3797 fpu = NULL;
3798 goto out;
3799 }
3800 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3801 break;
3802 }
3803 case KVM_GET_STATS_FD: {
3804 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3805 break;
3806 }
3807 default:
3808 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3809 }
3810out:
3811 mutex_unlock(&vcpu->mutex);
3812 kfree(fpu);
3813 kfree(kvm_sregs);
3814 return r;
3815}
3816
3817#ifdef CONFIG_KVM_COMPAT
3818static long kvm_vcpu_compat_ioctl(struct file *filp,
3819 unsigned int ioctl, unsigned long arg)
3820{
3821 struct kvm_vcpu *vcpu = filp->private_data;
3822 void __user *argp = compat_ptr(arg);
3823 int r;
3824
3825 if (vcpu->kvm->mm != current->mm)
3826 return -EIO;
3827
3828 switch (ioctl) {
3829 case KVM_SET_SIGNAL_MASK: {
3830 struct kvm_signal_mask __user *sigmask_arg = argp;
3831 struct kvm_signal_mask kvm_sigmask;
3832 sigset_t sigset;
3833
3834 if (argp) {
3835 r = -EFAULT;
3836 if (copy_from_user(&kvm_sigmask, argp,
3837 sizeof(kvm_sigmask)))
3838 goto out;
3839 r = -EINVAL;
3840 if (kvm_sigmask.len != sizeof(compat_sigset_t))
3841 goto out;
3842 r = -EFAULT;
3843 if (get_compat_sigset(&sigset,
3844 (compat_sigset_t __user *)sigmask_arg->sigset))
3845 goto out;
3846 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3847 } else
3848 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3849 break;
3850 }
3851 default:
3852 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3853 }
3854
3855out:
3856 return r;
3857}
3858#endif
3859
3860static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3861{
3862 struct kvm_device *dev = filp->private_data;
3863
3864 if (dev->ops->mmap)
3865 return dev->ops->mmap(dev, vma);
3866
3867 return -ENODEV;
3868}
3869
3870static int kvm_device_ioctl_attr(struct kvm_device *dev,
3871 int (*accessor)(struct kvm_device *dev,
3872 struct kvm_device_attr *attr),
3873 unsigned long arg)
3874{
3875 struct kvm_device_attr attr;
3876
3877 if (!accessor)
3878 return -EPERM;
3879
3880 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3881 return -EFAULT;
3882
3883 return accessor(dev, &attr);
3884}
3885
3886static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3887 unsigned long arg)
3888{
3889 struct kvm_device *dev = filp->private_data;
3890
3891 if (dev->kvm->mm != current->mm)
3892 return -EIO;
3893
3894 switch (ioctl) {
3895 case KVM_SET_DEVICE_ATTR:
3896 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3897 case KVM_GET_DEVICE_ATTR:
3898 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3899 case KVM_HAS_DEVICE_ATTR:
3900 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3901 default:
3902 if (dev->ops->ioctl)
3903 return dev->ops->ioctl(dev, ioctl, arg);
3904
3905 return -ENOTTY;
3906 }
3907}
3908
3909static int kvm_device_release(struct inode *inode, struct file *filp)
3910{
3911 struct kvm_device *dev = filp->private_data;
3912 struct kvm *kvm = dev->kvm;
3913
3914 if (dev->ops->release) {
3915 mutex_lock(&kvm->lock);
3916 list_del(&dev->vm_node);
3917 dev->ops->release(dev);
3918 mutex_unlock(&kvm->lock);
3919 }
3920
3921 kvm_put_kvm(kvm);
3922 return 0;
3923}
3924
3925static const struct file_operations kvm_device_fops = {
3926 .unlocked_ioctl = kvm_device_ioctl,
3927 .release = kvm_device_release,
3928 KVM_COMPAT(kvm_device_ioctl),
3929 .mmap = kvm_device_mmap,
3930};
3931
3932struct kvm_device *kvm_device_from_filp(struct file *filp)
3933{
3934 if (filp->f_op != &kvm_device_fops)
3935 return NULL;
3936
3937 return filp->private_data;
3938}
3939
3940static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3941#ifdef CONFIG_KVM_MPIC
3942 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
3943 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
3944#endif
3945};
3946
3947int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
3948{
3949 if (type >= ARRAY_SIZE(kvm_device_ops_table))
3950 return -ENOSPC;
3951
3952 if (kvm_device_ops_table[type] != NULL)
3953 return -EEXIST;
3954
3955 kvm_device_ops_table[type] = ops;
3956 return 0;
3957}
3958
3959void kvm_unregister_device_ops(u32 type)
3960{
3961 if (kvm_device_ops_table[type] != NULL)
3962 kvm_device_ops_table[type] = NULL;
3963}
3964
3965static int kvm_ioctl_create_device(struct kvm *kvm,
3966 struct kvm_create_device *cd)
3967{
3968 const struct kvm_device_ops *ops = NULL;
3969 struct kvm_device *dev;
3970 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
3971 int type;
3972 int ret;
3973
3974 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
3975 return -ENODEV;
3976
3977 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
3978 ops = kvm_device_ops_table[type];
3979 if (ops == NULL)
3980 return -ENODEV;
3981
3982 if (test)
3983 return 0;
3984
3985 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
3986 if (!dev)
3987 return -ENOMEM;
3988
3989 dev->ops = ops;
3990 dev->kvm = kvm;
3991
3992 mutex_lock(&kvm->lock);
3993 ret = ops->create(dev, type);
3994 if (ret < 0) {
3995 mutex_unlock(&kvm->lock);
3996 kfree(dev);
3997 return ret;
3998 }
3999 list_add(&dev->vm_node, &kvm->devices);
4000 mutex_unlock(&kvm->lock);
4001
4002 if (ops->init)
4003 ops->init(dev);
4004
4005 kvm_get_kvm(kvm);
4006 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4007 if (ret < 0) {
4008 kvm_put_kvm_no_destroy(kvm);
4009 mutex_lock(&kvm->lock);
4010 list_del(&dev->vm_node);
4011 mutex_unlock(&kvm->lock);
4012 ops->destroy(dev);
4013 return ret;
4014 }
4015
4016 cd->fd = ret;
4017 return 0;
4018}
4019
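/*
 * Handle KVM_CHECK_EXTENSION for capabilities implemented in generic code;
 * anything unknown is forwarded to the architecture via
 * kvm_vm_ioctl_check_extension().
 */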
4020static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4021{
4022 switch (arg) {
4023 case KVM_CAP_USER_MEMORY:
4024 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4025 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4026 case KVM_CAP_INTERNAL_ERROR_DATA:
4027#ifdef CONFIG_HAVE_KVM_MSI
4028 case KVM_CAP_SIGNAL_MSI:
4029#endif
4030#ifdef CONFIG_HAVE_KVM_IRQFD
4031 case KVM_CAP_IRQFD:
4032 case KVM_CAP_IRQFD_RESAMPLE:
4033#endif
4034 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4035 case KVM_CAP_CHECK_EXTENSION_VM:
4036 case KVM_CAP_ENABLE_CAP_VM:
4037 case KVM_CAP_HALT_POLL:
4038 return 1;
4039#ifdef CONFIG_KVM_MMIO
4040 case KVM_CAP_COALESCED_MMIO:
4041 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4042 case KVM_CAP_COALESCED_PIO:
4043 return 1;
4044#endif
4045#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4046 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4047 return KVM_DIRTY_LOG_MANUAL_CAPS;
4048#endif
4049#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4050 case KVM_CAP_IRQ_ROUTING:
4051 return KVM_MAX_IRQ_ROUTES;
4052#endif
4053#if KVM_ADDRESS_SPACE_NUM > 1
4054 case KVM_CAP_MULTI_ADDRESS_SPACE:
4055 return KVM_ADDRESS_SPACE_NUM;
4056#endif
4057 case KVM_CAP_NR_MEMSLOTS:
4058 return KVM_USER_MEM_SLOTS;
4059 case KVM_CAP_DIRTY_LOG_RING:
4060#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4061 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4062#else
4063 return 0;
4064#endif
4065 case KVM_CAP_BINARY_STATS_FD:
4066 return 1;
4067 default:
4068 break;
4069 }
4070 return kvm_vm_ioctl_check_extension(kvm, arg);
4071}
4072
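/*
 * Enable the dirty ring for a VM.  @size is the per-vCPU ring size in
 * bytes; it must be a power of two, large enough for the reserved entries
 * and at least a page, and can only be set before any vCPU is created.
 */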
4073static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4074{
4075 int r;
4076
4077 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4078 return -EINVAL;
4079
	/* the size should be power of 2 */
4081 if (!size || (size & (size - 1)))
4082 return -EINVAL;
4083
	/* Should hold the reserved entries and be at least a page */
4085 if (size < kvm_dirty_ring_get_rsvd_entries() *
4086 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4087 return -EINVAL;
4088
4089 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4090 sizeof(struct kvm_dirty_gfn))
4091 return -E2BIG;
4092
	/* The ring size can only be set once */
4094 if (kvm->dirty_ring_size)
4095 return -EINVAL;
4096
4097 mutex_lock(&kvm->lock);
4098
4099 if (kvm->created_vcpus) {
		/* The size cannot change once vCPUs have been created */
4101 r = -EINVAL;
4102 } else {
4103 kvm->dirty_ring_size = size;
4104 r = 0;
4105 }
4106
4107 mutex_unlock(&kvm->lock);
4108 return r;
4109}
4110
4111static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4112{
4113 int i;
4114 struct kvm_vcpu *vcpu;
4115 int cleared = 0;
4116
4117 if (!kvm->dirty_ring_size)
4118 return -EINVAL;
4119
4120 mutex_lock(&kvm->slots_lock);
4121
4122 kvm_for_each_vcpu(i, vcpu, kvm)
4123 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4124
4125 mutex_unlock(&kvm->slots_lock);
4126
4127 if (cleared)
4128 kvm_flush_remote_tlbs(kvm);
4129
4130 return cleared;
4131}
4132
4133int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4134 struct kvm_enable_cap *cap)
4135{
4136 return -EINVAL;
4137}
4138
4139static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4140 struct kvm_enable_cap *cap)
4141{
4142 switch (cap->cap) {
4143#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4144 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4145 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4146
4147 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4148 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4149
4150 if (cap->flags || (cap->args[0] & ~allowed_options))
4151 return -EINVAL;
4152 kvm->manual_dirty_log_protect = cap->args[0];
4153 return 0;
4154 }
4155#endif
4156 case KVM_CAP_HALT_POLL: {
4157 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4158 return -EINVAL;
4159
4160 kvm->max_halt_poll_ns = cap->args[0];
4161 return 0;
4162 }
4163 case KVM_CAP_DIRTY_LOG_RING:
4164 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4165 default:
4166 return kvm_vm_ioctl_enable_cap(kvm, cap);
4167 }
4168}
4169
4170static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4171 size_t size, loff_t *offset)
4172{
4173 struct kvm *kvm = file->private_data;
4174
4175 return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4176 &kvm_vm_stats_desc[0], &kvm->stat,
4177 sizeof(kvm->stat), user_buffer, size, offset);
4178}
4179
4180static const struct file_operations kvm_vm_stats_fops = {
4181 .read = kvm_vm_stats_read,
4182 .llseek = noop_llseek,
4183};
4184
4185static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4186{
4187 int fd;
4188 struct file *file;
4189
4190 fd = get_unused_fd_flags(O_CLOEXEC);
4191 if (fd < 0)
4192 return fd;
4193
4194 file = anon_inode_getfile("kvm-vm-stats",
4195 &kvm_vm_stats_fops, kvm, O_RDONLY);
4196 if (IS_ERR(file)) {
4197 put_unused_fd(fd);
4198 return PTR_ERR(file);
4199 }
4200 file->f_mode |= FMODE_PREAD;
4201 fd_install(fd, file);
4202
4203 return fd;
4204}
4205
4206static long kvm_vm_ioctl(struct file *filp,
4207 unsigned int ioctl, unsigned long arg)
4208{
4209 struct kvm *kvm = filp->private_data;
4210 void __user *argp = (void __user *)arg;
4211 int r;
4212
4213 if (kvm->mm != current->mm)
4214 return -EIO;
4215 switch (ioctl) {
4216 case KVM_CREATE_VCPU:
4217 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4218 break;
4219 case KVM_ENABLE_CAP: {
4220 struct kvm_enable_cap cap;
4221
4222 r = -EFAULT;
4223 if (copy_from_user(&cap, argp, sizeof(cap)))
4224 goto out;
4225 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4226 break;
4227 }
4228 case KVM_SET_USER_MEMORY_REGION: {
4229 struct kvm_userspace_memory_region kvm_userspace_mem;
4230
4231 r = -EFAULT;
4232 if (copy_from_user(&kvm_userspace_mem, argp,
4233 sizeof(kvm_userspace_mem)))
4234 goto out;
4235
4236 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4237 break;
4238 }
4239 case KVM_GET_DIRTY_LOG: {
4240 struct kvm_dirty_log log;
4241
4242 r = -EFAULT;
4243 if (copy_from_user(&log, argp, sizeof(log)))
4244 goto out;
4245 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4246 break;
4247 }
4248#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4249 case KVM_CLEAR_DIRTY_LOG: {
4250 struct kvm_clear_dirty_log log;
4251
4252 r = -EFAULT;
4253 if (copy_from_user(&log, argp, sizeof(log)))
4254 goto out;
4255 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4256 break;
4257 }
4258#endif
4259#ifdef CONFIG_KVM_MMIO
4260 case KVM_REGISTER_COALESCED_MMIO: {
4261 struct kvm_coalesced_mmio_zone zone;
4262
4263 r = -EFAULT;
4264 if (copy_from_user(&zone, argp, sizeof(zone)))
4265 goto out;
4266 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4267 break;
4268 }
4269 case KVM_UNREGISTER_COALESCED_MMIO: {
4270 struct kvm_coalesced_mmio_zone zone;
4271
4272 r = -EFAULT;
4273 if (copy_from_user(&zone, argp, sizeof(zone)))
4274 goto out;
4275 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4276 break;
4277 }
4278#endif
4279 case KVM_IRQFD: {
4280 struct kvm_irqfd data;
4281
4282 r = -EFAULT;
4283 if (copy_from_user(&data, argp, sizeof(data)))
4284 goto out;
4285 r = kvm_irqfd(kvm, &data);
4286 break;
4287 }
4288 case KVM_IOEVENTFD: {
4289 struct kvm_ioeventfd data;
4290
4291 r = -EFAULT;
4292 if (copy_from_user(&data, argp, sizeof(data)))
4293 goto out;
4294 r = kvm_ioeventfd(kvm, &data);
4295 break;
4296 }
4297#ifdef CONFIG_HAVE_KVM_MSI
4298 case KVM_SIGNAL_MSI: {
4299 struct kvm_msi msi;
4300
4301 r = -EFAULT;
4302 if (copy_from_user(&msi, argp, sizeof(msi)))
4303 goto out;
4304 r = kvm_send_userspace_msi(kvm, &msi);
4305 break;
4306 }
4307#endif
4308#ifdef __KVM_HAVE_IRQ_LINE
4309 case KVM_IRQ_LINE_STATUS:
4310 case KVM_IRQ_LINE: {
4311 struct kvm_irq_level irq_event;
4312
4313 r = -EFAULT;
4314 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4315 goto out;
4316
4317 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4318 ioctl == KVM_IRQ_LINE_STATUS);
4319 if (r)
4320 goto out;
4321
4322 r = -EFAULT;
4323 if (ioctl == KVM_IRQ_LINE_STATUS) {
4324 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4325 goto out;
4326 }
4327
4328 r = 0;
4329 break;
4330 }
4331#endif
4332#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4333 case KVM_SET_GSI_ROUTING: {
4334 struct kvm_irq_routing routing;
4335 struct kvm_irq_routing __user *urouting;
4336 struct kvm_irq_routing_entry *entries = NULL;
4337
4338 r = -EFAULT;
4339 if (copy_from_user(&routing, argp, sizeof(routing)))
4340 goto out;
4341 r = -EINVAL;
4342 if (!kvm_arch_can_set_irq_routing(kvm))
4343 goto out;
4344 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4345 goto out;
4346 if (routing.flags)
4347 goto out;
4348 if (routing.nr) {
4349 urouting = argp;
4350 entries = vmemdup_user(urouting->entries,
4351 array_size(sizeof(*entries),
4352 routing.nr));
4353 if (IS_ERR(entries)) {
4354 r = PTR_ERR(entries);
4355 goto out;
4356 }
4357 }
4358 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4359 routing.flags);
4360 kvfree(entries);
4361 break;
4362 }
4363#endif
4364 case KVM_CREATE_DEVICE: {
4365 struct kvm_create_device cd;
4366
4367 r = -EFAULT;
4368 if (copy_from_user(&cd, argp, sizeof(cd)))
4369 goto out;
4370
4371 r = kvm_ioctl_create_device(kvm, &cd);
4372 if (r)
4373 goto out;
4374
4375 r = -EFAULT;
4376 if (copy_to_user(argp, &cd, sizeof(cd)))
4377 goto out;
4378
4379 r = 0;
4380 break;
4381 }
4382 case KVM_CHECK_EXTENSION:
4383 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4384 break;
4385 case KVM_RESET_DIRTY_RINGS:
4386 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4387 break;
4388 case KVM_GET_STATS_FD:
4389 r = kvm_vm_ioctl_get_stats_fd(kvm);
4390 break;
4391 default:
4392 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4393 }
4394out:
4395 return r;
4396}
4397
4398#ifdef CONFIG_KVM_COMPAT
4399struct compat_kvm_dirty_log {
4400 __u32 slot;
4401 __u32 padding1;
4402 union {
4403 compat_uptr_t dirty_bitmap;
4404 __u64 padding2;
4405 };
4406};
4407
4408struct compat_kvm_clear_dirty_log {
4409 __u32 slot;
4410 __u32 num_pages;
4411 __u64 first_page;
4412 union {
4413 compat_uptr_t dirty_bitmap;
4414 __u64 padding2;
4415 };
4416};
4417
4418static long kvm_vm_compat_ioctl(struct file *filp,
4419 unsigned int ioctl, unsigned long arg)
4420{
4421 struct kvm *kvm = filp->private_data;
4422 int r;
4423
4424 if (kvm->mm != current->mm)
4425 return -EIO;
4426 switch (ioctl) {
4427#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4428 case KVM_CLEAR_DIRTY_LOG: {
4429 struct compat_kvm_clear_dirty_log compat_log;
4430 struct kvm_clear_dirty_log log;
4431
4432 if (copy_from_user(&compat_log, (void __user *)arg,
4433 sizeof(compat_log)))
4434 return -EFAULT;
4435 log.slot = compat_log.slot;
4436 log.num_pages = compat_log.num_pages;
4437 log.first_page = compat_log.first_page;
4438 log.padding2 = compat_log.padding2;
4439 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4440
4441 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4442 break;
4443 }
4444#endif
4445 case KVM_GET_DIRTY_LOG: {
4446 struct compat_kvm_dirty_log compat_log;
4447 struct kvm_dirty_log log;
4448
4449 if (copy_from_user(&compat_log, (void __user *)arg,
4450 sizeof(compat_log)))
4451 return -EFAULT;
4452 log.slot = compat_log.slot;
4453 log.padding1 = compat_log.padding1;
4454 log.padding2 = compat_log.padding2;
4455 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4456
4457 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4458 break;
4459 }
4460 default:
4461 r = kvm_vm_ioctl(filp, ioctl, arg);
4462 }
4463 return r;
4464}
4465#endif
4466
4467static struct file_operations kvm_vm_fops = {
4468 .release = kvm_vm_release,
4469 .unlocked_ioctl = kvm_vm_ioctl,
4470 .llseek = noop_llseek,
4471 KVM_COMPAT(kvm_vm_compat_ioctl),
4472};
4473
4474bool file_is_kvm(struct file *file)
4475{
4476 return file && file->f_op == &kvm_vm_fops;
4477}
4478EXPORT_SYMBOL_GPL(file_is_kvm);
4479
4480static int kvm_dev_ioctl_create_vm(unsigned long type)
4481{
4482 int r;
4483 struct kvm *kvm;
4484 struct file *file;
4485
4486 kvm = kvm_create_vm(type);
4487 if (IS_ERR(kvm))
4488 return PTR_ERR(kvm);
4489#ifdef CONFIG_KVM_MMIO
4490 r = kvm_coalesced_mmio_init(kvm);
4491 if (r < 0)
4492 goto put_kvm;
4493#endif
4494 r = get_unused_fd_flags(O_CLOEXEC);
4495 if (r < 0)
4496 goto put_kvm;
4497
4498 snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4499 "kvm-%d", task_pid_nr(current));
4500
4501 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4502 if (IS_ERR(file)) {
4503 put_unused_fd(r);
4504 r = PTR_ERR(file);
4505 goto put_kvm;
4506 }
4507
	/*
	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
	 * already set, with ->release() being kvm_vm_release().  In error
	 * cases it will be called by the final fput(file) and will take
	 * care of doing kvm_put_kvm(kvm).
	 */
4514 if (kvm_create_vm_debugfs(kvm, r) < 0) {
4515 put_unused_fd(r);
4516 fput(file);
4517 return -ENOMEM;
4518 }
4519 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4520
4521 fd_install(r, file);
4522 return r;
4523
4524put_kvm:
4525 kvm_put_kvm(kvm);
4526 return r;
4527}
4528
4529static long kvm_dev_ioctl(struct file *filp,
4530 unsigned int ioctl, unsigned long arg)
4531{
4532 long r = -EINVAL;
4533
4534 switch (ioctl) {
4535 case KVM_GET_API_VERSION:
4536 if (arg)
4537 goto out;
4538 r = KVM_API_VERSION;
4539 break;
4540 case KVM_CREATE_VM:
4541 r = kvm_dev_ioctl_create_vm(arg);
4542 break;
4543 case KVM_CHECK_EXTENSION:
4544 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4545 break;
4546 case KVM_GET_VCPU_MMAP_SIZE:
4547 if (arg)
4548 goto out;
4549 r = PAGE_SIZE;
4550#ifdef CONFIG_X86
4551 r += PAGE_SIZE;
4552#endif
4553#ifdef CONFIG_KVM_MMIO
4554 r += PAGE_SIZE;
4555#endif
4556 break;
4557 case KVM_TRACE_ENABLE:
4558 case KVM_TRACE_PAUSE:
4559 case KVM_TRACE_DISABLE:
4560 r = -EOPNOTSUPP;
4561 break;
4562 default:
4563 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4564 }
4565out:
4566 return r;
4567}
4568
4569static struct file_operations kvm_chardev_ops = {
4570 .unlocked_ioctl = kvm_dev_ioctl,
4571 .llseek = noop_llseek,
4572 KVM_COMPAT(kvm_dev_ioctl),
4573};
4574
4575static struct miscdevice kvm_dev = {
4576 KVM_MINOR,
4577 "kvm",
4578 &kvm_chardev_ops,
4579};
4580
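/*
 * Enable hardware virtualization on the local CPU and track it in
 * cpus_hardware_enabled.  Failures are counted in hardware_enable_failed
 * so that hardware_enable_all() can back out.
 */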
4581static void hardware_enable_nolock(void *junk)
4582{
4583 int cpu = raw_smp_processor_id();
4584 int r;
4585
4586 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4587 return;
4588
4589 cpumask_set_cpu(cpu, cpus_hardware_enabled);
4590
4591 r = kvm_arch_hardware_enable();
4592
4593 if (r) {
4594 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4595 atomic_inc(&hardware_enable_failed);
4596 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4597 }
4598}
4599
4600static int kvm_starting_cpu(unsigned int cpu)
4601{
4602 raw_spin_lock(&kvm_count_lock);
4603 if (kvm_usage_count)
4604 hardware_enable_nolock(NULL);
4605 raw_spin_unlock(&kvm_count_lock);
4606 return 0;
4607}
4608
4609static void hardware_disable_nolock(void *junk)
4610{
4611 int cpu = raw_smp_processor_id();
4612
4613 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4614 return;
4615 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4616 kvm_arch_hardware_disable();
4617}
4618
4619static int kvm_dying_cpu(unsigned int cpu)
4620{
4621 raw_spin_lock(&kvm_count_lock);
4622 if (kvm_usage_count)
4623 hardware_disable_nolock(NULL);
4624 raw_spin_unlock(&kvm_count_lock);
4625 return 0;
4626}
4627
4628static void hardware_disable_all_nolock(void)
4629{
4630 BUG_ON(!kvm_usage_count);
4631
4632 kvm_usage_count--;
4633 if (!kvm_usage_count)
4634 on_each_cpu(hardware_disable_nolock, NULL, 1);
4635}
4636
4637static void hardware_disable_all(void)
4638{
4639 raw_spin_lock(&kvm_count_lock);
4640 hardware_disable_all_nolock();
4641 raw_spin_unlock(&kvm_count_lock);
4642}
4643
4644static int hardware_enable_all(void)
4645{
4646 int r = 0;
4647
4648 raw_spin_lock(&kvm_count_lock);
4649
4650 kvm_usage_count++;
4651 if (kvm_usage_count == 1) {
4652 atomic_set(&hardware_enable_failed, 0);
4653 on_each_cpu(hardware_enable_nolock, NULL, 1);
4654
4655 if (atomic_read(&hardware_enable_failed)) {
4656 hardware_disable_all_nolock();
4657 r = -EBUSY;
4658 }
4659 }
4660
4661 raw_spin_unlock(&kvm_count_lock);
4662
4663 return r;
4664}
4665
4666static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4667 void *v)
4668{
	/*
	 * Disable hardware virtualization on all CPUs before reboot: some
	 * BIOSes hang on reboot if the CPU is still in VMX root mode, and
	 * Intel TXT requires VMX to be off on every CPU at shutdown.
	 */
4675 pr_info("kvm: exiting hardware virtualization\n");
4676 kvm_rebooting = true;
4677 on_each_cpu(hardware_disable_nolock, NULL, 1);
4678 return NOTIFY_OK;
4679}
4680
4681static struct notifier_block kvm_reboot_notifier = {
4682 .notifier_call = kvm_reboot,
4683 .priority = 0,
4684};
4685
4686static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4687{
4688 int i;
4689
4690 for (i = 0; i < bus->dev_count; i++) {
4691 struct kvm_io_device *pos = bus->range[i].dev;
4692
4693 kvm_iodevice_destructor(pos);
4694 }
4695 kfree(bus);
4696}
4697
4698static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4699 const struct kvm_io_range *r2)
4700{
4701 gpa_t addr1 = r1->addr;
4702 gpa_t addr2 = r2->addr;
4703
4704 if (addr1 < addr2)
4705 return -1;
4706
	/* If r2->len == 0, match the exact address.  If r2->len != 0,
	 * accept any overlapping range; ordering between overlapping
	 * ranges does not matter because kvm_io_bus_get_first_dev()
	 * walks back to the first match and all of them get processed.
	 */
4712 if (r2->len) {
4713 addr1 += r1->len;
4714 addr2 += r2->len;
4715 }
4716
4717 if (addr1 > addr2)
4718 return 1;
4719
4720 return 0;
4721}
4722
4723static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4724{
4725 return kvm_io_bus_cmp(p1, p2);
4726}
4727
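/*
 * Binary-search the sorted range array for the first device whose range
 * matches [@addr, @addr + @len); returns its index, or -ENOENT if no
 * device claims the range.
 */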
4728static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4729 gpa_t addr, int len)
4730{
4731 struct kvm_io_range *range, key;
4732 int off;
4733
4734 key = (struct kvm_io_range) {
4735 .addr = addr,
4736 .len = len,
4737 };
4738
4739 range = bsearch(&key, bus->range, bus->dev_count,
4740 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4741 if (range == NULL)
4742 return -ENOENT;
4743
4744 off = range - bus->range;
4745
4746 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4747 off--;
4748
4749 return off;
4750}
4751
4752static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4753 struct kvm_io_range *range, const void *val)
4754{
4755 int idx;
4756
4757 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4758 if (idx < 0)
4759 return -EOPNOTSUPP;
4760
4761 while (idx < bus->dev_count &&
4762 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4763 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4764 range->len, val))
4765 return idx;
4766 idx++;
4767 }
4768
4769 return -EOPNOTSUPP;
4770}
4771
4772
4773int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4774 int len, const void *val)
4775{
4776 struct kvm_io_bus *bus;
4777 struct kvm_io_range range;
4778 int r;
4779
4780 range = (struct kvm_io_range) {
4781 .addr = addr,
4782 .len = len,
4783 };
4784
4785 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4786 if (!bus)
4787 return -ENOMEM;
4788 r = __kvm_io_bus_write(vcpu, bus, &range, val);
4789 return r < 0 ? r : 0;
4790}
4791EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4792
4793
4794int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4795 gpa_t addr, int len, const void *val, long cookie)
4796{
4797 struct kvm_io_bus *bus;
4798 struct kvm_io_range range;
4799
4800 range = (struct kvm_io_range) {
4801 .addr = addr,
4802 .len = len,
4803 };
4804
4805 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4806 if (!bus)
4807 return -ENOMEM;
4808
	/* First try the device referenced by cookie. */
4810 if ((cookie >= 0) && (cookie < bus->dev_count) &&
4811 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4812 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4813 val))
4814 return cookie;
4815
	/*
	 * cookie contained garbage; fall back to search and return the
	 * first device that handles this address.
	 */
4820 return __kvm_io_bus_write(vcpu, bus, &range, val);
4821}

static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                             struct kvm_io_range *range, void *val)
{
        int idx;

        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
        if (idx < 0)
                return -EOPNOTSUPP;

        while (idx < bus->dev_count &&
               kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
                if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
                                       range->len, val))
                        return idx;
                idx++;
        }

        return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
{
        struct kvm_io_bus *bus;
        struct kvm_io_range range;
        int r;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
        };

        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
        if (!bus)
                return -ENOMEM;
        r = __kvm_io_bus_read(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev)
{
        int i;
        struct kvm_io_bus *new_bus, *bus;
        struct kvm_io_range range;

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return -ENOMEM;

        /* exclude ioeventfd which is limited by maximum fd */
        if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
                return -ENOSPC;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
                          GFP_KERNEL_ACCOUNT);
        if (!new_bus)
                return -ENOMEM;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
                .dev = dev,
        };

        for (i = 0; i < bus->dev_count; i++)
                if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
                        break;

        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
        new_bus->dev_count++;
        new_bus->range[i] = range;
        memcpy(new_bus->range + i + 1, bus->range + i,
               (bus->dev_count - i) * sizeof(struct kvm_io_range));
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);

        return 0;
}
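
/*
 * Illustrative sketch (not part of the original source): registering a
 * hypothetical MMIO device.  "my_dev", "my_ops" and the guest physical
 * address are assumptions for the example; "my_ops" would be a
 * struct kvm_io_device_ops providing read/write callbacks, and the caller
 * must hold kvm->slots_lock as noted above.
 *
 *	static struct kvm_io_device my_dev;
 *
 *	kvm_iodevice_init(&my_dev, &my_ops);
 *	mutex_lock(&kvm->slots_lock);
 *	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, 0xd0000000, 4,
 *				      &my_dev);
 *	mutex_unlock(&kvm->slots_lock);
 */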

int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev)
{
        int i, j;
        struct kvm_io_bus *new_bus, *bus;

        lockdep_assert_held(&kvm->slots_lock);

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return 0;

        for (i = 0; i < bus->dev_count; i++) {
                if (bus->range[i].dev == dev)
                        break;
        }

        if (i == bus->dev_count)
                return 0;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
        if (new_bus) {
                memcpy(new_bus, bus, struct_size(bus, range, i));
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                       flex_array_size(new_bus, range, new_bus->dev_count - i));
        }

        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);

        /* Destroy the old bus _after_ installing the (null) bus. */
        if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                for (j = 0; j < bus->dev_count; j++) {
                        if (j == i)
                                continue;
                        kvm_iodevice_destructor(bus->range[j].dev);
                }
        }

        kfree(bus);
        return new_bus ? 0 : -ENOMEM;
}
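
/*
 * Illustrative sketch (not part of the original source): tearing down the
 * hypothetical device registered above.  The helper tolerates a device that
 * was never registered and only fails (-ENOMEM) when the shrunk bus cannot
 * be allocated, in which case the whole bus is dropped.
 *
 *	mutex_lock(&kvm->slots_lock);
 *	kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &my_dev);
 *	mutex_unlock(&kvm->slots_lock);
 */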

struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr)
{
        struct kvm_io_bus *bus;
        int dev_idx, srcu_idx;
        struct kvm_io_device *iodev = NULL;

        srcu_idx = srcu_read_lock(&kvm->srcu);

        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
        if (!bus)
                goto out_unlock;

        dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
        if (dev_idx < 0)
                goto out_unlock;

        iodev = bus->range[dev_idx].dev;

out_unlock:
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        return iodev;
}
EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
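
/*
 * Illustrative sketch (not part of the original source): looking up whichever
 * device is registered at a given guest physical address.  The address is an
 * assumption for the example; the returned pointer is only a snapshot taken
 * under SRCU, so the caller must tolerate the device disappearing afterwards.
 *
 *	struct kvm_io_device *dev;
 *
 *	dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, 0xd0000000);
 *	if (!dev)
 *		... nothing is registered at that address ...
 */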

static int kvm_debugfs_open(struct inode *inode, struct file *file,
                            int (*get)(void *, u64 *), int (*set)(void *, u64),
                            const char *fmt)
{
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
                                          inode->i_private;

        /*
         * The debugfs files are a reference to the kvm struct which
         * is still valid when kvm_destroy_vm is called.
         * To avoid the race between open and the removal of the debugfs
         * directory we test against the users count.
         */
        if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
                return -ENOENT;

        if (simple_attr_open(inode, file, get,
                             kvm_stats_debugfs_mode(stat_data->desc) & 0222
                             ? set : NULL,
                             fmt)) {
                kvm_put_kvm(stat_data->kvm);
                return -ENOMEM;
        }

        return 0;
}

static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
                                          inode->i_private;

        simple_attr_release(inode, file);
        kvm_put_kvm(stat_data->kvm);

        return 0;
}

static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
        *val = *(u64 *)((void *)(&kvm->stat) + offset);

        return 0;
}

static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
        *(u64 *)((void *)(&kvm->stat) + offset) = 0;

        return 0;
}

static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
{
        int i;
        struct kvm_vcpu *vcpu;

        *val = 0;

        kvm_for_each_vcpu(i, vcpu, kvm)
                *val += *(u64 *)((void *)(&vcpu->stat) + offset);

        return 0;
}

static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
{
        int i;
        struct kvm_vcpu *vcpu;

        kvm_for_each_vcpu(i, vcpu, kvm)
                *(u64 *)((void *)(&vcpu->stat) + offset) = 0;

        return 0;
}

static int kvm_stat_data_get(void *data, u64 *val)
{
        int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

        switch (stat_data->kind) {
        case KVM_STAT_VM:
                r = kvm_get_stat_per_vm(stat_data->kvm,
                                        stat_data->desc->desc.offset, val);
                break;
        case KVM_STAT_VCPU:
                r = kvm_get_stat_per_vcpu(stat_data->kvm,
                                          stat_data->desc->desc.offset, val);
                break;
        }

        return r;
}

static int kvm_stat_data_clear(void *data, u64 val)
{
        int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

        if (val)
                return -EINVAL;

        switch (stat_data->kind) {
        case KVM_STAT_VM:
                r = kvm_clear_stat_per_vm(stat_data->kvm,
                                          stat_data->desc->desc.offset);
                break;
        case KVM_STAT_VCPU:
                r = kvm_clear_stat_per_vcpu(stat_data->kvm,
                                            stat_data->desc->desc.offset);
                break;
        }

        return r;
}

static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
        __simple_attr_check_format("%llu\n", 0ull);
        return kvm_debugfs_open(inode, file, kvm_stat_data_get,
                                kvm_stat_data_clear, "%llu\n");
}

static const struct file_operations stat_fops_per_vm = {
        .owner = THIS_MODULE,
        .open = kvm_stat_data_open,
        .release = kvm_debugfs_release,
        .read = simple_attr_read,
        .write = simple_attr_write,
        .llseek = no_llseek,
};

static int vm_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        u64 tmp_val;

        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_get_stat_per_vm(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
        return 0;
}

static int vm_stat_clear(void *_offset, u64 val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        if (val)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_clear_stat_per_vm(kvm, offset);
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        u64 tmp_val;

        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
        return 0;
}

static int vcpu_stat_clear(void *_offset, u64 val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        if (val)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
                kvm_clear_stat_per_vcpu(kvm, offset);
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
                        "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");

static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
        struct kobj_uevent_env *env;
        unsigned long long created, active;

        if (!kvm_dev.this_device || !kvm)
                return;

        mutex_lock(&kvm_lock);
        if (type == KVM_EVENT_CREATE_VM) {
                kvm_createvm_count++;
                kvm_active_vms++;
        } else if (type == KVM_EVENT_DESTROY_VM) {
                kvm_active_vms--;
        }
        created = kvm_createvm_count;
        active = kvm_active_vms;
        mutex_unlock(&kvm_lock);

        env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
        if (!env)
                return;

        add_uevent_var(env, "CREATED=%llu", created);
        add_uevent_var(env, "COUNT=%llu", active);

        if (type == KVM_EVENT_CREATE_VM) {
                add_uevent_var(env, "EVENT=create");
                kvm->userspace_pid = task_pid_nr(current);
        } else if (type == KVM_EVENT_DESTROY_VM) {
                add_uevent_var(env, "EVENT=destroy");
        }
        add_uevent_var(env, "PID=%d", kvm->userspace_pid);

        if (kvm->debugfs_dentry) {
                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);

                if (p) {
                        tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
                        if (!IS_ERR(tmp))
                                add_uevent_var(env, "STATS_PATH=%s", tmp);
                        kfree(p);
                }
        }

        env->envp[env->envp_idx++] = NULL;
        kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
        kfree(env);
}
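
/*
 * Illustrative sketch (derived from the code above, not part of the original
 * source): a KVM_EVENT_CREATE_VM uevent carries environment variables of
 * roughly this shape; the concrete values and the STATS_PATH layout shown
 * here are assumptions for the example.
 *
 *	CREATED=42
 *	COUNT=3
 *	EVENT=create
 *	PID=1234
 *	STATS_PATH=/kvm/1234-11
 */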

static void kvm_init_debug(void)
{
        const struct file_operations *fops;
        const struct _kvm_stats_desc *pdesc;
        int i;

        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
                pdesc = &kvm_vm_stats_desc[i];
                if (kvm_stats_debugfs_mode(pdesc) & 0222)
                        fops = &vm_stat_fops;
                else
                        fops = &vm_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm_debugfs_dir,
                                    (void *)(long)pdesc->desc.offset, fops);
        }

        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
                pdesc = &kvm_vcpu_stats_desc[i];
                if (kvm_stats_debugfs_mode(pdesc) & 0222)
                        fops = &vcpu_stat_fops;
                else
                        fops = &vcpu_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm_debugfs_dir,
                                    (void *)(long)pdesc->desc.offset, fops);
        }
}
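
/*
 * Illustrative sketch (not part of the original source): the loops above
 * create one file per stat descriptor under /sys/kernel/debug/kvm, so a
 * writable, globally aggregated stat can be read and reset from userspace
 * roughly like this (the stat name is an assumption for the example):
 *
 *	$ cat /sys/kernel/debug/kvm/remote_tlb_flush
 *	1337
 *	# echo 0 > /sys/kernel/debug/kvm/remote_tlb_flush
 */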

static int kvm_suspend(void)
{
        if (kvm_usage_count)
                hardware_disable_nolock(NULL);
        return 0;
}

static void kvm_resume(void)
{
        if (kvm_usage_count) {
#ifdef CONFIG_LOCKDEP
                WARN_ON(lockdep_is_held(&kvm_count_lock));
#endif
                hardware_enable_nolock(NULL);
        }
}

static struct syscore_ops kvm_syscore_ops = {
        .suspend = kvm_suspend,
        .resume = kvm_resume,
};

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
        return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        WRITE_ONCE(vcpu->preempted, false);
        WRITE_ONCE(vcpu->ready, false);

        __this_cpu_write(kvm_running_vcpu, vcpu);
        kvm_arch_sched_in(vcpu, cpu);
        kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        if (current->on_rq) {
                WRITE_ONCE(vcpu->preempted, true);
                WRITE_ONCE(vcpu->ready, true);
        }
        kvm_arch_vcpu_put(vcpu);
        __this_cpu_write(kvm_running_vcpu, NULL);
}

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * We can disable preemption locally around accessing the per-CPU variable,
 * and use the resolved vcpu pointer after enabling preemption again,
 * because even if the current thread is migrated to another CPU, reading
 * the per-CPU value later will give us the same value as we update the
 * per-CPU variable in the preempt notifier handlers.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
        struct kvm_vcpu *vcpu;

        preempt_disable();
        vcpu = __this_cpu_read(kvm_running_vcpu);
        preempt_enable();

        return vcpu;
}
EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
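
/*
 * Illustrative sketch (not part of the original source): code that can be
 * reached both from a vcpu thread and from other contexts can use the helper
 * to tell the two apart.  The function name is an assumption for the example.
 *
 *	static void hypothetical_handler(struct kvm *kvm)
 *	{
 *		struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 *
 *		if (vcpu && vcpu->kvm == kvm)
 *			... fast path: called from this VM's vcpu thread ...
 *		else
 *			... slow path: called from some other context ...
 *	}
 */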

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
        return &kvm_running_vcpu;
}

struct kvm_cpu_compat_check {
        void *opaque;
        int *ret;
};

static void check_processor_compat(void *data)
{
        struct kvm_cpu_compat_check *c = data;

        *c->ret = kvm_arch_check_processor_compat(c->opaque);
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
             struct module *module)
{
        struct kvm_cpu_compat_check c;
        int r;
        int cpu;

        r = kvm_arch_init(opaque);
        if (r)
                goto out_fail;

        /*
         * kvm_arch_init makes sure there's at most one caller
         * for architectures that support multiple implementations,
         * like intel and amd on x86.
         * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
         * conflicts in case kvm is already setup for another implementation.
         */
        r = kvm_irqfd_init();
        if (r)
                goto out_irqfd;

        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
                goto out_free_0;
        }

        r = kvm_arch_hardware_setup(opaque);
        if (r < 0)
                goto out_free_1;

        c.ret = &r;
        c.opaque = opaque;
        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu, check_processor_compat, &c, 1);
                if (r < 0)
                        goto out_free_2;
        }

        r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
                                      kvm_starting_cpu, kvm_dying_cpu);
        if (r)
                goto out_free_2;
        register_reboot_notifier(&kvm_reboot_notifier);

        /* A kmem cache lets us meet the alignment requirements of fx_save. */
        if (!vcpu_align)
                vcpu_align = __alignof__(struct kvm_vcpu);
        kvm_vcpu_cache =
                kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
                                           SLAB_ACCOUNT,
                                           offsetof(struct kvm_vcpu, arch),
                                           offsetofend(struct kvm_vcpu, stats_id)
                                           - offsetof(struct kvm_vcpu, arch),
                                           NULL);
        if (!kvm_vcpu_cache) {
                r = -ENOMEM;
                goto out_free_3;
        }

        r = kvm_async_pf_init();
        if (r)
                goto out_free;

        kvm_chardev_ops.owner = module;
        kvm_vm_fops.owner = module;
        kvm_vcpu_fops.owner = module;

        r = misc_register(&kvm_dev);
        if (r) {
                pr_err("kvm: misc device register failed\n");
                goto out_unreg;
        }

        register_syscore_ops(&kvm_syscore_ops);

        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;

        kvm_init_debug();

        r = kvm_vfio_ops_init();
        WARN_ON(r);

        return 0;

out_unreg:
        kvm_async_pf_deinit();
out_free:
        kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
        kvm_arch_hardware_unsetup();
out_free_1:
        free_cpumask_var(cpus_hardware_enabled);
out_free_0:
        kvm_irqfd_exit();
out_irqfd:
        kvm_arch_exit();
out_fail:
        return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
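
/*
 * Illustrative sketch (not part of the original source): an architecture
 * module typically calls kvm_init() from its module_init hook, passing its
 * opaque init ops and per-vcpu structure size.  "my_arch_ops" and
 * "struct my_arch_vcpu" are assumptions for the example.
 *
 *	static int __init my_arch_kvm_init(void)
 *	{
 *		return kvm_init(&my_arch_ops, sizeof(struct my_arch_vcpu),
 *				__alignof__(struct my_arch_vcpu), THIS_MODULE);
 *	}
 *	module_init(my_arch_kvm_init);
 */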

void kvm_exit(void)
{
        debugfs_remove_recursive(kvm_debugfs_dir);
        misc_deregister(&kvm_dev);
        kmem_cache_destroy(kvm_vcpu_cache);
        kvm_async_pf_deinit();
        unregister_syscore_ops(&kvm_syscore_ops);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
        on_each_cpu(hardware_disable_nolock, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        kvm_irqfd_exit();
        free_cpumask_var(cpus_hardware_enabled);
        kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

struct kvm_vm_worker_thread_context {
        struct kvm *kvm;
        struct task_struct *parent;
        struct completion init_done;
        kvm_vm_thread_fn_t thread_fn;
        uintptr_t data;
        int err;
};

static int kvm_vm_worker_thread(void *context)
{
        /*
         * The init_context is allocated on the stack of the parent thread, so
         * we have to locally copy anything that is needed beyond
         * initialization.
         */
        struct kvm_vm_worker_thread_context *init_context = context;
        struct kvm *kvm = init_context->kvm;
        kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
        uintptr_t data = init_context->data;
        int err;

        err = kthread_park(current);
        /* kthread_park(current) is never supposed to return an error */
        WARN_ON(err != 0);
        if (err)
                goto init_complete;

        err = cgroup_attach_task_all(init_context->parent, current);
        if (err) {
                kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
                        __func__, err);
                goto init_complete;
        }

        set_user_nice(current, task_nice(init_context->parent));

init_complete:
        init_context->err = err;
        complete(&init_context->init_done);
        init_context = NULL;

        if (err)
                return err;

        /* Wait to be woken up by the spawner before proceeding. */
        kthread_parkme();

        if (!kthread_should_stop())
                err = thread_fn(kvm, data);

        return err;
}

int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
                                uintptr_t data, const char *name,
                                struct task_struct **thread_ptr)
{
        struct kvm_vm_worker_thread_context init_context = {};
        struct task_struct *thread;

        *thread_ptr = NULL;
        init_context.kvm = kvm;
        init_context.parent = current;
        init_context.thread_fn = thread_fn;
        init_context.data = data;
        init_completion(&init_context.init_done);

        thread = kthread_run(kvm_vm_worker_thread, &init_context,
                             "%s-%d", name, task_pid_nr(current));
        if (IS_ERR(thread))
                return PTR_ERR(thread);

        /* kthread_run is never supposed to return NULL */
        WARN_ON(thread == NULL);

        wait_for_completion(&init_context.init_done);

        if (!init_context.err)
                *thread_ptr = thread;

        return init_context.err;
}
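
/*
 * Illustrative sketch (not part of the original source): creating and later
 * stopping a per-VM housekeeping thread.  "my_recovery_fn" is an assumption
 * for the example; the new task starts parked, so the caller wakes it with
 * kthread_unpark() and eventually tears it down with kthread_stop().
 *
 *	struct task_struct *worker;
 *	int err;
 *
 *	err = kvm_vm_create_worker_thread(kvm, my_recovery_fn, 0,
 *					  "kvm-worker", &worker);
 *	if (!err) {
 *		kthread_unpark(worker);
 *		...
 *		kthread_stop(worker);
 *	}
 */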