/*
 * Kernel-based Virtual Machine (KVM) driver - core, architecture-neutral
 * module.
 */
#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "mmu_lock.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should tune this according to their halt-polling latency. */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Factor by which a vCPU's halt_poll_ns grows after a successful poll. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* Value that halt_poll_ns starts growing from. */
unsigned int halt_poll_ns_grow_start = 10000;
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Divisor applied when shrinking halt_poll_ns; 0 (the default) resets it. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static int kvm_debugfs_num_entries;
static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
                                  unsigned long arg);
#define KVM_COMPAT(c)   .compat_ioctl = (c)
#else
/*
 * For architectures that do not implement a compat infrastructure, fail both
 * attempts to open /dev/kvm from a compat task and compat ioctls on a KVM fd
 * that was opened by a 64-bit task.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
        return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)   .compat_ioctl = kvm_no_compat_ioctl, \
                        .open = kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                                                   unsigned long start, unsigned long end)
{
}

bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
        /*
         * The ZONE_DEVICE metadata is only meaningful if the page is valid
         * and has an elevated refcount; a zero refcount means the page is
         * being torn down and must not be treated as a device page.
         */
        if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
                return false;

        return is_zone_device_page(pfn_to_page(pfn));
}

bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
        /*
         * The zero page and ZONE_DEVICE pages have PG_reserved set but are
         * refcounted like normal pages, so do not report them as reserved.
         */
        if (pfn_valid(pfn))
                return PageReserved(pfn_to_page(pfn)) &&
                       !is_zero_pfn(pfn) &&
                       !kvm_is_zone_device_pfn(pfn);

        return true;
}

bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
{
        struct page *page = pfn_to_page(pfn);

        if (!PageTransCompoundMap(page))
                return false;

        return is_transparent_hugepage(compound_head(page));
}

/*
 * Switches to the specified vcpu, until a matching vcpu_put().
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
        int cpu = get_cpu();

        __this_cpu_write(kvm_running_vcpu, vcpu);
        preempt_notifier_register(&vcpu->preempt_notifier);
        kvm_arch_vcpu_load(vcpu, cpu);
        put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
        preempt_disable();
        kvm_arch_vcpu_put(vcpu);
        preempt_notifier_unregister(&vcpu->preempt_notifier);
        __this_cpu_write(kvm_running_vcpu, NULL);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
        int mode = kvm_vcpu_exiting_guest_mode(vcpu);

        /*
         * Requests flagged KVM_REQUEST_WAIT must be acknowledged, so an IPI
         * is needed unless the vCPU is fully outside guest mode.
         */
        if (req & KVM_REQUEST_WAIT)
                return mode != OUTSIDE_GUEST_MODE;

        /* Otherwise an IPI is only needed to kick a vCPU that is running. */
        return mode == IN_GUEST_MODE;
}

static void ack_flush(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
{
        if (unlikely(!cpus))
                cpus = cpu_online_mask;

        if (cpumask_empty(cpus))
                return false;

        smp_call_function_many(cpus, ack_flush, NULL, wait);
        return true;
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
                                 struct kvm_vcpu *except,
                                 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
{
        int i, cpu, me;
        struct kvm_vcpu *vcpu;
        bool called;

        me = get_cpu();

        kvm_for_each_vcpu(i, vcpu, kvm) {
                if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
                    vcpu == except)
                        continue;

                kvm_make_request(req, vcpu);
                cpu = vcpu->cpu;

                if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
                        continue;

                if (tmp != NULL && cpu != -1 && cpu != me &&
                    kvm_request_needs_ipi(vcpu, req))
                        __cpumask_set_cpu(cpu, tmp);
        }

        called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
        put_cpu();

        return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
                                      struct kvm_vcpu *except)
{
        cpumask_var_t cpus;
        bool called;

        zalloc_cpumask_var(&cpus, GFP_ATOMIC);

        called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);

        free_cpumask_var(cpus);
        return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
        return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
        /*
         * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
         * kvm_make_all_cpus_request(), so that a racing update is not lost
         * by the cmpxchg() below.
         */
        long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);

        /*
         * Count the flush if the architecture hook handled it or if the
         * KVM_REQ_TLB_FLUSH request had to be sent to the vCPUs.
         */
        if (!kvm_arch_flush_remote_tlb(kvm)
            || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                ++kvm->stat.remote_tlb_flush;
        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

void kvm_reload_remote_mmus(struct kvm *kvm)
{
        kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

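/*
 * Simple object cache used by the arch MMU code: it is topped up with
 * GFP_KERNEL_ACCOUNT allocations in a sleepable context so that objects can
 * later be consumed while holding mmu_lock via kvm_mmu_memory_cache_alloc().
 */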
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
                                               gfp_t gfp_flags)
{
        gfp_flags |= mc->gfp_zero;

        if (mc->kmem_cache)
                return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
        else
                return (void *)__get_free_page(gfp_flags);
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
        void *obj;

        if (mc->nobjs >= min)
                return 0;
        while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
                obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
                if (!obj)
                        return mc->nobjs >= min ? 0 : -ENOMEM;
                mc->objects[mc->nobjs++] = obj;
        }
        return 0;
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
        return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
        while (mc->nobjs) {
                if (mc->kmem_cache)
                        kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
                else
                        free_page((unsigned long)mc->objects[--mc->nobjs]);
        }
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
        void *p;

        if (WARN_ON(!mc->nobjs))
                p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
        else
                p = mc->objects[--mc->nobjs];
        BUG_ON(!p);
        return p;
}
#endif

static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
        mutex_init(&vcpu->mutex);
        vcpu->cpu = -1;
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
        rcuwait_init(&vcpu->wait);
        kvm_async_pf_vcpu_init(vcpu);

        vcpu->pre_pcpu = -1;
        INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

        kvm_vcpu_set_in_spin_loop(vcpu, false);
        kvm_vcpu_set_dy_eligible(vcpu, false);
        vcpu->preempted = false;
        vcpu->ready = false;
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
}

void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        kvm_dirty_ring_free(&vcpu->dirty_ring);
        kvm_arch_vcpu_destroy(vcpu);

        /*
         * No RCU read-side critical section is needed: the vCPU is being
         * torn down, so nothing can concurrently update vcpu->pid anymore.
         */
        put_pid(rcu_dereference_protected(vcpu->pid, 1));

        free_page((unsigned long)vcpu->run);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
        return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
                                              struct mm_struct *mm,
                                              unsigned long start, unsigned long end)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int idx;

        idx = srcu_read_lock(&kvm->srcu);
        kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
        srcu_read_unlock(&kvm->srcu, idx);
}

typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
                             unsigned long end);

struct kvm_hva_range {
        unsigned long start;
        unsigned long end;
        pte_t pte;
        hva_handler_t handler;
        on_lock_fn_t on_lock;
        bool flush_on_ret;
        bool may_block;
};

/*
 * Use a dedicated stub instead of NULL to mean "no handler/on_lock callback",
 * so that the IS_KVM_NULL_FN() comparison can be resolved at compile time.
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

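/*
 * Walk all memslots that intersect the given hva range and invoke the arch
 * handler on the corresponding gfn range of each slot, taking mmu_lock only
 * once a memslot is affected (or immediately if an on_lock callback is set).
 */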
static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
                                                  const struct kvm_hva_range *range)
{
        bool ret = false, locked = false;
        struct kvm_gfn_range gfn_range;
        struct kvm_memory_slot *slot;
        struct kvm_memslots *slots;
        int i, idx;

        /* A null handler is only allowed if an on_lock() callback is given. */
        if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
                         IS_KVM_NULL_FN(range->handler)))
                return 0;

        idx = srcu_read_lock(&kvm->srcu);

        /* The on_lock() callback, if any, runs with mmu_lock held. */
        if (!IS_KVM_NULL_FN(range->on_lock)) {
                locked = true;
                KVM_MMU_LOCK(kvm);

                range->on_lock(kvm, range->start, range->end);

                if (IS_KVM_NULL_FN(range->handler))
                        goto out_unlock;
        }

        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                slots = __kvm_memslots(kvm, i);
                kvm_for_each_memslot(slot, slots) {
                        unsigned long hva_start, hva_end;

                        hva_start = max(range->start, slot->userspace_addr);
                        hva_end = min(range->end, slot->userspace_addr +
                                                  (slot->npages << PAGE_SHIFT));
                        if (hva_start >= hva_end)
                                continue;

                        /* Pass the pte and blocking policy through to the handler. */
                        gfn_range.pte = range->pte;
                        gfn_range.may_block = range->may_block;

                        /*
                         * Convert the intersecting HVA range into the gfn
                         * range of pages it touches within this memslot.
                         */
                        gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
                        gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
                        gfn_range.slot = slot;

                        /* Take mmu_lock lazily, only once a memslot intersects. */
                        if (!locked) {
                                locked = true;
                                KVM_MMU_LOCK(kvm);
                        }
                        ret |= range->handler(kvm, &gfn_range);
                }
        }

        if (range->flush_on_ret && (ret || kvm->tlbs_dirty))
                kvm_flush_remote_tlbs(kvm);

out_unlock:
        if (locked)
                KVM_MMU_UNLOCK(kvm);

        srcu_read_unlock(&kvm->srcu, idx);

        /* The mmu_notifier callbacks expect an int, not a bool. */
        return (int)ret;
}

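/*
 * Convenience wrappers that build a kvm_hva_range for a plain hva interval,
 * with and without a TLB flush when the handler reports a change.
 */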
559static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
560 unsigned long start,
561 unsigned long end,
562 pte_t pte,
563 hva_handler_t handler)
564{
565 struct kvm *kvm = mmu_notifier_to_kvm(mn);
566 const struct kvm_hva_range range = {
567 .start = start,
568 .end = end,
569 .pte = pte,
570 .handler = handler,
571 .on_lock = (void *)kvm_null_fn,
572 .flush_on_ret = true,
573 .may_block = false,
574 };
575
576 return __kvm_handle_hva_range(kvm, &range);
577}
578
579static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
580 unsigned long start,
581 unsigned long end,
582 hva_handler_t handler)
583{
584 struct kvm *kvm = mmu_notifier_to_kvm(mn);
585 const struct kvm_hva_range range = {
586 .start = start,
587 .end = end,
588 .pte = __pte(0),
589 .handler = handler,
590 .on_lock = (void *)kvm_null_fn,
591 .flush_on_ret = false,
592 .may_block = false,
593 };
594
595 return __kvm_handle_hva_range(kvm, &range);
596}
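
/*
 * The change_pte() notifier remaps a single guest page.  It is only invoked
 * while an invalidation of the same range is in progress, hence the check on
 * mmu_notifier_count below.
 */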
597static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
598 struct mm_struct *mm,
599 unsigned long address,
600 pte_t pte)
601{
602 struct kvm *kvm = mmu_notifier_to_kvm(mn);
603
604 trace_kvm_set_spte_hva(address);
605
606
607
608
609
610
611 WARN_ON_ONCE(!kvm->mmu_notifier_count);
612
613 kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
614}
615
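/*
 * Bump mmu_notifier_count and widen the recorded in-progress range so that
 * code faulting in guest pages can detect a racing invalidation and retry.
 */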
616static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
617 unsigned long end)
618{
619
620
621
622
623
624 kvm->mmu_notifier_count++;
625 if (likely(kvm->mmu_notifier_count == 1)) {
626 kvm->mmu_notifier_range_start = start;
627 kvm->mmu_notifier_range_end = end;
628 } else {
629
630
631
632
633
634
635
636
637
638 kvm->mmu_notifier_range_start =
639 min(kvm->mmu_notifier_range_start, start);
640 kvm->mmu_notifier_range_end =
641 max(kvm->mmu_notifier_range_end, end);
642 }
643}
644
645static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
646 const struct mmu_notifier_range *range)
647{
648 struct kvm *kvm = mmu_notifier_to_kvm(mn);
649 const struct kvm_hva_range hva_range = {
650 .start = range->start,
651 .end = range->end,
652 .pte = __pte(0),
653 .handler = kvm_unmap_gfn_range,
654 .on_lock = kvm_inc_notifier_count,
655 .flush_on_ret = true,
656 .may_block = mmu_notifier_range_blockable(range),
657 };
658
659 trace_kvm_unmap_hva_range(range->start, range->end);
660
661 __kvm_handle_hva_range(kvm, &hva_range);
662
663 return 0;
664}
665
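/*
 * Bump mmu_notifier_seq before decrementing mmu_notifier_count so that page
 * faults that raced with the invalidation are forced to retry.
 */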
666static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
667 unsigned long end)
668{
669
670
671
672
673
674 kvm->mmu_notifier_seq++;
675 smp_wmb();
676
677
678
679
680
681 kvm->mmu_notifier_count--;
682}
683
684static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
685 const struct mmu_notifier_range *range)
686{
687 struct kvm *kvm = mmu_notifier_to_kvm(mn);
688 const struct kvm_hva_range hva_range = {
689 .start = range->start,
690 .end = range->end,
691 .pte = __pte(0),
692 .handler = (void *)kvm_null_fn,
693 .on_lock = kvm_dec_notifier_count,
694 .flush_on_ret = false,
695 .may_block = mmu_notifier_range_blockable(range),
696 };
697
698 __kvm_handle_hva_range(kvm, &hva_range);
699
700 BUG_ON(kvm->mmu_notifier_count < 0);
701}
702
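/*
 * Aging notifiers: clear_flush_young flushes TLBs for pages that were young,
 * while clear_young and test_young intentionally skip the flush.
 */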
703static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
704 struct mm_struct *mm,
705 unsigned long start,
706 unsigned long end)
707{
708 trace_kvm_age_hva(start, end);
709
710 return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
711}
712
713static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
714 struct mm_struct *mm,
715 unsigned long start,
716 unsigned long end)
717{
718 trace_kvm_age_hva(start, end);
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733 return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
734}
735
736static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
737 struct mm_struct *mm,
738 unsigned long address)
739{
740 trace_kvm_test_age_hva(address);
741
742 return kvm_handle_hva_range_no_flush(mn, address, address + 1,
743 kvm_test_age_gfn);
744}
745
746static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
747 struct mm_struct *mm)
748{
749 struct kvm *kvm = mmu_notifier_to_kvm(mn);
750 int idx;
751
752 idx = srcu_read_lock(&kvm->srcu);
753 kvm_arch_flush_shadow_all(kvm);
754 srcu_read_unlock(&kvm->srcu, idx);
755}
756
757static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
758 .invalidate_range = kvm_mmu_notifier_invalidate_range,
759 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
760 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
761 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
762 .clear_young = kvm_mmu_notifier_clear_young,
763 .test_young = kvm_mmu_notifier_test_young,
764 .change_pte = kvm_mmu_notifier_change_pte,
765 .release = kvm_mmu_notifier_release,
766};
767
768static int kvm_init_mmu_notifier(struct kvm *kvm)
769{
770 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
771 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
772}
773
774#else
775
776static int kvm_init_mmu_notifier(struct kvm *kvm)
777{
778 return 0;
779}
780
781#endif
782
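/* Allocate an empty memslots structure with all slot indices marked unused. */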
783static struct kvm_memslots *kvm_alloc_memslots(void)
784{
785 int i;
786 struct kvm_memslots *slots;
787
788 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
789 if (!slots)
790 return NULL;
791
792 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
793 slots->id_to_index[i] = -1;
794
795 return slots;
796}
797
798static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
799{
800 if (!memslot->dirty_bitmap)
801 return;
802
803 kvfree(memslot->dirty_bitmap);
804 memslot->dirty_bitmap = NULL;
805}
806
807static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
808{
809 kvm_destroy_dirty_bitmap(slot);
810
811 kvm_arch_free_memslot(kvm, slot);
812
813 slot->flags = 0;
814 slot->npages = 0;
815}
816
817static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
818{
819 struct kvm_memory_slot *memslot;
820
821 if (!slots)
822 return;
823
824 kvm_for_each_memslot(memslot, slots)
825 kvm_free_memslot(kvm, memslot);
826
827 kvfree(slots);
828}
829
830static void kvm_destroy_vm_debugfs(struct kvm *kvm)
831{
832 int i;
833
834 if (!kvm->debugfs_dentry)
835 return;
836
837 debugfs_remove_recursive(kvm->debugfs_dentry);
838
839 if (kvm->debugfs_stat_data) {
840 for (i = 0; i < kvm_debugfs_num_entries; i++)
841 kfree(kvm->debugfs_stat_data[i]);
842 kfree(kvm->debugfs_stat_data);
843 }
844}
845
846static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
847{
848 char dir_name[ITOA_MAX_LEN * 2];
849 struct kvm_stat_data *stat_data;
850 struct kvm_stats_debugfs_item *p;
851
852 if (!debugfs_initialized())
853 return 0;
854
855 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
856 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
857
858 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
859 sizeof(*kvm->debugfs_stat_data),
860 GFP_KERNEL_ACCOUNT);
861 if (!kvm->debugfs_stat_data)
862 return -ENOMEM;
863
864 for (p = debugfs_entries; p->name; p++) {
865 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
866 if (!stat_data)
867 return -ENOMEM;
868
869 stat_data->kvm = kvm;
870 stat_data->dbgfs_item = p;
871 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
872 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
873 kvm->debugfs_dentry, stat_data,
874 &stat_fops_per_vm);
875 }
876 return 0;
877}
878
/*
 * Called after the VM is otherwise initialized, but only just before adding
 * it to the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
        return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

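/*
 * Allocate and initialize a new VM: memslots for each address space, the I/O
 * buses, arch state, the MMU notifier, and a reference on current->mm.  On
 * success the VM is on vm_list with a single user reference held.
 */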
896static struct kvm *kvm_create_vm(unsigned long type)
897{
898 struct kvm *kvm = kvm_arch_alloc_vm();
899 int r = -ENOMEM;
900 int i;
901
902 if (!kvm)
903 return ERR_PTR(-ENOMEM);
904
905 KVM_MMU_LOCK_INIT(kvm);
906 mmgrab(current->mm);
907 kvm->mm = current->mm;
908 kvm_eventfd_init(kvm);
909 mutex_init(&kvm->lock);
910 mutex_init(&kvm->irq_lock);
911 mutex_init(&kvm->slots_lock);
912 INIT_LIST_HEAD(&kvm->devices);
913
914 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
915
916 if (init_srcu_struct(&kvm->srcu))
917 goto out_err_no_srcu;
918 if (init_srcu_struct(&kvm->irq_srcu))
919 goto out_err_no_irq_srcu;
920
921 refcount_set(&kvm->users_count, 1);
922 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
923 struct kvm_memslots *slots = kvm_alloc_memslots();
924
925 if (!slots)
926 goto out_err_no_arch_destroy_vm;
927
928 slots->generation = i;
929 rcu_assign_pointer(kvm->memslots[i], slots);
930 }
931
932 for (i = 0; i < KVM_NR_BUSES; i++) {
933 rcu_assign_pointer(kvm->buses[i],
934 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
935 if (!kvm->buses[i])
936 goto out_err_no_arch_destroy_vm;
937 }
938
939 kvm->max_halt_poll_ns = halt_poll_ns;
940
941 r = kvm_arch_init_vm(kvm, type);
942 if (r)
943 goto out_err_no_arch_destroy_vm;
944
945 r = hardware_enable_all();
946 if (r)
947 goto out_err_no_disable;
948
949#ifdef CONFIG_HAVE_KVM_IRQFD
950 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
951#endif
952
953 r = kvm_init_mmu_notifier(kvm);
954 if (r)
955 goto out_err_no_mmu_notifier;
956
957 r = kvm_arch_post_init_vm(kvm);
958 if (r)
959 goto out_err;
960
961 mutex_lock(&kvm_lock);
962 list_add(&kvm->vm_list, &vm_list);
963 mutex_unlock(&kvm_lock);
964
965 preempt_notifier_inc();
966
967 return kvm;
968
969out_err:
970#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
971 if (kvm->mmu_notifier.ops)
972 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
973#endif
974out_err_no_mmu_notifier:
975 hardware_disable_all();
976out_err_no_disable:
977 kvm_arch_destroy_vm(kvm);
978out_err_no_arch_destroy_vm:
979 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
980 for (i = 0; i < KVM_NR_BUSES; i++)
981 kfree(kvm_get_bus(kvm, i));
982 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
983 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
984 cleanup_srcu_struct(&kvm->irq_srcu);
985out_err_no_irq_srcu:
986 cleanup_srcu_struct(&kvm->srcu);
987out_err_no_srcu:
988 kvm_arch_free_vm(kvm);
989 mmdrop(current->mm);
990 return ERR_PTR(r);
991}
992
993static void kvm_destroy_devices(struct kvm *kvm)
994{
995 struct kvm_device *dev, *tmp;
996
997
998
999
1000
1001
1002 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1003 list_del(&dev->vm_node);
1004 dev->ops->destroy(dev);
1005 }
1006}
1007
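/*
 * Tear down a VM once its last reference is dropped: remove it from vm_list,
 * destroy buses, devices, memslots and arch state, then release the mm.
 */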
1008static void kvm_destroy_vm(struct kvm *kvm)
1009{
1010 int i;
1011 struct mm_struct *mm = kvm->mm;
1012
1013 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1014 kvm_destroy_vm_debugfs(kvm);
1015 kvm_arch_sync_events(kvm);
1016 mutex_lock(&kvm_lock);
1017 list_del(&kvm->vm_list);
1018 mutex_unlock(&kvm_lock);
1019 kvm_arch_pre_destroy_vm(kvm);
1020
1021 kvm_free_irq_routing(kvm);
1022 for (i = 0; i < KVM_NR_BUSES; i++) {
1023 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1024
1025 if (bus)
1026 kvm_io_bus_destroy(bus);
1027 kvm->buses[i] = NULL;
1028 }
1029 kvm_coalesced_mmio_free(kvm);
1030#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1031 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1032#else
1033 kvm_arch_flush_shadow_all(kvm);
1034#endif
1035 kvm_arch_destroy_vm(kvm);
1036 kvm_destroy_devices(kvm);
1037 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1038 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1039 cleanup_srcu_struct(&kvm->irq_srcu);
1040 cleanup_srcu_struct(&kvm->srcu);
1041 kvm_arch_free_vm(kvm);
1042 preempt_notifier_dec();
1043 hardware_disable_all();
1044 mmdrop(mm);
1045}
1046
1047void kvm_get_kvm(struct kvm *kvm)
1048{
1049 refcount_inc(&kvm->users_count);
1050}
1051EXPORT_SYMBOL_GPL(kvm_get_kvm);
1052
1053void kvm_put_kvm(struct kvm *kvm)
1054{
1055 if (refcount_dec_and_test(&kvm->users_count))
1056 kvm_destroy_vm(kvm);
1057}
1058EXPORT_SYMBOL_GPL(kvm_put_kvm);
1059
1060
1061
1062
1063
1064
1065
1066
1067void kvm_put_kvm_no_destroy(struct kvm *kvm)
1068{
1069 WARN_ON(refcount_dec_and_test(&kvm->users_count));
1070}
1071EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1072
1073static int kvm_vm_release(struct inode *inode, struct file *filp)
1074{
1075 struct kvm *kvm = filp->private_data;
1076
1077 kvm_irqfd_release(kvm);
1078
1079 kvm_put_kvm(kvm);
1080 return 0;
1081}
1082

/*
 * The allocation is twice the size of the actual dirty bitmap; the second
 * half is used as a scratch buffer when harvesting the dirty log (see the
 * kvm_second_dirty_bitmap() users below).
 */
1087static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1088{
1089 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1090
1091 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1092 if (!memslot->dirty_bitmap)
1093 return -ENOMEM;
1094
1095 return 0;
1096}
1097

/*
 * Delete a memslot by decrementing the number of used slots and shifting all
 * other entries in the array forward one spot.
 */
1102static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1103 struct kvm_memory_slot *memslot)
1104{
1105 struct kvm_memory_slot *mslots = slots->memslots;
1106 int i;
1107
1108 if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1109 return;
1110
1111 slots->used_slots--;
1112
1113 if (atomic_read(&slots->lru_slot) >= slots->used_slots)
1114 atomic_set(&slots->lru_slot, 0);
1115
1116 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1117 mslots[i] = mslots[i + 1];
1118 slots->id_to_index[mslots[i].id] = i;
1119 }
1120 mslots[i] = *memslot;
1121 slots->id_to_index[memslot->id] = -1;
1122}
1123

/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslot array.
 */
1128static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1129{
1130 return slots->used_slots++;
1131}
1132

/*
 * Move a changed memslot backwards in the array by shifting existing slots
 * with a higher GFN toward the front of the array.  Only the changed slot's
 * new index is tracked here; the slot itself is written by the caller.
 * Returns the changed memslot's current index into the memslots array.
 */
1140static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1141 struct kvm_memory_slot *memslot)
1142{
1143 struct kvm_memory_slot *mslots = slots->memslots;
1144 int i;
1145
1146 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1147 WARN_ON_ONCE(!slots->used_slots))
1148 return -1;
1149
1150
1151
1152
1153
1154
1155 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1156 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1157 break;
1158
1159 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1160
1161
1162 mslots[i] = mslots[i + 1];
1163 slots->id_to_index[mslots[i].id] = i;
1164 }
1165 return i;
1166}
1167

/*
 * Move a changed memslot forwards in the array by shifting existing slots
 * with a lower GFN toward the back of the array.  As above, only the new
 * index is tracked; returns the changed memslot's final index into the array.
 */
1175static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1176 struct kvm_memory_slot *memslot,
1177 int start)
1178{
1179 struct kvm_memory_slot *mslots = slots->memslots;
1180 int i;
1181
1182 for (i = start; i > 0; i--) {
1183 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1184 break;
1185
1186 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1187
1188
1189 mslots[i] = mslots[i - 1];
1190 slots->id_to_index[mslots[i].id] = i;
1191 }
1192 return i;
1193}
1194

/*
 * Re-sort the memslots based on their GFN to account for an added, deleted,
 * or moved memslot.  The array is kept sorted by descending base GFN so that
 * slot lookups can use a binary search.
 *
 * On a DELETE, the memslot is removed and the remaining slots are shifted
 * forward.  On a CREATE, the new slot is appended and then bubbled into its
 * sorted position.  On a MOVE or flags-only change, the existing slot is
 * shifted backward and/or forward as needed and then written in place.
 */
1236static void update_memslots(struct kvm_memslots *slots,
1237 struct kvm_memory_slot *memslot,
1238 enum kvm_mr_change change)
1239{
1240 int i;
1241
1242 if (change == KVM_MR_DELETE) {
1243 kvm_memslot_delete(slots, memslot);
1244 } else {
1245 if (change == KVM_MR_CREATE)
1246 i = kvm_memslot_insert_back(slots);
1247 else
1248 i = kvm_memslot_move_backward(slots, memslot);
1249 i = kvm_memslot_move_forward(slots, memslot, i);
1250
1251
1252
1253
1254
1255 slots->memslots[i] = *memslot;
1256 slots->id_to_index[memslot->id] = i;
1257 }
1258}
1259
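/* Reject userspace flags that this build of KVM does not support. */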
1260static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1261{
1262 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1263
1264#ifdef __KVM_HAVE_READONLY_MEM
1265 valid_flags |= KVM_MEM_READONLY;
1266#endif
1267
1268 if (mem->flags & ~valid_flags)
1269 return -EINVAL;
1270
1271 return 0;
1272}
1273
1274static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1275 int as_id, struct kvm_memslots *slots)
1276{
1277 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1278 u64 gen = old_memslots->generation;
1279
1280 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1281 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1282
1283 rcu_assign_pointer(kvm->memslots[as_id], slots);
1284 synchronize_srcu_expedited(&kvm->srcu);
1285
1286
1287
1288
1289
1290
1291
1292 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1293
1294
1295
1296
1297
1298
1299
1300
1301 gen += KVM_ADDRESS_SPACE_NUM;
1302
1303 kvm_arch_memslots_updated(kvm, gen);
1304
1305 slots->generation = gen;
1306
1307 return old_memslots;
1308}
1309
1310
1311
1312
1313
1314
1315static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1316 enum kvm_mr_change change)
1317{
1318 struct kvm_memslots *slots;
1319 size_t old_size, new_size;
1320
1321 old_size = sizeof(struct kvm_memslots) +
1322 (sizeof(struct kvm_memory_slot) * old->used_slots);
1323
1324 if (change == KVM_MR_CREATE)
1325 new_size = old_size + sizeof(struct kvm_memory_slot);
1326 else
1327 new_size = old_size;
1328
1329 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1330 if (likely(slots))
1331 memcpy(slots, old, old_size);
1332
1333 return slots;
1334}
1335
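/*
 * Install an updated memslot: on DELETE/MOVE the old slot is first marked
 * KVM_MEMSLOT_INVALID and its shadow mappings flushed, then the new memslots
 * array is published and the architecture is given a chance to commit.
 */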
1336static int kvm_set_memslot(struct kvm *kvm,
1337 const struct kvm_userspace_memory_region *mem,
1338 struct kvm_memory_slot *old,
1339 struct kvm_memory_slot *new, int as_id,
1340 enum kvm_mr_change change)
1341{
1342 struct kvm_memory_slot *slot;
1343 struct kvm_memslots *slots;
1344 int r;
1345
1346 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1347 if (!slots)
1348 return -ENOMEM;
1349
1350 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1351
1352
1353
1354
1355 slot = id_to_memslot(slots, old->id);
1356 slot->flags |= KVM_MEMSLOT_INVALID;
1357
1358
1359
1360
1361
1362
1363
1364 slots = install_new_memslots(kvm, as_id, slots);
1365
1366
1367
1368
1369
1370
1371
1372
1373 kvm_arch_flush_shadow_memslot(kvm, slot);
1374 }
1375
1376 r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1377 if (r)
1378 goto out_slots;
1379
1380 update_memslots(slots, new, change);
1381 slots = install_new_memslots(kvm, as_id, slots);
1382
1383 kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1384
1385 kvfree(slots);
1386 return 0;
1387
1388out_slots:
1389 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1390 slots = install_new_memslots(kvm, as_id, slots);
1391 kvfree(slots);
1392 return r;
1393}
1394
1395static int kvm_delete_memslot(struct kvm *kvm,
1396 const struct kvm_userspace_memory_region *mem,
1397 struct kvm_memory_slot *old, int as_id)
1398{
1399 struct kvm_memory_slot new;
1400 int r;
1401
1402 if (!old->npages)
1403 return -EINVAL;
1404
1405 memset(&new, 0, sizeof(new));
1406 new.id = old->id;
1407
1408
1409
1410
1411 new.as_id = as_id;
1412
1413 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1414 if (r)
1415 return r;
1416
1417 kvm_free_memslot(kvm, old);
1418 return 0;
1419}
1420

/*
 * Set up a guest physical memory region from a userspace virtual address
 * range.
 *
 * Must be called holding kvm->slots_lock for write.
 */
1429int __kvm_set_memory_region(struct kvm *kvm,
1430 const struct kvm_userspace_memory_region *mem)
1431{
1432 struct kvm_memory_slot old, new;
1433 struct kvm_memory_slot *tmp;
1434 enum kvm_mr_change change;
1435 int as_id, id;
1436 int r;
1437
1438 r = check_memory_region_flags(mem);
1439 if (r)
1440 return r;
1441
1442 as_id = mem->slot >> 16;
1443 id = (u16)mem->slot;
1444
1445
1446 if (mem->memory_size & (PAGE_SIZE - 1))
1447 return -EINVAL;
1448 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1449 return -EINVAL;
1450
1451 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1452 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1453 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1454 mem->memory_size))
1455 return -EINVAL;
1456 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1457 return -EINVAL;
1458 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1459 return -EINVAL;
1460
1461
1462
1463
1464
1465
1466
1467 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1468 if (tmp) {
1469 old = *tmp;
1470 tmp = NULL;
1471 } else {
1472 memset(&old, 0, sizeof(old));
1473 old.id = id;
1474 }
1475
1476 if (!mem->memory_size)
1477 return kvm_delete_memslot(kvm, mem, &old, as_id);
1478
1479 new.as_id = as_id;
1480 new.id = id;
1481 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1482 new.npages = mem->memory_size >> PAGE_SHIFT;
1483 new.flags = mem->flags;
1484 new.userspace_addr = mem->userspace_addr;
1485
1486 if (new.npages > KVM_MEM_MAX_NR_PAGES)
1487 return -EINVAL;
1488
1489 if (!old.npages) {
1490 change = KVM_MR_CREATE;
1491 new.dirty_bitmap = NULL;
1492 memset(&new.arch, 0, sizeof(new.arch));
1493 } else {
1494 if ((new.userspace_addr != old.userspace_addr) ||
1495 (new.npages != old.npages) ||
1496 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1497 return -EINVAL;
1498
1499 if (new.base_gfn != old.base_gfn)
1500 change = KVM_MR_MOVE;
1501 else if (new.flags != old.flags)
1502 change = KVM_MR_FLAGS_ONLY;
1503 else
1504 return 0;
1505
1506
1507 new.dirty_bitmap = old.dirty_bitmap;
1508 memcpy(&new.arch, &old.arch, sizeof(new.arch));
1509 }
1510
1511 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1512
1513 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1514 if (tmp->id == id)
1515 continue;
1516 if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1517 (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1518 return -EEXIST;
1519 }
1520 }
1521
1522
1523 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1524 new.dirty_bitmap = NULL;
1525 else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
1526 r = kvm_alloc_dirty_bitmap(&new);
1527 if (r)
1528 return r;
1529
1530 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1531 bitmap_set(new.dirty_bitmap, 0, new.npages);
1532 }
1533
1534 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
1535 if (r)
1536 goto out_bitmap;
1537
1538 if (old.dirty_bitmap && !new.dirty_bitmap)
1539 kvm_destroy_dirty_bitmap(&old);
1540 return 0;
1541
1542out_bitmap:
1543 if (new.dirty_bitmap && !old.dirty_bitmap)
1544 kvm_destroy_dirty_bitmap(&new);
1545 return r;
1546}
1547EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1548
1549int kvm_set_memory_region(struct kvm *kvm,
1550 const struct kvm_userspace_memory_region *mem)
1551{
1552 int r;
1553
1554 mutex_lock(&kvm->slots_lock);
1555 r = __kvm_set_memory_region(kvm, mem);
1556 mutex_unlock(&kvm->slots_lock);
1557 return r;
1558}
1559EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1560
1561static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1562 struct kvm_userspace_memory_region *mem)
1563{
1564 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1565 return -EINVAL;
1566
1567 return kvm_set_memory_region(kvm, mem);
1568}
1569
1570#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
1578int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1579 int *is_dirty, struct kvm_memory_slot **memslot)
1580{
1581 struct kvm_memslots *slots;
1582 int i, as_id, id;
1583 unsigned long n;
1584 unsigned long any = 0;
1585
1586
1587 if (kvm->dirty_ring_size)
1588 return -ENXIO;
1589
1590 *memslot = NULL;
1591 *is_dirty = 0;
1592
1593 as_id = log->slot >> 16;
1594 id = (u16)log->slot;
1595 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1596 return -EINVAL;
1597
1598 slots = __kvm_memslots(kvm, as_id);
1599 *memslot = id_to_memslot(slots, id);
1600 if (!(*memslot) || !(*memslot)->dirty_bitmap)
1601 return -ENOENT;
1602
1603 kvm_arch_sync_dirty_log(kvm, *memslot);
1604
1605 n = kvm_dirty_bitmap_bytes(*memslot);
1606
1607 for (i = 0; !any && i < n/sizeof(long); ++i)
1608 any = (*memslot)->dirty_bitmap[i];
1609
1610 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1611 return -EFAULT;
1612
1613 if (any)
1614 *is_dirty = 1;
1615 return 0;
1616}
1617EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1618
#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages and re-enable
 *	dirty-page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * vCPU threads can set bits in the dirty bitmap concurrently, so bits are
 * harvested in a snapshot-then-protect order: snapshot and clear each word,
 * write-protect the corresponding pages, and only then copy the snapshot to
 * userspace (with manual dirty log protection enabled, the clearing and
 * write-protection are instead deferred to KVM_CLEAR_DIRTY_LOG).  The caller
 * flushes TLBs afterwards, so writes done through stale TLB entries are
 * either already reported or will show up on the next call.
 */
1641static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1642{
1643 struct kvm_memslots *slots;
1644 struct kvm_memory_slot *memslot;
1645 int i, as_id, id;
1646 unsigned long n;
1647 unsigned long *dirty_bitmap;
1648 unsigned long *dirty_bitmap_buffer;
1649 bool flush;
1650
1651
1652 if (kvm->dirty_ring_size)
1653 return -ENXIO;
1654
1655 as_id = log->slot >> 16;
1656 id = (u16)log->slot;
1657 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1658 return -EINVAL;
1659
1660 slots = __kvm_memslots(kvm, as_id);
1661 memslot = id_to_memslot(slots, id);
1662 if (!memslot || !memslot->dirty_bitmap)
1663 return -ENOENT;
1664
1665 dirty_bitmap = memslot->dirty_bitmap;
1666
1667 kvm_arch_sync_dirty_log(kvm, memslot);
1668
1669 n = kvm_dirty_bitmap_bytes(memslot);
1670 flush = false;
1671 if (kvm->manual_dirty_log_protect) {
1672
1673
1674
1675
1676
1677
1678
1679
1680 dirty_bitmap_buffer = dirty_bitmap;
1681 } else {
1682 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1683 memset(dirty_bitmap_buffer, 0, n);
1684
1685 KVM_MMU_LOCK(kvm);
1686 for (i = 0; i < n / sizeof(long); i++) {
1687 unsigned long mask;
1688 gfn_t offset;
1689
1690 if (!dirty_bitmap[i])
1691 continue;
1692
1693 flush = true;
1694 mask = xchg(&dirty_bitmap[i], 0);
1695 dirty_bitmap_buffer[i] = mask;
1696
1697 offset = i * BITS_PER_LONG;
1698 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1699 offset, mask);
1700 }
1701 KVM_MMU_UNLOCK(kvm);
1702 }
1703
1704 if (flush)
1705 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1706
1707 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1708 return -EFAULT;
1709 return 0;
1710}
1711

/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm:	kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * Wrapper around kvm_get_dirty_log_protect() that takes kvm->slots_lock; see
 * that function for the ordering requirements of dirty page logging.
 */
1732static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1733 struct kvm_dirty_log *log)
1734{
1735 int r;
1736
1737 mutex_lock(&kvm->slots_lock);
1738
1739 r = kvm_get_dirty_log_protect(kvm, log);
1740
1741 mutex_unlock(&kvm->slots_lock);
1742 return r;
1743}
1744

/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap and re-enable
 *	dirty-page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
1751static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1752 struct kvm_clear_dirty_log *log)
1753{
1754 struct kvm_memslots *slots;
1755 struct kvm_memory_slot *memslot;
1756 int as_id, id;
1757 gfn_t offset;
1758 unsigned long i, n;
1759 unsigned long *dirty_bitmap;
1760 unsigned long *dirty_bitmap_buffer;
1761 bool flush;
1762
1763
1764 if (kvm->dirty_ring_size)
1765 return -ENXIO;
1766
1767 as_id = log->slot >> 16;
1768 id = (u16)log->slot;
1769 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1770 return -EINVAL;
1771
1772 if (log->first_page & 63)
1773 return -EINVAL;
1774
1775 slots = __kvm_memslots(kvm, as_id);
1776 memslot = id_to_memslot(slots, id);
1777 if (!memslot || !memslot->dirty_bitmap)
1778 return -ENOENT;
1779
1780 dirty_bitmap = memslot->dirty_bitmap;
1781
1782 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
1783
1784 if (log->first_page > memslot->npages ||
1785 log->num_pages > memslot->npages - log->first_page ||
1786 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
1787 return -EINVAL;
1788
1789 kvm_arch_sync_dirty_log(kvm, memslot);
1790
1791 flush = false;
1792 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1793 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
1794 return -EFAULT;
1795
1796 KVM_MMU_LOCK(kvm);
1797 for (offset = log->first_page, i = offset / BITS_PER_LONG,
1798 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
1799 i++, offset += BITS_PER_LONG) {
1800 unsigned long mask = *dirty_bitmap_buffer++;
1801 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
1802 if (!mask)
1803 continue;
1804
1805 mask &= atomic_long_fetch_andnot(mask, p);
1806
1807
1808
1809
1810
1811
1812
1813 if (mask) {
1814 flush = true;
1815 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1816 offset, mask);
1817 }
1818 }
1819 KVM_MMU_UNLOCK(kvm);
1820
1821 if (flush)
1822 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1823
1824 return 0;
1825}
1826
1827static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
1828 struct kvm_clear_dirty_log *log)
1829{
1830 int r;
1831
1832 mutex_lock(&kvm->slots_lock);
1833
1834 r = kvm_clear_dirty_log_protect(kvm, log);
1835
1836 mutex_unlock(&kvm->slots_lock);
1837 return r;
1838}
1839#endif
1840
1841struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1842{
1843 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
1844}
1845EXPORT_SYMBOL_GPL(gfn_to_memslot);
1846
1847struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
1848{
1849 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
1850}
1851EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
1852
1853bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
1854{
1855 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
1856
1857 return kvm_is_visible_memslot(memslot);
1858}
1859EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1860
1861bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1862{
1863 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1864
1865 return kvm_is_visible_memslot(memslot);
1866}
1867EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
1868
1869unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
1870{
1871 struct vm_area_struct *vma;
1872 unsigned long addr, size;
1873
1874 size = PAGE_SIZE;
1875
1876 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
1877 if (kvm_is_error_hva(addr))
1878 return PAGE_SIZE;
1879
1880 mmap_read_lock(current->mm);
1881 vma = find_vma(current->mm, addr);
1882 if (!vma)
1883 goto out;
1884
1885 size = vma_kernel_pagesize(vma);
1886
1887out:
1888 mmap_read_unlock(current->mm);
1889
1890 return size;
1891}
1892
1893static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1894{
1895 return slot->flags & KVM_MEM_READONLY;
1896}
1897
1898static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1899 gfn_t *nr_pages, bool write)
1900{
1901 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1902 return KVM_HVA_ERR_BAD;
1903
1904 if (memslot_is_readonly(slot) && write)
1905 return KVM_HVA_ERR_RO_BAD;
1906
1907 if (nr_pages)
1908 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1909
1910 return __gfn_to_hva_memslot(slot, gfn);
1911}
1912
1913static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1914 gfn_t *nr_pages)
1915{
1916 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1917}
1918
1919unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1920 gfn_t gfn)
1921{
1922 return gfn_to_hva_many(slot, gfn, NULL);
1923}
1924EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1925
1926unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1927{
1928 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1929}
1930EXPORT_SYMBOL_GPL(gfn_to_hva);
1931
1932unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
1933{
1934 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
1935}
1936EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
1937

/*
 * Return the host virtual address of @gfn and, if @writable is not NULL and
 * the hva is valid, whether the containing @slot is writable.
 */
1946unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
1947 gfn_t gfn, bool *writable)
1948{
1949 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
1950
1951 if (!kvm_is_error_hva(hva) && writable)
1952 *writable = !memslot_is_readonly(slot);
1953
1954 return hva;
1955}
1956
1957unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1958{
1959 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1960
1961 return gfn_to_hva_memslot_prot(slot, gfn, writable);
1962}
1963
1964unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
1965{
1966 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1967
1968 return gfn_to_hva_memslot_prot(slot, gfn, writable);
1969}
1970
1971static inline int check_user_page_hwpoison(unsigned long addr)
1972{
1973 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
1974
1975 rc = get_user_pages(addr, 1, flags, NULL, NULL);
1976 return rc == -EHWPOISON;
1977}
1978

/*
 * Fast path for turning an hva into a writable pfn: only attempts a
 * get_user_page_fast_only(), so it is safe in atomic context.  Returns true
 * on success, with the pfn stored in @pfn.
 */
1984static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
1985 bool *writable, kvm_pfn_t *pfn)
1986{
1987 struct page *page[1];
1988
1989
1990
1991
1992
1993
1994 if (!(write_fault || writable))
1995 return false;
1996
1997 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
1998 *pfn = page_to_pfn(page[0]);
1999
2000 if (writable)
2001 *writable = true;
2002 return true;
2003 }
2004
2005 return false;
2006}
2007

/*
 * Slow path for turning an hva into a pfn; may sleep.  Returns 1 on success
 * and a -errno value on failure.
 */
2012static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2013 bool *writable, kvm_pfn_t *pfn)
2014{
2015 unsigned int flags = FOLL_HWPOISON;
2016 struct page *page;
2017 int npages = 0;
2018
2019 might_sleep();
2020
2021 if (writable)
2022 *writable = write_fault;
2023
2024 if (write_fault)
2025 flags |= FOLL_WRITE;
2026 if (async)
2027 flags |= FOLL_NOWAIT;
2028
2029 npages = get_user_pages_unlocked(addr, 1, &page, flags);
2030 if (npages != 1)
2031 return npages;
2032
2033
2034 if (unlikely(!write_fault) && writable) {
2035 struct page *wpage;
2036
2037 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2038 *writable = true;
2039 put_page(page);
2040 page = wpage;
2041 }
2042 }
2043 *pfn = page_to_pfn(page);
2044 return npages;
2045}
2046
2047static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2048{
2049 if (unlikely(!(vma->vm_flags & VM_READ)))
2050 return false;
2051
2052 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2053 return false;
2054
2055 return true;
2056}
2057
2058static int kvm_try_get_pfn(kvm_pfn_t pfn)
2059{
2060 if (kvm_is_reserved_pfn(pfn))
2061 return 1;
2062 return get_page_unless_zero(pfn_to_page(pfn));
2063}
2064
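/*
 * Resolve a pfn for a VM_IO/VM_PFNMAP vma by walking the host page tables
 * directly, faulting the mapping in first if necessary.
 */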
2065static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2066 unsigned long addr, bool *async,
2067 bool write_fault, bool *writable,
2068 kvm_pfn_t *p_pfn)
2069{
2070 kvm_pfn_t pfn;
2071 pte_t *ptep;
2072 spinlock_t *ptl;
2073 int r;
2074
2075 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2076 if (r) {
2077
2078
2079
2080
2081 bool unlocked = false;
2082 r = fixup_user_fault(current->mm, addr,
2083 (write_fault ? FAULT_FLAG_WRITE : 0),
2084 &unlocked);
2085 if (unlocked)
2086 return -EAGAIN;
2087 if (r)
2088 return r;
2089
2090 r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2091 if (r)
2092 return r;
2093 }
2094
2095 if (write_fault && !pte_write(*ptep)) {
2096 pfn = KVM_PFN_ERR_RO_FAULT;
2097 goto out;
2098 }
2099
2100 if (writable)
2101 *writable = pte_write(*ptep);
2102 pfn = pte_pfn(*ptep);
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121 if (!kvm_try_get_pfn(pfn))
2122 r = -EFAULT;
2123
2124out:
2125 pte_unmap_unlock(ptep, ptl);
2126 *p_pfn = pfn;
2127
2128 return r;
2129}
2130

/*
 * Pin the guest page that backs host virtual address @addr and return its pfn.
 * @addr: host virtual address that maps the guest memory
 * @atomic: if true, the lookup must not sleep (only the fast path is tried)
 * @async: if non-NULL, allow the fault to be handled asynchronously instead
 *         of waiting for I/O; set to true when that happens
 * @write_fault: whether a writable mapping is required
 * @writable: if non-NULL, a writable mapping is opportunistically requested
 *            and *@writable reports whether one was obtained
 */
2145static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2146 bool write_fault, bool *writable)
2147{
2148 struct vm_area_struct *vma;
2149 kvm_pfn_t pfn = 0;
2150 int npages, r;
2151
2152
2153 BUG_ON(atomic && async);
2154
2155 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2156 return pfn;
2157
2158 if (atomic)
2159 return KVM_PFN_ERR_FAULT;
2160
2161 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2162 if (npages == 1)
2163 return pfn;
2164
2165 mmap_read_lock(current->mm);
2166 if (npages == -EHWPOISON ||
2167 (!async && check_user_page_hwpoison(addr))) {
2168 pfn = KVM_PFN_ERR_HWPOISON;
2169 goto exit;
2170 }
2171
2172retry:
2173 vma = find_vma_intersection(current->mm, addr, addr + 1);
2174
2175 if (vma == NULL)
2176 pfn = KVM_PFN_ERR_FAULT;
2177 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2178 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2179 if (r == -EAGAIN)
2180 goto retry;
2181 if (r < 0)
2182 pfn = KVM_PFN_ERR_FAULT;
2183 } else {
2184 if (async && vma_is_valid(vma, write_fault))
2185 *async = true;
2186 pfn = KVM_PFN_ERR_FAULT;
2187 }
2188exit:
2189 mmap_read_unlock(current->mm);
2190 return pfn;
2191}
2192
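/*
 * Translate a gfn within @slot to a pfn, honouring read-only memslots and
 * optionally reporting the hva and writability back to the caller.
 */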
2193kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2194 bool atomic, bool *async, bool write_fault,
2195 bool *writable, hva_t *hva)
2196{
2197 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2198
2199 if (hva)
2200 *hva = addr;
2201
2202 if (addr == KVM_HVA_ERR_RO_BAD) {
2203 if (writable)
2204 *writable = false;
2205 return KVM_PFN_ERR_RO_FAULT;
2206 }
2207
2208 if (kvm_is_error_hva(addr)) {
2209 if (writable)
2210 *writable = false;
2211 return KVM_PFN_NOSLOT;
2212 }
2213
2214
2215 if (writable && memslot_is_readonly(slot)) {
2216 *writable = false;
2217 writable = NULL;
2218 }
2219
2220 return hva_to_pfn(addr, atomic, async, write_fault,
2221 writable);
2222}
2223EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2224
2225kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2226 bool *writable)
2227{
2228 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2229 write_fault, writable, NULL);
2230}
2231EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2232
2233kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2234{
2235 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2236}
2237EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2238
2239kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2240{
2241 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2242}
2243EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2244
2245kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2246{
2247 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2248}
2249EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2250
2251kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2252{
2253 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2254}
2255EXPORT_SYMBOL_GPL(gfn_to_pfn);
2256
2257kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2258{
2259 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2260}
2261EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2262
2263int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2264 struct page **pages, int nr_pages)
2265{
2266 unsigned long addr;
2267 gfn_t entry = 0;
2268
2269 addr = gfn_to_hva_many(slot, gfn, &entry);
2270 if (kvm_is_error_hva(addr))
2271 return -1;
2272
2273 if (entry < nr_pages)
2274 return 0;
2275
2276 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2277}
2278EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2279
2280static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2281{
2282 if (is_error_noslot_pfn(pfn))
2283 return KVM_ERR_PTR_BAD_PAGE;
2284
2285 if (kvm_is_reserved_pfn(pfn)) {
2286 WARN_ON(1);
2287 return KVM_ERR_PTR_BAD_PAGE;
2288 }
2289
2290 return pfn_to_page(pfn);
2291}
2292
2293struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2294{
2295 kvm_pfn_t pfn;
2296
2297 pfn = gfn_to_pfn(kvm, gfn);
2298
2299 return kvm_pfn_to_page(pfn);
2300}
2301EXPORT_SYMBOL_GPL(gfn_to_page);
2302
2303void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
2304{
2305 if (pfn == 0)
2306 return;
2307
2308 if (cache)
2309 cache->pfn = cache->gfn = 0;
2310
2311 if (dirty)
2312 kvm_release_pfn_dirty(pfn);
2313 else
2314 kvm_release_pfn_clean(pfn);
2315}
2316
2317static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
2318 struct gfn_to_pfn_cache *cache, u64 gen)
2319{
2320 kvm_release_pfn(cache->pfn, cache->dirty, cache);
2321
2322 cache->pfn = gfn_to_pfn_memslot(slot, gfn);
2323 cache->gfn = gfn;
2324 cache->dirty = false;
2325 cache->generation = gen;
2326}
2327
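/*
 * Map a guest page into the kernel, via kmap for ordinary pages or memremap
 * for PFNMAP memory, optionally reusing a gfn_to_pfn_cache entry.
 */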
2328static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
2329 struct kvm_host_map *map,
2330 struct gfn_to_pfn_cache *cache,
2331 bool atomic)
2332{
2333 kvm_pfn_t pfn;
2334 void *hva = NULL;
2335 struct page *page = KVM_UNMAPPED_PAGE;
2336 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
2337 u64 gen = slots->generation;
2338
2339 if (!map)
2340 return -EINVAL;
2341
2342 if (cache) {
2343 if (!cache->pfn || cache->gfn != gfn ||
2344 cache->generation != gen) {
2345 if (atomic)
2346 return -EAGAIN;
2347 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
2348 }
2349 pfn = cache->pfn;
2350 } else {
2351 if (atomic)
2352 return -EAGAIN;
2353 pfn = gfn_to_pfn_memslot(slot, gfn);
2354 }
2355 if (is_error_noslot_pfn(pfn))
2356 return -EINVAL;
2357
2358 if (pfn_valid(pfn)) {
2359 page = pfn_to_page(pfn);
2360 if (atomic)
2361 hva = kmap_atomic(page);
2362 else
2363 hva = kmap(page);
2364#ifdef CONFIG_HAS_IOMEM
2365 } else if (!atomic) {
2366 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2367 } else {
2368 return -EINVAL;
2369#endif
2370 }
2371
2372 if (!hva)
2373 return -EFAULT;
2374
2375 map->page = page;
2376 map->hva = hva;
2377 map->pfn = pfn;
2378 map->gfn = gfn;
2379
2380 return 0;
2381}
2382
2383int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
2384 struct gfn_to_pfn_cache *cache, bool atomic)
2385{
2386 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
2387 cache, atomic);
2388}
2389EXPORT_SYMBOL_GPL(kvm_map_gfn);
2390
2391int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2392{
2393 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
2394 NULL, false);
2395}
2396EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2397
2398static void __kvm_unmap_gfn(struct kvm *kvm,
2399 struct kvm_memory_slot *memslot,
2400 struct kvm_host_map *map,
2401 struct gfn_to_pfn_cache *cache,
2402 bool dirty, bool atomic)
2403{
2404 if (!map)
2405 return;
2406
2407 if (!map->hva)
2408 return;
2409
2410 if (map->page != KVM_UNMAPPED_PAGE) {
2411 if (atomic)
2412 kunmap_atomic(map->hva);
2413 else
2414 kunmap(map->page);
2415 }
2416#ifdef CONFIG_HAS_IOMEM
2417 else if (!atomic)
2418 memunmap(map->hva);
2419 else
2420 WARN_ONCE(1, "Unexpected unmapping in atomic context");
2421#endif
2422
2423 if (dirty)
2424 mark_page_dirty_in_slot(kvm, memslot, map->gfn);
2425
2426 if (cache)
2427 cache->dirty |= dirty;
2428 else
2429 kvm_release_pfn(map->pfn, dirty, NULL);
2430
2431 map->hva = NULL;
2432 map->page = NULL;
2433}
2434
2435int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
2436 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
2437{
2438 __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
2439 cache, dirty, atomic);
2440 return 0;
2441}
2442EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2443
2444void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2445{
2446 __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
2447 map, NULL, dirty, false);
2448}
2449EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2450
2451struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2452{
2453 kvm_pfn_t pfn;
2454
2455 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2456
2457 return kvm_pfn_to_page(pfn);
2458}
2459EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2460
2461void kvm_release_page_clean(struct page *page)
2462{
2463 WARN_ON(is_error_page(page));
2464
2465 kvm_release_pfn_clean(page_to_pfn(page));
2466}
2467EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2468
2469void kvm_release_pfn_clean(kvm_pfn_t pfn)
2470{
2471 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2472 put_page(pfn_to_page(pfn));
2473}
2474EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2475
2476void kvm_release_page_dirty(struct page *page)
2477{
2478 WARN_ON(is_error_page(page));
2479
2480 kvm_release_pfn_dirty(page_to_pfn(page));
2481}
2482EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2483
2484void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2485{
2486 kvm_set_pfn_dirty(pfn);
2487 kvm_release_pfn_clean(pfn);
2488}
2489EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2490
2491void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2492{
2493 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2494 SetPageDirty(pfn_to_page(pfn));
2495}
2496EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2497
2498void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2499{
2500 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2501 mark_page_accessed(pfn_to_page(pfn));
2502}
2503EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2504
2505void kvm_get_pfn(kvm_pfn_t pfn)
2506{
2507 if (!kvm_is_reserved_pfn(pfn))
2508 get_page(pfn_to_page(pfn));
2509}
2510EXPORT_SYMBOL_GPL(kvm_get_pfn);
2511
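/*
 * Return how many bytes of a @len-byte guest access fall within the current
 * page, given @offset into that page; the copy loops below use this to split
 * an access into per-page segments.
 */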
2512static int next_segment(unsigned long len, int offset)
2513{
2514 if (len > PAGE_SIZE - offset)
2515 return PAGE_SIZE - offset;
2516 else
2517 return len;
2518}
2519
2520static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2521 void *data, int offset, int len)
2522{
2523 int r;
2524 unsigned long addr;
2525
2526 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2527 if (kvm_is_error_hva(addr))
2528 return -EFAULT;
2529 r = __copy_from_user(data, (void __user *)addr + offset, len);
2530 if (r)
2531 return -EFAULT;
2532 return 0;
2533}
2534
2535int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2536 int len)
2537{
2538 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2539
2540 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2541}
2542EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2543
2544int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2545 int offset, int len)
2546{
2547 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2548
2549 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2550}
2551EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2552
2553int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2554{
2555 gfn_t gfn = gpa >> PAGE_SHIFT;
2556 int seg;
2557 int offset = offset_in_page(gpa);
2558 int ret;
2559
2560 while ((seg = next_segment(len, offset)) != 0) {
2561 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2562 if (ret < 0)
2563 return ret;
2564 offset = 0;
2565 len -= seg;
2566 data += seg;
2567 ++gfn;
2568 }
2569 return 0;
2570}
2571EXPORT_SYMBOL_GPL(kvm_read_guest);
2572
2573int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2574{
2575 gfn_t gfn = gpa >> PAGE_SHIFT;
2576 int seg;
2577 int offset = offset_in_page(gpa);
2578 int ret;
2579
2580 while ((seg = next_segment(len, offset)) != 0) {
2581 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2582 if (ret < 0)
2583 return ret;
2584 offset = 0;
2585 len -= seg;
2586 data += seg;
2587 ++gfn;
2588 }
2589 return 0;
2590}
2591EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2592
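/*
 * Atomic flavour of the guest read: page faults are disabled around the
 * copy, so the read fails with -EFAULT instead of sleeping to fault the
 * page in.
 */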
2593static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2594 void *data, int offset, unsigned long len)
2595{
2596 int r;
2597 unsigned long addr;
2598
2599 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2600 if (kvm_is_error_hva(addr))
2601 return -EFAULT;
2602 pagefault_disable();
2603 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2604 pagefault_enable();
2605 if (r)
2606 return -EFAULT;
2607 return 0;
2608}
2609
2610int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2611 void *data, unsigned long len)
2612{
2613 gfn_t gfn = gpa >> PAGE_SHIFT;
2614 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2615 int offset = offset_in_page(gpa);
2616
2617 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2618}
2619EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2620
2621static int __kvm_write_guest_page(struct kvm *kvm,
2622 struct kvm_memory_slot *memslot, gfn_t gfn,
2623 const void *data, int offset, int len)
2624{
2625 int r;
2626 unsigned long addr;
2627
2628 addr = gfn_to_hva_memslot(memslot, gfn);
2629 if (kvm_is_error_hva(addr))
2630 return -EFAULT;
2631 r = __copy_to_user((void __user *)addr + offset, data, len);
2632 if (r)
2633 return -EFAULT;
2634 mark_page_dirty_in_slot(kvm, memslot, gfn);
2635 return 0;
2636}
2637
2638int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2639 const void *data, int offset, int len)
2640{
2641 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2642
2643 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2644}
2645EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2646
2647int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2648 const void *data, int offset, int len)
2649{
2650 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2651
2652 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2653}
2654EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2655
2656int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2657 unsigned long len)
2658{
2659 gfn_t gfn = gpa >> PAGE_SHIFT;
2660 int seg;
2661 int offset = offset_in_page(gpa);
2662 int ret;
2663
2664 while ((seg = next_segment(len, offset)) != 0) {
2665 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2666 if (ret < 0)
2667 return ret;
2668 offset = 0;
2669 len -= seg;
2670 data += seg;
2671 ++gfn;
2672 }
2673 return 0;
2674}
2675EXPORT_SYMBOL_GPL(kvm_write_guest);
2676
2677int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2678 unsigned long len)
2679{
2680 gfn_t gfn = gpa >> PAGE_SHIFT;
2681 int seg;
2682 int offset = offset_in_page(gpa);
2683 int ret;
2684
2685 while ((seg = next_segment(len, offset)) != 0) {
2686 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2687 if (ret < 0)
2688 return ret;
2689 offset = 0;
2690 len -= seg;
2691 data += seg;
2692 ++gfn;
2693 }
2694 return 0;
2695}
2696EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2697
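/*
 * Initialize a gfn_to_hva_cache for [gpa, gpa + len): record the current
 * memslot generation and, for ranges that fit in a single page, the
 * translated host virtual address so cached accesses can skip the lookup.
 */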
2698static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2699 struct gfn_to_hva_cache *ghc,
2700 gpa_t gpa, unsigned long len)
2701{
2702 int offset = offset_in_page(gpa);
2703 gfn_t start_gfn = gpa >> PAGE_SHIFT;
2704 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2705 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2706 gfn_t nr_pages_avail;
2707
2708
2709 ghc->generation = slots->generation;
2710
2711 if (start_gfn > end_gfn) {
2712 ghc->hva = KVM_HVA_ERR_BAD;
2713 return -EINVAL;
2714 }
2715
2716
2717
2718
2719
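	/*
	 * Walk the whole range so a request that spans several memslots, or
	 * that hits a hole, is rejected up front.
	 */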
2720 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2721 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2722 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2723 &nr_pages_avail);
2724 if (kvm_is_error_hva(ghc->hva))
2725 return -EFAULT;
2726 }
2727
2728
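	/* Only single-page ranges keep a usable hva and memslot. */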
2729 if (nr_pages_needed == 1)
2730 ghc->hva += offset;
2731 else
2732 ghc->memslot = NULL;
2733
2734 ghc->gpa = gpa;
2735 ghc->len = len;
2736 return 0;
2737}
2738
2739int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2740 gpa_t gpa, unsigned long len)
2741{
2742 struct kvm_memslots *slots = kvm_memslots(kvm);
2743 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2744}
2745EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2746
2747int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2748 void *data, unsigned int offset,
2749 unsigned long len)
2750{
2751 struct kvm_memslots *slots = kvm_memslots(kvm);
2752 int r;
2753 gpa_t gpa = ghc->gpa + offset;
2754
2755 BUG_ON(len + offset > ghc->len);
2756
2757 if (slots->generation != ghc->generation) {
2758 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2759 return -EFAULT;
2760 }
2761
2762 if (kvm_is_error_hva(ghc->hva))
2763 return -EFAULT;
2764
2765 if (unlikely(!ghc->memslot))
2766 return kvm_write_guest(kvm, gpa, data, len);
2767
2768 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2769 if (r)
2770 return -EFAULT;
2771 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
2772
2773 return 0;
2774}
2775EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2776
2777int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2778 void *data, unsigned long len)
2779{
2780 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2781}
2782EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
2783
2784int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2785 void *data, unsigned int offset,
2786 unsigned long len)
2787{
2788 struct kvm_memslots *slots = kvm_memslots(kvm);
2789 int r;
2790 gpa_t gpa = ghc->gpa + offset;
2791
2792 BUG_ON(len + offset > ghc->len);
2793
2794 if (slots->generation != ghc->generation) {
2795 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2796 return -EFAULT;
2797 }
2798
2799 if (kvm_is_error_hva(ghc->hva))
2800 return -EFAULT;
2801
2802 if (unlikely(!ghc->memslot))
2803 return kvm_read_guest(kvm, gpa, data, len);
2804
2805 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
2806 if (r)
2807 return -EFAULT;
2808
2809 return 0;
2810}
2811EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2812
2813int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2814 void *data, unsigned long len)
2815{
2816 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
2817}
2818EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
2819
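/*
 * Zero @len bytes of guest memory at @gpa by writing the shared zero page
 * one per-page segment at a time.
 */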
2820int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
2821{
2822 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
2823 gfn_t gfn = gpa >> PAGE_SHIFT;
2824 int seg;
2825 int offset = offset_in_page(gpa);
2826 int ret;
2827
2828 while ((seg = next_segment(len, offset)) != 0) {
2829 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
2830 if (ret < 0)
2831 return ret;
2832 offset = 0;
2833 len -= seg;
2834 ++gfn;
2835 }
2836 return 0;
2837}
2838EXPORT_SYMBOL_GPL(kvm_clear_guest);
2839
2840void mark_page_dirty_in_slot(struct kvm *kvm,
2841 struct kvm_memory_slot *memslot,
2842 gfn_t gfn)
2843{
2844 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
2845 unsigned long rel_gfn = gfn - memslot->base_gfn;
2846 u32 slot = (memslot->as_id << 16) | memslot->id;
2847
2848 if (kvm->dirty_ring_size)
2849 kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
2850 slot, rel_gfn);
2851 else
2852 set_bit_le(rel_gfn, memslot->dirty_bitmap);
2853 }
2854}
2855EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
2856
2857void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
2858{
2859 struct kvm_memory_slot *memslot;
2860
2861 memslot = gfn_to_memslot(kvm, gfn);
2862 mark_page_dirty_in_slot(kvm, memslot, gfn);
2863}
2864EXPORT_SYMBOL_GPL(mark_page_dirty);
2865
2866void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
2867{
2868 struct kvm_memory_slot *memslot;
2869
2870 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2871 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
2872}
2873EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
2874
2875void kvm_sigset_activate(struct kvm_vcpu *vcpu)
2876{
2877 if (!vcpu->sigset_active)
2878 return;
2879
2880
2881
2882
2883
2884
2885
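	/*
	 * Install the vCPU's guest-run signal mask and stash the task's
	 * normal blocked set in ->real_blocked so kvm_sigset_deactivate()
	 * can restore it after the run.
	 */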
2886 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
2887}
2888
2889void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
2890{
2891 if (!vcpu->sigset_active)
2892 return;
2893
2894 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
2895 sigemptyset(&current->real_blocked);
2896}
2897
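/*
 * Scale the vCPU's halt-polling window up by the halt_poll_ns_grow factor,
 * starting from halt_poll_ns_grow_start and clamped to the per-VM
 * max_halt_poll_ns.
 */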
2898static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
2899{
2900 unsigned int old, val, grow, grow_start;
2901
2902 old = val = vcpu->halt_poll_ns;
2903 grow_start = READ_ONCE(halt_poll_ns_grow_start);
2904 grow = READ_ONCE(halt_poll_ns_grow);
2905 if (!grow)
2906 goto out;
2907
2908 val *= grow;
2909 if (val < grow_start)
2910 val = grow_start;
2911
2912 if (val > vcpu->kvm->max_halt_poll_ns)
2913 val = vcpu->kvm->max_halt_poll_ns;
2914
2915 vcpu->halt_poll_ns = val;
2916out:
2917 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
2918}
2919
2920static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
2921{
2922 unsigned int old, val, shrink;
2923
2924 old = val = vcpu->halt_poll_ns;
2925 shrink = READ_ONCE(halt_poll_ns_shrink);
2926 if (shrink == 0)
2927 val = 0;
2928 else
2929 val /= shrink;
2930
2931 vcpu->halt_poll_ns = val;
2932 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
2933}
2934
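/*
 * Return 0 if the vCPU should keep blocking, or -EINTR if there is a reason
 * to wake up: the vCPU became runnable, a timer or signal is pending, or a
 * KVM_REQ_UNBLOCK request arrived.
 */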
2935static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
2936{
2937 int ret = -EINTR;
2938 int idx = srcu_read_lock(&vcpu->kvm->srcu);
2939
2940 if (kvm_arch_vcpu_runnable(vcpu)) {
2941 kvm_make_request(KVM_REQ_UNHALT, vcpu);
2942 goto out;
2943 }
2944 if (kvm_cpu_has_pending_timer(vcpu))
2945 goto out;
2946 if (signal_pending(current))
2947 goto out;
2948 if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
2949 goto out;
2950
2951 ret = 0;
2952out:
2953 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2954 return ret;
2955}
2956
2957static inline void
2958update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
2959{
2960 if (waited)
2961 vcpu->stat.halt_poll_fail_ns += poll_ns;
2962 else
2963 vcpu->stat.halt_poll_success_ns += poll_ns;
2964}
2965
2966
2967
2968
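/*
 * Block the vCPU until it becomes runnable again, optionally busy-polling
 * for up to halt_poll_ns first, and adjust the polling window based on how
 * long the vCPU actually ended up blocked.
 */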
2969void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2970{
2971 ktime_t start, cur, poll_end;
2972 bool waited = false;
2973 u64 block_ns;
2974
2975 kvm_arch_vcpu_blocking(vcpu);
2976
2977 start = cur = poll_end = ktime_get();
2978 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
2979 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
2980
2981 ++vcpu->stat.halt_attempted_poll;
2982 do {
2983
2984
2985
2986
2987 if (kvm_vcpu_check_block(vcpu) < 0) {
2988 ++vcpu->stat.halt_successful_poll;
2989 if (!vcpu_valid_wakeup(vcpu))
2990 ++vcpu->stat.halt_poll_invalid;
2991 goto out;
2992 }
2993 poll_end = cur = ktime_get();
2994 } while (kvm_vcpu_can_poll(cur, stop));
2995 }
2996
2997 prepare_to_rcuwait(&vcpu->wait);
2998 for (;;) {
2999 set_current_state(TASK_INTERRUPTIBLE);
3000
3001 if (kvm_vcpu_check_block(vcpu) < 0)
3002 break;
3003
3004 waited = true;
3005 schedule();
3006 }
3007 finish_rcuwait(&vcpu->wait);
3008 cur = ktime_get();
3009out:
3010 kvm_arch_vcpu_unblocking(vcpu);
3011 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3012
3013 update_halt_poll_stats(
3014 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3015
3016 if (!kvm_arch_no_poll(vcpu)) {
3017 if (!vcpu_valid_wakeup(vcpu)) {
3018 shrink_halt_poll_ns(vcpu);
3019 } else if (vcpu->kvm->max_halt_poll_ns) {
3020 if (block_ns <= vcpu->halt_poll_ns)
3021 ;
3022
3023 else if (vcpu->halt_poll_ns &&
3024 block_ns > vcpu->kvm->max_halt_poll_ns)
3025 shrink_halt_poll_ns(vcpu);
3026
3027 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3028 block_ns < vcpu->kvm->max_halt_poll_ns)
3029 grow_halt_poll_ns(vcpu);
3030 } else {
3031 vcpu->halt_poll_ns = 0;
3032 }
3033 }
3034
3035 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3036 kvm_arch_vcpu_block_finish(vcpu);
3037}
3038EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3039
3040bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3041{
3042 struct rcuwait *waitp;
3043
3044 waitp = kvm_arch_vcpu_get_wait(vcpu);
3045 if (rcuwait_wake_up(waitp)) {
3046 WRITE_ONCE(vcpu->ready, true);
3047 ++vcpu->stat.halt_wakeup;
3048 return true;
3049 }
3050
3051 return false;
3052}
3053EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3054
3055#ifndef CONFIG_S390
3056
3057
3058
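/*
 * Kick a vCPU out of its current state: wake it if it is blocked, otherwise
 * send an IPI so a vCPU running in guest mode exits and rechecks requests.
 */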
3059void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3060{
3061 int me;
3062 int cpu = vcpu->cpu;
3063
3064 if (kvm_vcpu_wake_up(vcpu))
3065 return;
3066
3067 me = get_cpu();
3068 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3069 if (kvm_arch_vcpu_should_kick(vcpu))
3070 smp_send_reschedule(cpu);
3071 put_cpu();
3072}
3073EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3074#endif
3075
3076int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3077{
3078 struct pid *pid;
3079 struct task_struct *task = NULL;
3080 int ret = 0;
3081
3082 rcu_read_lock();
3083 pid = rcu_dereference(target->pid);
3084 if (pid)
3085 task = get_pid_task(pid, PIDTYPE_PID);
3086 rcu_read_unlock();
3087 if (!task)
3088 return ret;
3089 ret = yield_to(task, 1);
3090 put_task_struct(task);
3091
3092 return ret;
3093}
3094EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
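/*
 * Directed-yield eligibility heuristic: prefer vCPUs that have not recently
 * taken a pause-loop/CPU-relax exit (likely preempted lock holders), and give
 * a vCPU that was skipped last time another chance by toggling dy_eligible
 * on every check.
 */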
3118static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3119{
3120#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3121 bool eligible;
3122
3123 eligible = !vcpu->spin_loop.in_spin_loop ||
3124 vcpu->spin_loop.dy_eligible;
3125
3126 if (vcpu->spin_loop.in_spin_loop)
3127 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3128
3129 return eligible;
3130#else
3131 return true;
3132#endif
3133}
3134
3135
3136
3137
3138
3139
3140bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3141{
3142 return kvm_arch_vcpu_runnable(vcpu);
3143}
3144
3145static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3146{
3147 if (kvm_arch_dy_runnable(vcpu))
3148 return true;
3149
3150#ifdef CONFIG_KVM_ASYNC_PF
3151 if (!list_empty_careful(&vcpu->async_pf.done))
3152 return true;
3153#endif
3154
3155 return false;
3156}
3157
3158bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3159{
3160 return false;
3161}
3162
3163void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3164{
3165 struct kvm *kvm = me->kvm;
3166 struct kvm_vcpu *vcpu;
3167 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3168 int yielded = 0;
3169 int try = 3;
3170 int pass;
3171 int i;
3172
3173 kvm_vcpu_set_in_spin_loop(me, true);
3174
3175
3176
3177
3178
3179
3180
3181 for (pass = 0; pass < 2 && !yielded && try; pass++) {
3182 kvm_for_each_vcpu(i, vcpu, kvm) {
3183 if (!pass && i <= last_boosted_vcpu) {
3184 i = last_boosted_vcpu;
3185 continue;
3186 } else if (pass && i > last_boosted_vcpu)
3187 break;
3188 if (!READ_ONCE(vcpu->ready))
3189 continue;
3190 if (vcpu == me)
3191 continue;
3192 if (rcuwait_active(&vcpu->wait) &&
3193 !vcpu_dy_runnable(vcpu))
3194 continue;
3195 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3196 !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3197 !kvm_arch_vcpu_in_kernel(vcpu))
3198 continue;
3199 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3200 continue;
3201
3202 yielded = kvm_vcpu_yield_to(vcpu);
3203 if (yielded > 0) {
3204 kvm->last_boosted_vcpu = i;
3205 break;
3206 } else if (yielded < 0) {
3207 try--;
3208 if (!try)
3209 break;
3210 }
3211 }
3212 }
3213 kvm_vcpu_set_in_spin_loop(me, false);
3214
3215
3216 kvm_vcpu_set_dy_eligible(me, false);
3217}
3218EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3219
3220static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3221{
3222#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3223 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3224 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3225 kvm->dirty_ring_size / PAGE_SIZE);
3226#else
3227 return false;
3228#endif
3229}
3230
3231static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3232{
3233 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3234 struct page *page;
3235
3236 if (vmf->pgoff == 0)
3237 page = virt_to_page(vcpu->run);
3238#ifdef CONFIG_X86
3239 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3240 page = virt_to_page(vcpu->arch.pio_data);
3241#endif
3242#ifdef CONFIG_KVM_MMIO
3243 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3244 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3245#endif
3246 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3247 page = kvm_dirty_ring_get_page(
3248 &vcpu->dirty_ring,
3249 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3250 else
3251 return kvm_arch_vcpu_fault(vcpu, vmf);
3252 get_page(page);
3253 vmf->page = page;
3254 return 0;
3255}
3256
3257static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3258 .fault = kvm_vcpu_fault,
3259};
3260
3261static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3262{
3263 struct kvm_vcpu *vcpu = file->private_data;
3264 unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3265
3266 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3267 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3268 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3269 return -EINVAL;
3270
3271 vma->vm_ops = &kvm_vcpu_vm_ops;
3272 return 0;
3273}
3274
3275static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3276{
3277 struct kvm_vcpu *vcpu = filp->private_data;
3278
3279 kvm_put_kvm(vcpu->kvm);
3280 return 0;
3281}
3282
3283static struct file_operations kvm_vcpu_fops = {
3284 .release = kvm_vcpu_release,
3285 .unlocked_ioctl = kvm_vcpu_ioctl,
3286 .mmap = kvm_vcpu_mmap,
3287 .llseek = noop_llseek,
3288 KVM_COMPAT(kvm_vcpu_compat_ioctl),
3289};
3290
3291
3292
3293
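/*
 * Allocate an anon-inode file descriptor for the vCPU; from here on the
 * vCPU is reachable from userspace through this fd.
 */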
3294static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3295{
3296 char name[8 + 1 + ITOA_MAX_LEN + 1];
3297
3298 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3299 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3300}
3301
3302static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3303{
3304#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3305 struct dentry *debugfs_dentry;
3306 char dir_name[ITOA_MAX_LEN * 2];
3307
3308 if (!debugfs_initialized())
3309 return;
3310
3311 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3312 debugfs_dentry = debugfs_create_dir(dir_name,
3313 vcpu->kvm->debugfs_dentry);
3314
3315 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3316#endif
3317}
3318
3319
3320
3321
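/*
 * Create a vCPU with the given id: allocate and initialize the vcpu struct
 * and its run page, set up the dirty ring and debugfs entries, and expose
 * the vCPU to userspace through a new fd.
 */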
3322static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3323{
3324 int r;
3325 struct kvm_vcpu *vcpu;
3326 struct page *page;
3327
3328 if (id >= KVM_MAX_VCPU_ID)
3329 return -EINVAL;
3330
3331 mutex_lock(&kvm->lock);
3332 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3333 mutex_unlock(&kvm->lock);
3334 return -EINVAL;
3335 }
3336
3337 kvm->created_vcpus++;
3338 mutex_unlock(&kvm->lock);
3339
3340 r = kvm_arch_vcpu_precreate(kvm, id);
3341 if (r)
3342 goto vcpu_decrement;
3343
3344 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3345 if (!vcpu) {
3346 r = -ENOMEM;
3347 goto vcpu_decrement;
3348 }
3349
3350 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3351 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3352 if (!page) {
3353 r = -ENOMEM;
3354 goto vcpu_free;
3355 }
3356 vcpu->run = page_address(page);
3357
3358 kvm_vcpu_init(vcpu, kvm, id);
3359
3360 r = kvm_arch_vcpu_create(vcpu);
3361 if (r)
3362 goto vcpu_free_run_page;
3363
3364 if (kvm->dirty_ring_size) {
3365 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3366 id, kvm->dirty_ring_size);
3367 if (r)
3368 goto arch_vcpu_destroy;
3369 }
3370
3371 mutex_lock(&kvm->lock);
3372 if (kvm_get_vcpu_by_id(kvm, id)) {
3373 r = -EEXIST;
3374 goto unlock_vcpu_destroy;
3375 }
3376
3377 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3378 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3379
3380
3381 kvm_get_kvm(kvm);
3382 r = create_vcpu_fd(vcpu);
3383 if (r < 0) {
3384 kvm_put_kvm_no_destroy(kvm);
3385 goto unlock_vcpu_destroy;
3386 }
3387
3388 kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3389
3390
3391
3392
3393
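	/*
	 * Publish vcpus[vcpu_idx] before incrementing online_vcpus so that
	 * readers who observe the new count also observe the pointer.
	 */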
3394 smp_wmb();
3395 atomic_inc(&kvm->online_vcpus);
3396
3397 mutex_unlock(&kvm->lock);
3398 kvm_arch_vcpu_postcreate(vcpu);
3399 kvm_create_vcpu_debugfs(vcpu);
3400 return r;
3401
3402unlock_vcpu_destroy:
3403 mutex_unlock(&kvm->lock);
3404 kvm_dirty_ring_free(&vcpu->dirty_ring);
3405arch_vcpu_destroy:
3406 kvm_arch_vcpu_destroy(vcpu);
3407vcpu_free_run_page:
3408 free_page((unsigned long)vcpu->run);
3409vcpu_free:
3410 kmem_cache_free(kvm_vcpu_cache, vcpu);
3411vcpu_decrement:
3412 mutex_lock(&kvm->lock);
3413 kvm->created_vcpus--;
3414 mutex_unlock(&kvm->lock);
3415 return r;
3416}
3417
3418static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3419{
3420 if (sigset) {
3421 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3422 vcpu->sigset_active = 1;
3423 vcpu->sigset = *sigset;
3424 } else
3425 vcpu->sigset_active = 0;
3426 return 0;
3427}
3428
3429static long kvm_vcpu_ioctl(struct file *filp,
3430 unsigned int ioctl, unsigned long arg)
3431{
3432 struct kvm_vcpu *vcpu = filp->private_data;
3433 void __user *argp = (void __user *)arg;
3434 int r;
3435 struct kvm_fpu *fpu = NULL;
3436 struct kvm_sregs *kvm_sregs = NULL;
3437
3438 if (vcpu->kvm->mm != current->mm)
3439 return -EIO;
3440
3441 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3442 return -EINVAL;
3443
3444
3445
3446
3447
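	/*
	 * Give the architecture a chance to handle ioctls that must not take
	 * the vcpu mutex; anything other than -ENOIOCTLCMD is returned as-is.
	 */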
3448 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3449 if (r != -ENOIOCTLCMD)
3450 return r;
3451
3452 if (mutex_lock_killable(&vcpu->mutex))
3453 return -EINTR;
3454 switch (ioctl) {
3455 case KVM_RUN: {
3456 struct pid *oldpid;
3457 r = -EINVAL;
3458 if (arg)
3459 goto out;
3460 oldpid = rcu_access_pointer(vcpu->pid);
3461 if (unlikely(oldpid != task_pid(current))) {
3462
3463 struct pid *newpid;
3464
3465 r = kvm_arch_vcpu_run_pid_change(vcpu);
3466 if (r)
3467 break;
3468
3469 newpid = get_task_pid(current, PIDTYPE_PID);
3470 rcu_assign_pointer(vcpu->pid, newpid);
3471 if (oldpid)
3472 synchronize_rcu();
3473 put_pid(oldpid);
3474 }
3475 r = kvm_arch_vcpu_ioctl_run(vcpu);
3476 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3477 break;
3478 }
3479 case KVM_GET_REGS: {
3480 struct kvm_regs *kvm_regs;
3481
3482 r = -ENOMEM;
3483 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3484 if (!kvm_regs)
3485 goto out;
3486 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3487 if (r)
3488 goto out_free1;
3489 r = -EFAULT;
3490 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3491 goto out_free1;
3492 r = 0;
3493out_free1:
3494 kfree(kvm_regs);
3495 break;
3496 }
3497 case KVM_SET_REGS: {
3498 struct kvm_regs *kvm_regs;
3499
3500 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3501 if (IS_ERR(kvm_regs)) {
3502 r = PTR_ERR(kvm_regs);
3503 goto out;
3504 }
3505 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3506 kfree(kvm_regs);
3507 break;
3508 }
3509 case KVM_GET_SREGS: {
3510 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3511 GFP_KERNEL_ACCOUNT);
3512 r = -ENOMEM;
3513 if (!kvm_sregs)
3514 goto out;
3515 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3516 if (r)
3517 goto out;
3518 r = -EFAULT;
3519 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3520 goto out;
3521 r = 0;
3522 break;
3523 }
3524 case KVM_SET_SREGS: {
3525 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3526 if (IS_ERR(kvm_sregs)) {
3527 r = PTR_ERR(kvm_sregs);
3528 kvm_sregs = NULL;
3529 goto out;
3530 }
3531 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3532 break;
3533 }
3534 case KVM_GET_MP_STATE: {
3535 struct kvm_mp_state mp_state;
3536
3537 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3538 if (r)
3539 goto out;
3540 r = -EFAULT;
3541 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3542 goto out;
3543 r = 0;
3544 break;
3545 }
3546 case KVM_SET_MP_STATE: {
3547 struct kvm_mp_state mp_state;
3548
3549 r = -EFAULT;
3550 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3551 goto out;
3552 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3553 break;
3554 }
3555 case KVM_TRANSLATE: {
3556 struct kvm_translation tr;
3557
3558 r = -EFAULT;
3559 if (copy_from_user(&tr, argp, sizeof(tr)))
3560 goto out;
3561 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3562 if (r)
3563 goto out;
3564 r = -EFAULT;
3565 if (copy_to_user(argp, &tr, sizeof(tr)))
3566 goto out;
3567 r = 0;
3568 break;
3569 }
3570 case KVM_SET_GUEST_DEBUG: {
3571 struct kvm_guest_debug dbg;
3572
3573 r = -EFAULT;
3574 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3575 goto out;
3576 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3577 break;
3578 }
3579 case KVM_SET_SIGNAL_MASK: {
3580 struct kvm_signal_mask __user *sigmask_arg = argp;
3581 struct kvm_signal_mask kvm_sigmask;
3582 sigset_t sigset, *p;
3583
3584 p = NULL;
3585 if (argp) {
3586 r = -EFAULT;
3587 if (copy_from_user(&kvm_sigmask, argp,
3588 sizeof(kvm_sigmask)))
3589 goto out;
3590 r = -EINVAL;
3591 if (kvm_sigmask.len != sizeof(sigset))
3592 goto out;
3593 r = -EFAULT;
3594 if (copy_from_user(&sigset, sigmask_arg->sigset,
3595 sizeof(sigset)))
3596 goto out;
3597 p = &sigset;
3598 }
3599 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3600 break;
3601 }
3602 case KVM_GET_FPU: {
3603 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3604 r = -ENOMEM;
3605 if (!fpu)
3606 goto out;
3607 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3608 if (r)
3609 goto out;
3610 r = -EFAULT;
3611 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3612 goto out;
3613 r = 0;
3614 break;
3615 }
3616 case KVM_SET_FPU: {
3617 fpu = memdup_user(argp, sizeof(*fpu));
3618 if (IS_ERR(fpu)) {
3619 r = PTR_ERR(fpu);
3620 fpu = NULL;
3621 goto out;
3622 }
3623 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3624 break;
3625 }
3626 default:
3627 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3628 }
3629out:
3630 mutex_unlock(&vcpu->mutex);
3631 kfree(fpu);
3632 kfree(kvm_sregs);
3633 return r;
3634}
3635
3636#ifdef CONFIG_KVM_COMPAT
3637static long kvm_vcpu_compat_ioctl(struct file *filp,
3638 unsigned int ioctl, unsigned long arg)
3639{
3640 struct kvm_vcpu *vcpu = filp->private_data;
3641 void __user *argp = compat_ptr(arg);
3642 int r;
3643
3644 if (vcpu->kvm->mm != current->mm)
3645 return -EIO;
3646
3647 switch (ioctl) {
3648 case KVM_SET_SIGNAL_MASK: {
3649 struct kvm_signal_mask __user *sigmask_arg = argp;
3650 struct kvm_signal_mask kvm_sigmask;
3651 sigset_t sigset;
3652
3653 if (argp) {
3654 r = -EFAULT;
3655 if (copy_from_user(&kvm_sigmask, argp,
3656 sizeof(kvm_sigmask)))
3657 goto out;
3658 r = -EINVAL;
3659 if (kvm_sigmask.len != sizeof(compat_sigset_t))
3660 goto out;
3661 r = -EFAULT;
3662 if (get_compat_sigset(&sigset,
3663 (compat_sigset_t __user *)sigmask_arg->sigset))
3664 goto out;
3665 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3666 } else
3667 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3668 break;
3669 }
3670 default:
3671 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3672 }
3673
3674out:
3675 return r;
3676}
3677#endif
3678
3679static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3680{
3681 struct kvm_device *dev = filp->private_data;
3682
3683 if (dev->ops->mmap)
3684 return dev->ops->mmap(dev, vma);
3685
3686 return -ENODEV;
3687}
3688
3689static int kvm_device_ioctl_attr(struct kvm_device *dev,
3690 int (*accessor)(struct kvm_device *dev,
3691 struct kvm_device_attr *attr),
3692 unsigned long arg)
3693{
3694 struct kvm_device_attr attr;
3695
3696 if (!accessor)
3697 return -EPERM;
3698
3699 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3700 return -EFAULT;
3701
3702 return accessor(dev, &attr);
3703}
3704
3705static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3706 unsigned long arg)
3707{
3708 struct kvm_device *dev = filp->private_data;
3709
3710 if (dev->kvm->mm != current->mm)
3711 return -EIO;
3712
3713 switch (ioctl) {
3714 case KVM_SET_DEVICE_ATTR:
3715 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3716 case KVM_GET_DEVICE_ATTR:
3717 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3718 case KVM_HAS_DEVICE_ATTR:
3719 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3720 default:
3721 if (dev->ops->ioctl)
3722 return dev->ops->ioctl(dev, ioctl, arg);
3723
3724 return -ENOTTY;
3725 }
3726}
3727
3728static int kvm_device_release(struct inode *inode, struct file *filp)
3729{
3730 struct kvm_device *dev = filp->private_data;
3731 struct kvm *kvm = dev->kvm;
3732
3733 if (dev->ops->release) {
3734 mutex_lock(&kvm->lock);
3735 list_del(&dev->vm_node);
3736 dev->ops->release(dev);
3737 mutex_unlock(&kvm->lock);
3738 }
3739
3740 kvm_put_kvm(kvm);
3741 return 0;
3742}
3743
3744static const struct file_operations kvm_device_fops = {
3745 .unlocked_ioctl = kvm_device_ioctl,
3746 .release = kvm_device_release,
3747 KVM_COMPAT(kvm_device_ioctl),
3748 .mmap = kvm_device_mmap,
3749};
3750
3751struct kvm_device *kvm_device_from_filp(struct file *filp)
3752{
3753 if (filp->f_op != &kvm_device_fops)
3754 return NULL;
3755
3756 return filp->private_data;
3757}
3758
3759static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3760#ifdef CONFIG_KVM_MPIC
3761 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
3762 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
3763#endif
3764};
3765
3766int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
3767{
3768 if (type >= ARRAY_SIZE(kvm_device_ops_table))
3769 return -ENOSPC;
3770
3771 if (kvm_device_ops_table[type] != NULL)
3772 return -EEXIST;
3773
3774 kvm_device_ops_table[type] = ops;
3775 return 0;
3776}
3777
3778void kvm_unregister_device_ops(u32 type)
3779{
3780 if (kvm_device_ops_table[type] != NULL)
3781 kvm_device_ops_table[type] = NULL;
3782}
3783
3784static int kvm_ioctl_create_device(struct kvm *kvm,
3785 struct kvm_create_device *cd)
3786{
3787 const struct kvm_device_ops *ops = NULL;
3788 struct kvm_device *dev;
3789 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
3790 int type;
3791 int ret;
3792
3793 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
3794 return -ENODEV;
3795
3796 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
3797 ops = kvm_device_ops_table[type];
3798 if (ops == NULL)
3799 return -ENODEV;
3800
3801 if (test)
3802 return 0;
3803
3804 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
3805 if (!dev)
3806 return -ENOMEM;
3807
3808 dev->ops = ops;
3809 dev->kvm = kvm;
3810
3811 mutex_lock(&kvm->lock);
3812 ret = ops->create(dev, type);
3813 if (ret < 0) {
3814 mutex_unlock(&kvm->lock);
3815 kfree(dev);
3816 return ret;
3817 }
3818 list_add(&dev->vm_node, &kvm->devices);
3819 mutex_unlock(&kvm->lock);
3820
3821 if (ops->init)
3822 ops->init(dev);
3823
3824 kvm_get_kvm(kvm);
3825 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
3826 if (ret < 0) {
3827 kvm_put_kvm_no_destroy(kvm);
3828 mutex_lock(&kvm->lock);
3829 list_del(&dev->vm_node);
3830 mutex_unlock(&kvm->lock);
3831 ops->destroy(dev);
3832 return ret;
3833 }
3834
3835 cd->fd = ret;
3836 return 0;
3837}
3838
3839static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
3840{
3841 switch (arg) {
3842 case KVM_CAP_USER_MEMORY:
3843 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
3844 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
3845 case KVM_CAP_INTERNAL_ERROR_DATA:
3846#ifdef CONFIG_HAVE_KVM_MSI
3847 case KVM_CAP_SIGNAL_MSI:
3848#endif
3849#ifdef CONFIG_HAVE_KVM_IRQFD
3850 case KVM_CAP_IRQFD:
3851 case KVM_CAP_IRQFD_RESAMPLE:
3852#endif
3853 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
3854 case KVM_CAP_CHECK_EXTENSION_VM:
3855 case KVM_CAP_ENABLE_CAP_VM:
3856 case KVM_CAP_HALT_POLL:
3857 return 1;
3858#ifdef CONFIG_KVM_MMIO
3859 case KVM_CAP_COALESCED_MMIO:
3860 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3861 case KVM_CAP_COALESCED_PIO:
3862 return 1;
3863#endif
3864#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3865 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3866 return KVM_DIRTY_LOG_MANUAL_CAPS;
3867#endif
3868#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3869 case KVM_CAP_IRQ_ROUTING:
3870 return KVM_MAX_IRQ_ROUTES;
3871#endif
3872#if KVM_ADDRESS_SPACE_NUM > 1
3873 case KVM_CAP_MULTI_ADDRESS_SPACE:
3874 return KVM_ADDRESS_SPACE_NUM;
3875#endif
3876 case KVM_CAP_NR_MEMSLOTS:
3877 return KVM_USER_MEM_SLOTS;
3878 case KVM_CAP_DIRTY_LOG_RING:
3879#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3880 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
3881#else
3882 return 0;
3883#endif
3884 default:
3885 break;
3886 }
3887 return kvm_vm_ioctl_check_extension(kvm, arg);
3888}
3889
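/*
 * Enable the per-vCPU dirty ring: the requested size must be a non-zero
 * power of two, at least a page, large enough for the reserved entries and
 * no larger than KVM_DIRTY_RING_MAX_ENTRIES, and it can only be set once,
 * before any vCPU is created.
 */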
3890static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
3891{
3892 int r;
3893
3894 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
3895 return -EINVAL;
3896
3897
3898 if (!size || (size & (size - 1)))
3899 return -EINVAL;
3900
3901
3902 if (size < kvm_dirty_ring_get_rsvd_entries() *
3903 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
3904 return -EINVAL;
3905
3906 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
3907 sizeof(struct kvm_dirty_gfn))
3908 return -E2BIG;
3909
3910
3911 if (kvm->dirty_ring_size)
3912 return -EINVAL;
3913
3914 mutex_lock(&kvm->lock);
3915
3916 if (kvm->created_vcpus) {
3917
3918 r = -EINVAL;
3919 } else {
3920 kvm->dirty_ring_size = size;
3921 r = 0;
3922 }
3923
3924 mutex_unlock(&kvm->lock);
3925 return r;
3926}
3927
3928static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
3929{
3930 int i;
3931 struct kvm_vcpu *vcpu;
3932 int cleared = 0;
3933
3934 if (!kvm->dirty_ring_size)
3935 return -EINVAL;
3936
3937 mutex_lock(&kvm->slots_lock);
3938
3939 kvm_for_each_vcpu(i, vcpu, kvm)
3940 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
3941
3942 mutex_unlock(&kvm->slots_lock);
3943
3944 if (cleared)
3945 kvm_flush_remote_tlbs(kvm);
3946
3947 return cleared;
3948}
3949
3950int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3951 struct kvm_enable_cap *cap)
3952{
3953 return -EINVAL;
3954}
3955
3956static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3957 struct kvm_enable_cap *cap)
3958{
3959 switch (cap->cap) {
3960#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3961 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
3962 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
3963
3964 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
3965 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
3966
3967 if (cap->flags || (cap->args[0] & ~allowed_options))
3968 return -EINVAL;
3969 kvm->manual_dirty_log_protect = cap->args[0];
3970 return 0;
3971 }
3972#endif
3973 case KVM_CAP_HALT_POLL: {
3974 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
3975 return -EINVAL;
3976
3977 kvm->max_halt_poll_ns = cap->args[0];
3978 return 0;
3979 }
3980 case KVM_CAP_DIRTY_LOG_RING:
3981 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
3982 default:
3983 return kvm_vm_ioctl_enable_cap(kvm, cap);
3984 }
3985}
3986
3987static long kvm_vm_ioctl(struct file *filp,
3988 unsigned int ioctl, unsigned long arg)
3989{
3990 struct kvm *kvm = filp->private_data;
3991 void __user *argp = (void __user *)arg;
3992 int r;
3993
3994 if (kvm->mm != current->mm)
3995 return -EIO;
3996 switch (ioctl) {
3997 case KVM_CREATE_VCPU:
3998 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3999 break;
4000 case KVM_ENABLE_CAP: {
4001 struct kvm_enable_cap cap;
4002
4003 r = -EFAULT;
4004 if (copy_from_user(&cap, argp, sizeof(cap)))
4005 goto out;
4006 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4007 break;
4008 }
4009 case KVM_SET_USER_MEMORY_REGION: {
4010 struct kvm_userspace_memory_region kvm_userspace_mem;
4011
4012 r = -EFAULT;
4013 if (copy_from_user(&kvm_userspace_mem, argp,
4014 sizeof(kvm_userspace_mem)))
4015 goto out;
4016
4017 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4018 break;
4019 }
4020 case KVM_GET_DIRTY_LOG: {
4021 struct kvm_dirty_log log;
4022
4023 r = -EFAULT;
4024 if (copy_from_user(&log, argp, sizeof(log)))
4025 goto out;
4026 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4027 break;
4028 }
4029#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4030 case KVM_CLEAR_DIRTY_LOG: {
4031 struct kvm_clear_dirty_log log;
4032
4033 r = -EFAULT;
4034 if (copy_from_user(&log, argp, sizeof(log)))
4035 goto out;
4036 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4037 break;
4038 }
4039#endif
4040#ifdef CONFIG_KVM_MMIO
4041 case KVM_REGISTER_COALESCED_MMIO: {
4042 struct kvm_coalesced_mmio_zone zone;
4043
4044 r = -EFAULT;
4045 if (copy_from_user(&zone, argp, sizeof(zone)))
4046 goto out;
4047 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4048 break;
4049 }
4050 case KVM_UNREGISTER_COALESCED_MMIO: {
4051 struct kvm_coalesced_mmio_zone zone;
4052
4053 r = -EFAULT;
4054 if (copy_from_user(&zone, argp, sizeof(zone)))
4055 goto out;
4056 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4057 break;
4058 }
4059#endif
4060 case KVM_IRQFD: {
4061 struct kvm_irqfd data;
4062
4063 r = -EFAULT;
4064 if (copy_from_user(&data, argp, sizeof(data)))
4065 goto out;
4066 r = kvm_irqfd(kvm, &data);
4067 break;
4068 }
4069 case KVM_IOEVENTFD: {
4070 struct kvm_ioeventfd data;
4071
4072 r = -EFAULT;
4073 if (copy_from_user(&data, argp, sizeof(data)))
4074 goto out;
4075 r = kvm_ioeventfd(kvm, &data);
4076 break;
4077 }
4078#ifdef CONFIG_HAVE_KVM_MSI
4079 case KVM_SIGNAL_MSI: {
4080 struct kvm_msi msi;
4081
4082 r = -EFAULT;
4083 if (copy_from_user(&msi, argp, sizeof(msi)))
4084 goto out;
4085 r = kvm_send_userspace_msi(kvm, &msi);
4086 break;
4087 }
4088#endif
4089#ifdef __KVM_HAVE_IRQ_LINE
4090 case KVM_IRQ_LINE_STATUS:
4091 case KVM_IRQ_LINE: {
4092 struct kvm_irq_level irq_event;
4093
4094 r = -EFAULT;
4095 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4096 goto out;
4097
4098 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4099 ioctl == KVM_IRQ_LINE_STATUS);
4100 if (r)
4101 goto out;
4102
4103 r = -EFAULT;
4104 if (ioctl == KVM_IRQ_LINE_STATUS) {
4105 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4106 goto out;
4107 }
4108
4109 r = 0;
4110 break;
4111 }
4112#endif
4113#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4114 case KVM_SET_GSI_ROUTING: {
4115 struct kvm_irq_routing routing;
4116 struct kvm_irq_routing __user *urouting;
4117 struct kvm_irq_routing_entry *entries = NULL;
4118
4119 r = -EFAULT;
4120 if (copy_from_user(&routing, argp, sizeof(routing)))
4121 goto out;
4122 r = -EINVAL;
4123 if (!kvm_arch_can_set_irq_routing(kvm))
4124 goto out;
4125 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4126 goto out;
4127 if (routing.flags)
4128 goto out;
4129 if (routing.nr) {
4130 urouting = argp;
4131 entries = vmemdup_user(urouting->entries,
4132 array_size(sizeof(*entries),
4133 routing.nr));
4134 if (IS_ERR(entries)) {
4135 r = PTR_ERR(entries);
4136 goto out;
4137 }
4138 }
4139 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4140 routing.flags);
4141 kvfree(entries);
4142 break;
4143 }
4144#endif
4145 case KVM_CREATE_DEVICE: {
4146 struct kvm_create_device cd;
4147
4148 r = -EFAULT;
4149 if (copy_from_user(&cd, argp, sizeof(cd)))
4150 goto out;
4151
4152 r = kvm_ioctl_create_device(kvm, &cd);
4153 if (r)
4154 goto out;
4155
4156 r = -EFAULT;
4157 if (copy_to_user(argp, &cd, sizeof(cd)))
4158 goto out;
4159
4160 r = 0;
4161 break;
4162 }
4163 case KVM_CHECK_EXTENSION:
4164 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4165 break;
4166 case KVM_RESET_DIRTY_RINGS:
4167 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4168 break;
4169 default:
4170 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4171 }
4172out:
4173 return r;
4174}
4175
4176#ifdef CONFIG_KVM_COMPAT
4177struct compat_kvm_dirty_log {
4178 __u32 slot;
4179 __u32 padding1;
4180 union {
4181 compat_uptr_t dirty_bitmap;
4182 __u64 padding2;
4183 };
4184};
4185
4186static long kvm_vm_compat_ioctl(struct file *filp,
4187 unsigned int ioctl, unsigned long arg)
4188{
4189 struct kvm *kvm = filp->private_data;
4190 int r;
4191
4192 if (kvm->mm != current->mm)
4193 return -EIO;
4194 switch (ioctl) {
4195 case KVM_GET_DIRTY_LOG: {
4196 struct compat_kvm_dirty_log compat_log;
4197 struct kvm_dirty_log log;
4198
4199 if (copy_from_user(&compat_log, (void __user *)arg,
4200 sizeof(compat_log)))
4201 return -EFAULT;
4202 log.slot = compat_log.slot;
4203 log.padding1 = compat_log.padding1;
4204 log.padding2 = compat_log.padding2;
4205 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4206
4207 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4208 break;
4209 }
4210 default:
4211 r = kvm_vm_ioctl(filp, ioctl, arg);
4212 }
4213 return r;
4214}
4215#endif
4216
4217static struct file_operations kvm_vm_fops = {
4218 .release = kvm_vm_release,
4219 .unlocked_ioctl = kvm_vm_ioctl,
4220 .llseek = noop_llseek,
4221 KVM_COMPAT(kvm_vm_compat_ioctl),
4222};
4223
4224bool file_is_kvm(struct file *file)
4225{
4226 return file && file->f_op == &kvm_vm_fops;
4227}
4228EXPORT_SYMBOL_GPL(file_is_kvm);
4229
4230static int kvm_dev_ioctl_create_vm(unsigned long type)
4231{
4232 int r;
4233 struct kvm *kvm;
4234 struct file *file;
4235
4236 kvm = kvm_create_vm(type);
4237 if (IS_ERR(kvm))
4238 return PTR_ERR(kvm);
4239#ifdef CONFIG_KVM_MMIO
4240 r = kvm_coalesced_mmio_init(kvm);
4241 if (r < 0)
4242 goto put_kvm;
4243#endif
4244 r = get_unused_fd_flags(O_CLOEXEC);
4245 if (r < 0)
4246 goto put_kvm;
4247
4248 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4249 if (IS_ERR(file)) {
4250 put_unused_fd(r);
4251 r = PTR_ERR(file);
4252 goto put_kvm;
4253 }
4254
4255
4256
4257
4258
4259
4260
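	/*
	 * The reference on kvm is now owned by the file: if debugfs creation
	 * fails, the final fput() drops it through kvm_vm_release(), so do
	 * not call kvm_put_kvm() here.
	 */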
4261 if (kvm_create_vm_debugfs(kvm, r) < 0) {
4262 put_unused_fd(r);
4263 fput(file);
4264 return -ENOMEM;
4265 }
4266 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4267
4268 fd_install(r, file);
4269 return r;
4270
4271put_kvm:
4272 kvm_put_kvm(kvm);
4273 return r;
4274}
4275
4276static long kvm_dev_ioctl(struct file *filp,
4277 unsigned int ioctl, unsigned long arg)
4278{
4279 long r = -EINVAL;
4280
4281 switch (ioctl) {
4282 case KVM_GET_API_VERSION:
4283 if (arg)
4284 goto out;
4285 r = KVM_API_VERSION;
4286 break;
4287 case KVM_CREATE_VM:
4288 r = kvm_dev_ioctl_create_vm(arg);
4289 break;
4290 case KVM_CHECK_EXTENSION:
4291 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4292 break;
4293 case KVM_GET_VCPU_MMAP_SIZE:
4294 if (arg)
4295 goto out;
4296 r = PAGE_SIZE;
4297#ifdef CONFIG_X86
4298 r += PAGE_SIZE;
4299#endif
4300#ifdef CONFIG_KVM_MMIO
4301 r += PAGE_SIZE;
4302#endif
4303 break;
4304 case KVM_TRACE_ENABLE:
4305 case KVM_TRACE_PAUSE:
4306 case KVM_TRACE_DISABLE:
4307 r = -EOPNOTSUPP;
4308 break;
4309 default:
4310 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4311 }
4312out:
4313 return r;
4314}
4315
4316static struct file_operations kvm_chardev_ops = {
4317 .unlocked_ioctl = kvm_dev_ioctl,
4318 .llseek = noop_llseek,
4319 KVM_COMPAT(kvm_dev_ioctl),
4320};
4321
4322static struct miscdevice kvm_dev = {
4323 KVM_MINOR,
4324 "kvm",
4325 &kvm_chardev_ops,
4326};
4327
4328static void hardware_enable_nolock(void *junk)
4329{
4330 int cpu = raw_smp_processor_id();
4331 int r;
4332
4333 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4334 return;
4335
4336 cpumask_set_cpu(cpu, cpus_hardware_enabled);
4337
4338 r = kvm_arch_hardware_enable();
4339
4340 if (r) {
4341 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4342 atomic_inc(&hardware_enable_failed);
4343 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4344 }
4345}
4346
4347static int kvm_starting_cpu(unsigned int cpu)
4348{
4349 raw_spin_lock(&kvm_count_lock);
4350 if (kvm_usage_count)
4351 hardware_enable_nolock(NULL);
4352 raw_spin_unlock(&kvm_count_lock);
4353 return 0;
4354}
4355
4356static void hardware_disable_nolock(void *junk)
4357{
4358 int cpu = raw_smp_processor_id();
4359
4360 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4361 return;
4362 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4363 kvm_arch_hardware_disable();
4364}
4365
4366static int kvm_dying_cpu(unsigned int cpu)
4367{
4368 raw_spin_lock(&kvm_count_lock);
4369 if (kvm_usage_count)
4370 hardware_disable_nolock(NULL);
4371 raw_spin_unlock(&kvm_count_lock);
4372 return 0;
4373}
4374
4375static void hardware_disable_all_nolock(void)
4376{
4377 BUG_ON(!kvm_usage_count);
4378
4379 kvm_usage_count--;
4380 if (!kvm_usage_count)
4381 on_each_cpu(hardware_disable_nolock, NULL, 1);
4382}
4383
4384static void hardware_disable_all(void)
4385{
4386 raw_spin_lock(&kvm_count_lock);
4387 hardware_disable_all_nolock();
4388 raw_spin_unlock(&kvm_count_lock);
4389}
4390
4391static int hardware_enable_all(void)
4392{
4393 int r = 0;
4394
4395 raw_spin_lock(&kvm_count_lock);
4396
4397 kvm_usage_count++;
4398 if (kvm_usage_count == 1) {
4399 atomic_set(&hardware_enable_failed, 0);
4400 on_each_cpu(hardware_enable_nolock, NULL, 1);
4401
4402 if (atomic_read(&hardware_enable_failed)) {
4403 hardware_disable_all_nolock();
4404 r = -EBUSY;
4405 }
4406 }
4407
4408 raw_spin_unlock(&kvm_count_lock);
4409
4410 return r;
4411}
4412
4413static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4414 void *v)
4415{
4416
4417
4418
4419
4420
4421
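	/*
	 * Disable hardware virtualization on every CPU before the reboot so
	 * no processor is left in VMX/SVM operation when firmware takes over.
	 */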
4422 pr_info("kvm: exiting hardware virtualization\n");
4423 kvm_rebooting = true;
4424 on_each_cpu(hardware_disable_nolock, NULL, 1);
4425 return NOTIFY_OK;
4426}
4427
4428static struct notifier_block kvm_reboot_notifier = {
4429 .notifier_call = kvm_reboot,
4430 .priority = 0,
4431};
4432
4433static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4434{
4435 int i;
4436
4437 for (i = 0; i < bus->dev_count; i++) {
4438 struct kvm_io_device *pos = bus->range[i].dev;
4439
4440 kvm_iodevice_destructor(pos);
4441 }
4442 kfree(bus);
4443}
4444
4445static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4446 const struct kvm_io_range *r2)
4447{
4448 gpa_t addr1 = r1->addr;
4449 gpa_t addr2 = r2->addr;
4450
4451 if (addr1 < addr2)
4452 return -1;
4453
4454
4455
4456
4457
4458
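	/*
	 * With a zero-length r2 only the start addresses are compared;
	 * otherwise the range ends are compared too, so "equal" means r1
	 * lies entirely within r2.
	 */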
4459 if (r2->len) {
4460 addr1 += r1->len;
4461 addr2 += r2->len;
4462 }
4463
4464 if (addr1 > addr2)
4465 return 1;
4466
4467 return 0;
4468}
4469
4470static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4471{
4472 return kvm_io_bus_cmp(p1, p2);
4473}
4474
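/*
 * Binary-search the sorted bus->range[] for the first device whose range
 * covers [addr, addr + len); returns its index, or -ENOENT if none matches.
 */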
4475static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4476 gpa_t addr, int len)
4477{
4478 struct kvm_io_range *range, key;
4479 int off;
4480
4481 key = (struct kvm_io_range) {
4482 .addr = addr,
4483 .len = len,
4484 };
4485
4486 range = bsearch(&key, bus->range, bus->dev_count,
4487 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4488 if (range == NULL)
4489 return -ENOENT;
4490
4491 off = range - bus->range;
4492
4493 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4494 off--;
4495
4496 return off;
4497}
4498
4499static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4500 struct kvm_io_range *range, const void *val)
4501{
4502 int idx;
4503
4504 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4505 if (idx < 0)
4506 return -EOPNOTSUPP;
4507
4508 while (idx < bus->dev_count &&
4509 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4510 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4511 range->len, val))
4512 return idx;
4513 idx++;
4514 }
4515
4516 return -EOPNOTSUPP;
4517}
4518
4519
4520int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4521 int len, const void *val)
4522{
4523 struct kvm_io_bus *bus;
4524 struct kvm_io_range range;
4525 int r;
4526
4527 range = (struct kvm_io_range) {
4528 .addr = addr,
4529 .len = len,
4530 };
4531
4532 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4533 if (!bus)
4534 return -ENOMEM;
4535 r = __kvm_io_bus_write(vcpu, bus, &range, val);
4536 return r < 0 ? r : 0;
4537}
4538EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4539
4540
4541int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4542 gpa_t addr, int len, const void *val, long cookie)
4543{
4544 struct kvm_io_bus *bus;
4545 struct kvm_io_range range;
4546
4547 range = (struct kvm_io_range) {
4548 .addr = addr,
4549 .len = len,
4550 };
4551
4552 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4553 if (!bus)
4554 return -ENOMEM;
4555
4556
4557 if ((cookie >= 0) && (cookie < bus->dev_count) &&
4558 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4559 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4560 val))
4561 return cookie;
4562
4563
4564
4565
4566
4567 return __kvm_io_bus_write(vcpu, bus, &range, val);
4568}
4569
4570static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4571 struct kvm_io_range *range, void *val)
4572{
4573 int idx;
4574
4575 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4576 if (idx < 0)
4577 return -EOPNOTSUPP;
4578
4579 while (idx < bus->dev_count &&
4580 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4581 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4582 range->len, val))
4583 return idx;
4584 idx++;
4585 }
4586
4587 return -EOPNOTSUPP;
4588}
4589
4590
4591int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4592 int len, void *val)
4593{
4594 struct kvm_io_bus *bus;
4595 struct kvm_io_range range;
4596 int r;
4597
4598 range = (struct kvm_io_range) {
4599 .addr = addr,
4600 .len = len,
4601 };
4602
4603 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4604 if (!bus)
4605 return -ENOMEM;
4606 r = __kvm_io_bus_read(vcpu, bus, &range, val);
4607 return r < 0 ? r : 0;
4608}
4609
4610
4611int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4612 int len, struct kvm_io_device *dev)
4613{
4614 int i;
4615 struct kvm_io_bus *new_bus, *bus;
4616 struct kvm_io_range range;
4617
4618 bus = kvm_get_bus(kvm, bus_idx);
4619 if (!bus)
4620 return -ENOMEM;
4621
4622
4623 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
4624 return -ENOSPC;
4625
4626 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4627 GFP_KERNEL_ACCOUNT);
4628 if (!new_bus)
4629 return -ENOMEM;
4630
4631 range = (struct kvm_io_range) {
4632 .addr = addr,
4633 .len = len,
4634 .dev = dev,
4635 };
4636
4637 for (i = 0; i < bus->dev_count; i++)
4638 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
4639 break;
4640
4641 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4642 new_bus->dev_count++;
4643 new_bus->range[i] = range;
4644 memcpy(new_bus->range + i + 1, bus->range + i,
4645 (bus->dev_count - i) * sizeof(struct kvm_io_range));
4646 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4647 synchronize_srcu_expedited(&kvm->srcu);
4648 kfree(bus);
4649
4650 return 0;
4651}
4652
4653int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4654 struct kvm_io_device *dev)
4655{
4656 int i, j;
4657 struct kvm_io_bus *new_bus, *bus;
4658
4659 lockdep_assert_held(&kvm->slots_lock);
4660
4661 bus = kvm_get_bus(kvm, bus_idx);
4662 if (!bus)
4663 return 0;
4664
4665 for (i = 0; i < bus->dev_count; i++) {
4666 if (bus->range[i].dev == dev) {
4667 break;
4668 }
4669 }
4670
4671 if (i == bus->dev_count)
4672 return 0;
4673
4674 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4675 GFP_KERNEL_ACCOUNT);
4676 if (new_bus) {
4677 memcpy(new_bus, bus, struct_size(bus, range, i));
4678 new_bus->dev_count--;
4679 memcpy(new_bus->range + i, bus->range + i + 1,
4680 flex_array_size(new_bus, range, new_bus->dev_count - i));
4681 }
4682
4683 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4684 synchronize_srcu_expedited(&kvm->srcu);
4685
4686
4687 if (!new_bus) {
4688 pr_err("kvm: failed to shrink bus, removing it completely\n");
4689 for (j = 0; j < bus->dev_count; j++) {
4690 if (j == i)
4691 continue;
4692 kvm_iodevice_destructor(bus->range[j].dev);
4693 }
4694 }
4695
4696 kfree(bus);
4697 return new_bus ? 0 : -ENOMEM;
4698}
4699
4700struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4701 gpa_t addr)
4702{
4703 struct kvm_io_bus *bus;
4704 int dev_idx, srcu_idx;
4705 struct kvm_io_device *iodev = NULL;
4706
4707 srcu_idx = srcu_read_lock(&kvm->srcu);
4708
4709 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
4710 if (!bus)
4711 goto out_unlock;
4712
4713 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
4714 if (dev_idx < 0)
4715 goto out_unlock;
4716
4717 iodev = bus->range[dev_idx].dev;
4718
4719out_unlock:
4720 srcu_read_unlock(&kvm->srcu, srcu_idx);
4721
4722 return iodev;
4723}
4724EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
4725
4726static int kvm_debugfs_open(struct inode *inode, struct file *file,
4727 int (*get)(void *, u64 *), int (*set)(void *, u64),
4728 const char *fmt)
4729{
4730 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4731 inode->i_private;
4732
4733
4734
4735
4736
4737
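	/*
	 * Hold a users_count reference on the VM for as long as the stats
	 * file is open; a VM that is already being torn down fails the open
	 * with -ENOENT instead.
	 */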
4738 if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
4739 return -ENOENT;
4740
4741 if (simple_attr_open(inode, file, get,
4742 KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
4743 ? set : NULL,
4744 fmt)) {
4745 kvm_put_kvm(stat_data->kvm);
4746 return -ENOMEM;
4747 }
4748
4749 return 0;
4750}
4751
4752static int kvm_debugfs_release(struct inode *inode, struct file *file)
4753{
4754 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4755 inode->i_private;
4756
4757 simple_attr_release(inode, file);
4758 kvm_put_kvm(stat_data->kvm);
4759
4760 return 0;
4761}
4762
4763static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
4764{
4765 *val = *(ulong *)((void *)kvm + offset);
4766
4767 return 0;
4768}
4769
4770static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
4771{
4772 *(ulong *)((void *)kvm + offset) = 0;
4773
4774 return 0;
4775}
4776
4777static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
4778{
4779 int i;
4780 struct kvm_vcpu *vcpu;
4781
4782 *val = 0;
4783
4784 kvm_for_each_vcpu(i, vcpu, kvm)
4785 *val += *(u64 *)((void *)vcpu + offset);
4786
4787 return 0;
4788}
4789
4790static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
4791{
4792 int i;
4793 struct kvm_vcpu *vcpu;
4794
4795 kvm_for_each_vcpu(i, vcpu, kvm)
4796 *(u64 *)((void *)vcpu + offset) = 0;
4797
4798 return 0;
4799}
4800
4801static int kvm_stat_data_get(void *data, u64 *val)
4802{
4803 int r = -EFAULT;
4804 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4805
4806 switch (stat_data->dbgfs_item->kind) {
4807 case KVM_STAT_VM:
4808 r = kvm_get_stat_per_vm(stat_data->kvm,
4809 stat_data->dbgfs_item->offset, val);
4810 break;
4811 case KVM_STAT_VCPU:
4812 r = kvm_get_stat_per_vcpu(stat_data->kvm,
4813 stat_data->dbgfs_item->offset, val);
4814 break;
4815 }
4816
4817 return r;
4818}
4819
4820static int kvm_stat_data_clear(void *data, u64 val)
4821{
4822 int r = -EFAULT;
4823 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4824
4825 if (val)
4826 return -EINVAL;
4827
4828 switch (stat_data->dbgfs_item->kind) {
4829 case KVM_STAT_VM:
4830 r = kvm_clear_stat_per_vm(stat_data->kvm,
4831 stat_data->dbgfs_item->offset);
4832 break;
4833 case KVM_STAT_VCPU:
4834 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
4835 stat_data->dbgfs_item->offset);
4836 break;
4837 }
4838
4839 return r;
4840}
4841
4842static int kvm_stat_data_open(struct inode *inode, struct file *file)
4843{
4844 __simple_attr_check_format("%llu\n", 0ull);
4845 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
4846 kvm_stat_data_clear, "%llu\n");
4847}
4848
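/*
 * These fops back the per-VM copies of the statistics, i.e. the files
 * created in each VM's own debugfs directory elsewhere in this file; the
 * system-wide aggregate files created by kvm_init_debug() below use
 * vm_stat_fops and vcpu_stat_fops instead.
 */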
4849static const struct file_operations stat_fops_per_vm = {
4850 .owner = THIS_MODULE,
4851 .open = kvm_stat_data_open,
4852 .release = kvm_debugfs_release,
4853 .read = simple_attr_read,
4854 .write = simple_attr_write,
4855 .llseek = no_llseek,
4856};
4857
4858static int vm_stat_get(void *_offset, u64 *val)
4859{
4860 unsigned offset = (long)_offset;
4861 struct kvm *kvm;
4862 u64 tmp_val;
4863
4864 *val = 0;
4865 mutex_lock(&kvm_lock);
4866 list_for_each_entry(kvm, &vm_list, vm_list) {
4867 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
4868 *val += tmp_val;
4869 }
4870 mutex_unlock(&kvm_lock);
4871 return 0;
4872}
4873
4874static int vm_stat_clear(void *_offset, u64 val)
4875{
4876 unsigned offset = (long)_offset;
4877 struct kvm *kvm;
4878
4879 if (val)
4880 return -EINVAL;
4881
4882 mutex_lock(&kvm_lock);
4883 list_for_each_entry(kvm, &vm_list, vm_list) {
4884 kvm_clear_stat_per_vm(kvm, offset);
4885 }
4886 mutex_unlock(&kvm_lock);
4887
4888 return 0;
4889}
4890
4891DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
4892
4893static int vcpu_stat_get(void *_offset, u64 *val)
4894{
4895 unsigned offset = (long)_offset;
4896 struct kvm *kvm;
4897 u64 tmp_val;
4898
4899 *val = 0;
4900 mutex_lock(&kvm_lock);
4901 list_for_each_entry(kvm, &vm_list, vm_list) {
4902 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
4903 *val += tmp_val;
4904 }
4905 mutex_unlock(&kvm_lock);
4906 return 0;
4907}
4908
4909static int vcpu_stat_clear(void *_offset, u64 val)
4910{
4911 unsigned offset = (long)_offset;
4912 struct kvm *kvm;
4913
4914 if (val)
4915 return -EINVAL;
4916
4917 mutex_lock(&kvm_lock);
4918 list_for_each_entry(kvm, &vm_list, vm_list) {
4919 kvm_clear_stat_per_vcpu(kvm, offset);
4920 }
4921 mutex_unlock(&kvm_lock);
4922
4923 return 0;
4924}
4925
4926DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
4927 "%llu\n");
4928
4929static const struct file_operations *stat_fops[] = {
4930 [KVM_STAT_VCPU] = &vcpu_stat_fops,
4931 [KVM_STAT_VM] = &vm_stat_fops,
4932};
4933
4934static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
4935{
4936 struct kobj_uevent_env *env;
4937 unsigned long long created, active;
4938
4939 if (!kvm_dev.this_device || !kvm)
4940 return;
4941
4942 mutex_lock(&kvm_lock);
4943 if (type == KVM_EVENT_CREATE_VM) {
4944 kvm_createvm_count++;
4945 kvm_active_vms++;
4946 } else if (type == KVM_EVENT_DESTROY_VM) {
4947 kvm_active_vms--;
4948 }
4949 created = kvm_createvm_count;
4950 active = kvm_active_vms;
4951 mutex_unlock(&kvm_lock);
4952
4953 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
4954 if (!env)
4955 return;
4956
4957 add_uevent_var(env, "CREATED=%llu", created);
4958 add_uevent_var(env, "COUNT=%llu", active);
4959
4960 if (type == KVM_EVENT_CREATE_VM) {
4961 add_uevent_var(env, "EVENT=create");
4962 kvm->userspace_pid = task_pid_nr(current);
4963 } else if (type == KVM_EVENT_DESTROY_VM) {
4964 add_uevent_var(env, "EVENT=destroy");
4965 }
4966 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
4967
4968 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4969 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
4970
4971 if (p) {
4972 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
4973 if (!IS_ERR(tmp))
4974 add_uevent_var(env, "STATS_PATH=%s", tmp);
4975 kfree(p);
4976 }
4977 }
4978
4979 env->envp[env->envp_idx++] = NULL;
4980 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
4981 kfree(env);
4982}
4983
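/*
 * Note that the debugfs_create_dir()/debugfs_create_file() return values
 * are deliberately ignored: the debugfs API is designed so that callers
 * generally need not check for errors, and KVM works fine without its
 * debugfs entries.
 */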
4984static void kvm_init_debug(void)
4985{
4986 struct kvm_stats_debugfs_item *p;
4987
4988 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4989
4990 kvm_debugfs_num_entries = 0;
4991 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4992 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
4993 kvm_debugfs_dir, (void *)(long)p->offset,
4994 stat_fops[p->kind]);
4995 }
4996}
4997
4998static int kvm_suspend(void)
4999{
5000 if (kvm_usage_count)
5001 hardware_disable_nolock(NULL);
5002 return 0;
5003}
5004
5005static void kvm_resume(void)
5006{
5007 if (kvm_usage_count) {
5008#ifdef CONFIG_LOCKDEP
5009 WARN_ON(lockdep_is_held(&kvm_count_lock));
5010#endif
5011 hardware_enable_nolock(NULL);
5012 }
5013}
5014
5015static struct syscore_ops kvm_syscore_ops = {
5016 .suspend = kvm_suspend,
5017 .resume = kvm_resume,
5018};
5019
5020static inline
5021struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5022{
5023 return container_of(pn, struct kvm_vcpu, preempt_notifier);
5024}
5025
5026static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5027{
5028 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5029
5030 WRITE_ONCE(vcpu->preempted, false);
5031 WRITE_ONCE(vcpu->ready, false);
5032
5033 __this_cpu_write(kvm_running_vcpu, vcpu);
5034 kvm_arch_sched_in(vcpu, cpu);
5035 kvm_arch_vcpu_load(vcpu, cpu);
5036}
5037
5038static void kvm_sched_out(struct preempt_notifier *pn,
5039 struct task_struct *next)
5040{
5041 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5042
5043 if (current->state == TASK_RUNNING) {
5044 WRITE_ONCE(vcpu->preempted, true);
5045 WRITE_ONCE(vcpu->ready, true);
5046 }
5047 kvm_arch_vcpu_put(vcpu);
5048 __this_cpu_write(kvm_running_vcpu, NULL);
5049}
5050
5051/**
5052 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5053 *
5054 * We can disable preemption locally around accessing the per-CPU variable,
5055 * and use the resolved vcpu pointer after enabling preemption again,
5056 * because even if the current thread is migrated to another CPU, reading
5057 * the per-CPU value later will give us the same value as we update the
5058 * per-CPU variable in the preempt notifier handlers.
5059 */
5060struct kvm_vcpu *kvm_get_running_vcpu(void)
5061{
5062 struct kvm_vcpu *vcpu;
5063
5064 preempt_disable();
5065 vcpu = __this_cpu_read(kvm_running_vcpu);
5066 preempt_enable();
5067
5068 return vcpu;
5069}
5070EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5071
5072/**
5073 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5074 */
5075struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5076{
5077 return &kvm_running_vcpu;
5078}
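
/*
 * Illustrative sketch: code that needs the vCPU (if any) currently loaded on
 * another CPU can combine the accessor above with per_cpu_ptr(), relying on
 * its own synchronization against the preempt notifiers; "cpu" and "kvm"
 * below are the caller's own context:
 *
 *	struct kvm_vcpu *vcpu = *per_cpu_ptr(kvm_get_running_vcpus(), cpu);
 *
 *	if (vcpu && vcpu->kvm == kvm)
 *		...that CPU is currently running a vCPU of this VM...
 */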
5079
5080struct kvm_cpu_compat_check {
5081 void *opaque;
5082 int *ret;
5083};
5084
5085static void check_processor_compat(void *data)
5086{
5087 struct kvm_cpu_compat_check *c = data;
5088
5089 *c->ret = kvm_arch_check_processor_compat(c->opaque);
5090}
5091
5092int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5093 struct module *module)
5094{
5095 struct kvm_cpu_compat_check c;
5096 int r;
5097 int cpu;
5098
5099 r = kvm_arch_init(opaque);
5100 if (r)
5101 goto out_fail;
5102
5103	/*
5104	 * kvm_arch_init makes sure there's at most one caller
5105	 * for architectures that support multiple implementations,
5106	 * like intel and amd on x86.
5107	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5108	 * conflicts in case kvm is already setup for another implementation.
5109	 */
5110 r = kvm_irqfd_init();
5111 if (r)
5112 goto out_irqfd;
5113
5114 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5115 r = -ENOMEM;
5116 goto out_free_0;
5117 }
5118
5119 r = kvm_arch_hardware_setup(opaque);
5120 if (r < 0)
5121 goto out_free_1;
5122
5123 c.ret = &r;
5124 c.opaque = opaque;
5125 for_each_online_cpu(cpu) {
5126 smp_call_function_single(cpu, check_processor_compat, &c, 1);
5127 if (r < 0)
5128 goto out_free_2;
5129 }
5130
5131 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5132 kvm_starting_cpu, kvm_dying_cpu);
5133 if (r)
5134 goto out_free_2;
5135 register_reboot_notifier(&kvm_reboot_notifier);
5136
5137	/* A kmem cache lets us meet the alignment requirements of fx_save. */
5138 if (!vcpu_align)
5139 vcpu_align = __alignof__(struct kvm_vcpu);
5140 kvm_vcpu_cache =
5141 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5142 SLAB_ACCOUNT,
5143 offsetof(struct kvm_vcpu, arch),
5144 sizeof_field(struct kvm_vcpu, arch),
5145 NULL);
5146 if (!kvm_vcpu_cache) {
5147 r = -ENOMEM;
5148 goto out_free_3;
5149 }
5150
5151 r = kvm_async_pf_init();
5152 if (r)
5153 goto out_free;
5154
5155 kvm_chardev_ops.owner = module;
5156 kvm_vm_fops.owner = module;
5157 kvm_vcpu_fops.owner = module;
5158
5159 r = misc_register(&kvm_dev);
5160 if (r) {
5161 pr_err("kvm: misc device register failed\n");
5162 goto out_unreg;
5163 }
5164
5165 register_syscore_ops(&kvm_syscore_ops);
5166
5167 kvm_preempt_ops.sched_in = kvm_sched_in;
5168 kvm_preempt_ops.sched_out = kvm_sched_out;
5169
5170 kvm_init_debug();
5171
5172 r = kvm_vfio_ops_init();
5173 WARN_ON(r);
5174
5175 return 0;
5176
5177out_unreg:
5178 kvm_async_pf_deinit();
5179out_free:
5180 kmem_cache_destroy(kvm_vcpu_cache);
5181out_free_3:
5182 unregister_reboot_notifier(&kvm_reboot_notifier);
5183 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5184out_free_2:
5185 kvm_arch_hardware_unsetup();
5186out_free_1:
5187 free_cpumask_var(cpus_hardware_enabled);
5188out_free_0:
5189 kvm_irqfd_exit();
5190out_irqfd:
5191 kvm_arch_exit();
5192out_fail:
5193 return r;
5194}
5195EXPORT_SYMBOL_GPL(kvm_init);
5196
5197void kvm_exit(void)
5198{
5199 debugfs_remove_recursive(kvm_debugfs_dir);
5200 misc_deregister(&kvm_dev);
5201 kmem_cache_destroy(kvm_vcpu_cache);
5202 kvm_async_pf_deinit();
5203 unregister_syscore_ops(&kvm_syscore_ops);
5204 unregister_reboot_notifier(&kvm_reboot_notifier);
5205 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5206 on_each_cpu(hardware_disable_nolock, NULL, 1);
5207 kvm_arch_hardware_unsetup();
5208 kvm_arch_exit();
5209 kvm_irqfd_exit();
5210 free_cpumask_var(cpus_hardware_enabled);
5211 kvm_vfio_ops_exit();
5212}
5213EXPORT_SYMBOL_GPL(kvm_exit);
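
/*
 * Illustrative sketch of how an architecture module is expected to pair
 * kvm_init() and kvm_exit(); the names arch_init_ops, struct vcpu_arch and
 * arch_kvm_init/arch_kvm_exit are placeholders, not symbols defined here.
 * The opaque pointer is forwarded to kvm_arch_init(),
 * kvm_arch_hardware_setup() and kvm_arch_check_processor_compat() above.
 *
 *	static int __init arch_kvm_init(void)
 *	{
 *		return kvm_init(&arch_init_ops, sizeof(struct vcpu_arch),
 *				__alignof__(struct vcpu_arch), THIS_MODULE);
 *	}
 *
 *	static void __exit arch_kvm_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(arch_kvm_init);
 *	module_exit(arch_kvm_exit);
 */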
5214
5215struct kvm_vm_worker_thread_context {
5216 struct kvm *kvm;
5217 struct task_struct *parent;
5218 struct completion init_done;
5219 kvm_vm_thread_fn_t thread_fn;
5220 uintptr_t data;
5221 int err;
5222};
5223
5224static int kvm_vm_worker_thread(void *context)
5225{
5226	/*
5227	 * The init_context is allocated on the stack of the parent thread, so
5228	 * we have to locally copy anything that is needed beyond initialization
5229	 */
5230 struct kvm_vm_worker_thread_context *init_context = context;
5231 struct kvm *kvm = init_context->kvm;
5232 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5233 uintptr_t data = init_context->data;
5234 int err;
5235
5236 err = kthread_park(current);
5237	/* kthread_park(current) is never supposed to return an error */
5238 WARN_ON(err != 0);
5239 if (err)
5240 goto init_complete;
5241
5242 err = cgroup_attach_task_all(init_context->parent, current);
5243 if (err) {
5244 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5245 __func__, err);
5246 goto init_complete;
5247 }
5248
5249 set_user_nice(current, task_nice(init_context->parent));
5250
5251init_complete:
5252 init_context->err = err;
5253 complete(&init_context->init_done);
5254 init_context = NULL;
5255
5256 if (err)
5257 return err;
5258
5259	/* Wait to be woken up by the spawner before proceeding. */
5260 kthread_parkme();
5261
5262 if (!kthread_should_stop())
5263 err = thread_fn(kvm, data);
5264
5265 return err;
5266}
5267
5268int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5269 uintptr_t data, const char *name,
5270 struct task_struct **thread_ptr)
5271{
5272 struct kvm_vm_worker_thread_context init_context = {};
5273 struct task_struct *thread;
5274
5275 *thread_ptr = NULL;
5276 init_context.kvm = kvm;
5277 init_context.parent = current;
5278 init_context.thread_fn = thread_fn;
5279 init_context.data = data;
5280 init_completion(&init_context.init_done);
5281
5282 thread = kthread_run(kvm_vm_worker_thread, &init_context,
5283 "%s-%d", name, task_pid_nr(current));
5284 if (IS_ERR(thread))
5285 return PTR_ERR(thread);
5286
5287	/* kthread_run is never supposed to return NULL */
5288 WARN_ON(thread == NULL);
5289
5290 wait_for_completion(&init_context.init_done);
5291
5292 if (!init_context.err)
5293 *thread_ptr = thread;
5294
5295 return init_context.err;
5296}
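
/*
 * Illustrative sketch (hypothetical caller): the worker parks itself before
 * it ever calls thread_fn, so a typical user unparks the thread once its own
 * setup is complete and stops it when the VM is torn down. "my_worker_fn"
 * and "worker" are placeholder names:
 *
 *	struct task_struct *worker;
 *	int err;
 *
 *	err = kvm_vm_create_worker_thread(kvm, my_worker_fn, 0,
 *					  "kvm-my-worker", &worker);
 *	if (!err)
 *		kthread_unpark(worker);
 *	...
 *	if (worker)
 *		kthread_stop(worker);
 */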
5297