// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine (KVM) driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns by */
unsigned int halt_poll_ns_grow_start = 10000;
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default (0) resets per-vcpu halt_poll_ns when shrinking. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

100DEFINE_MUTEX(kvm_lock);
101static DEFINE_RAW_SPINLOCK(kvm_count_lock);
102LIST_HEAD(vm_list);
103
104static cpumask_var_t cpus_hardware_enabled;
105static int kvm_usage_count;
106static atomic_t hardware_enable_failed;
107
108static struct kmem_cache *kvm_vcpu_cache;
109
110static __read_mostly struct preempt_ops kvm_preempt_ops;
111static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
112
113struct dentry *kvm_debugfs_dir;
114EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
115
116static int kvm_debugfs_num_entries;
117static const struct file_operations stat_fops_per_vm;
118
119static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
120 unsigned long arg);
121#ifdef CONFIG_KVM_COMPAT
122static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
123 unsigned long arg);
124#define KVM_COMPAT(c) .compat_ioctl = (c)
125#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
133static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
134 unsigned long arg) { return -EINVAL; }
135
136static int kvm_no_compat_open(struct inode *inode, struct file *file)
137{
138 return is_compat_task() ? -ENODEV : 0;
139}
140#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
141 .open = kvm_no_compat_open
142#endif
143static int hardware_enable_all(void);
144static void hardware_disable_all(void);
145
146static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
147
148__visible bool kvm_rebooting;
149EXPORT_SYMBOL_GPL(kvm_rebooting);
150
151#define KVM_EVENT_CREATE_VM 0
152#define KVM_EVENT_DESTROY_VM 1
153static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
154static unsigned long long kvm_createvm_count;
155static unsigned long long kvm_active_vms;
156
157__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
158 unsigned long start, unsigned long end)
159{
160}
161
162bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
163{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
170 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
171 return false;
172
173 return is_zone_device_page(pfn_to_page(pfn));
174}
175
176bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
177{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
183 if (pfn_valid(pfn))
184 return PageReserved(pfn_to_page(pfn)) &&
185 !is_zero_pfn(pfn) &&
186 !kvm_is_zone_device_pfn(pfn);
187
188 return true;
189}
190
191bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
192{
193 struct page *page = pfn_to_page(pfn);
194
195 if (!PageTransCompoundMap(page))
196 return false;
197
198 return is_transparent_hugepage(compound_head(page));
199}
200
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
204void vcpu_load(struct kvm_vcpu *vcpu)
205{
206 int cpu = get_cpu();
207
208 __this_cpu_write(kvm_running_vcpu, vcpu);
209 preempt_notifier_register(&vcpu->preempt_notifier);
210 kvm_arch_vcpu_load(vcpu, cpu);
211 put_cpu();
212}
213EXPORT_SYMBOL_GPL(vcpu_load);
214
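/*
 * Switches out the currently loaded vcpu; pairs with a preceding vcpu_load().
 */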
215void vcpu_put(struct kvm_vcpu *vcpu)
216{
217 preempt_disable();
218 kvm_arch_vcpu_put(vcpu);
219 preempt_notifier_unregister(&vcpu->preempt_notifier);
220 __this_cpu_write(kvm_running_vcpu, NULL);
221 preempt_enable();
222}
223EXPORT_SYMBOL_GPL(vcpu_put);
224
/* Return true if delivering @req to @vcpu requires sending it an IPI. */
226static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
227{
228 int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
234 if (req & KVM_REQUEST_WAIT)
235 return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
240 return mode == IN_GUEST_MODE;
241}
242
243static void ack_flush(void *_completed)
244{
245}
246
247static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
248{
249 if (unlikely(!cpus))
250 cpus = cpu_online_mask;
251
252 if (cpumask_empty(cpus))
253 return false;
254
255 smp_call_function_many(cpus, ack_flush, NULL, wait);
256 return true;
257}
258
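/*
 * Make request @req on every vCPU selected by @vcpu_bitmap (or on all vCPUs
 * when @vcpu_bitmap is NULL), skipping @except, and kick the targets that
 * need an IPI.  @tmp, if provided, is used to build the set of CPUs to kick.
 * Returns whether an IPI was actually sent.
 */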
259bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
260 struct kvm_vcpu *except,
261 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
262{
263 int i, cpu, me;
264 struct kvm_vcpu *vcpu;
265 bool called;
266
267 me = get_cpu();
268
269 kvm_for_each_vcpu(i, vcpu, kvm) {
270 if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
271 vcpu == except)
272 continue;
273
274 kvm_make_request(req, vcpu);
275 cpu = vcpu->cpu;
276
277 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
278 continue;
279
280 if (tmp != NULL && cpu != -1 && cpu != me &&
281 kvm_request_needs_ipi(vcpu, req))
282 __cpumask_set_cpu(cpu, tmp);
283 }
284
285 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
286 put_cpu();
287
288 return called;
289}
290
291bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
292 struct kvm_vcpu *except)
293{
294 cpumask_var_t cpus;
295 bool called;
296
297 zalloc_cpumask_var(&cpus, GFP_ATOMIC);
298
299 called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
300
301 free_cpumask_var(cpus);
302 return called;
303}
304
305bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
306{
307 return kvm_make_all_cpus_request_except(kvm, req, NULL);
308}
309
310#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
311void kvm_flush_remote_tlbs(struct kvm *kvm)
312{
	/*
	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
	 * kvm_make_all_cpus_request.
	 */
317 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode. Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
	 * barrier here.
	 */
330 if (!kvm_arch_flush_remote_tlb(kvm)
331 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
332 ++kvm->stat.remote_tlb_flush;
333 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
334}
335EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
336#endif
337
338void kvm_reload_remote_mmus(struct kvm *kvm)
339{
340 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
341}
342
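/*
 * Helpers for the MMU memory caches.  Architectures that define
 * KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE pre-fill these caches from a sleepable
 * context via kvm_mmu_topup_memory_cache() so that objects can later be
 * taken with kvm_mmu_memory_cache_alloc() from atomic contexts.
 */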
343#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
344static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
345 gfp_t gfp_flags)
346{
347 gfp_flags |= mc->gfp_zero;
348
349 if (mc->kmem_cache)
350 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
351 else
352 return (void *)__get_free_page(gfp_flags);
353}
354
355int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
356{
357 void *obj;
358
359 if (mc->nobjs >= min)
360 return 0;
361 while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
362 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
363 if (!obj)
364 return mc->nobjs >= min ? 0 : -ENOMEM;
365 mc->objects[mc->nobjs++] = obj;
366 }
367 return 0;
368}
369
370int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
371{
372 return mc->nobjs;
373}
374
375void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
376{
377 while (mc->nobjs) {
378 if (mc->kmem_cache)
379 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
380 else
381 free_page((unsigned long)mc->objects[--mc->nobjs]);
382 }
383}
384
385void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
386{
387 void *p;
388
389 if (WARN_ON(!mc->nobjs))
390 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
391 else
392 p = mc->objects[--mc->nobjs];
393 BUG_ON(!p);
394 return p;
395}
396#endif
397
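/*
 * Perform the common, architecture-independent initialization of a new vCPU.
 */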
398static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
399{
400 mutex_init(&vcpu->mutex);
401 vcpu->cpu = -1;
402 vcpu->kvm = kvm;
403 vcpu->vcpu_id = id;
404 vcpu->pid = NULL;
405 rcuwait_init(&vcpu->wait);
406 kvm_async_pf_vcpu_init(vcpu);
407
408 vcpu->pre_pcpu = -1;
409 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
410
411 kvm_vcpu_set_in_spin_loop(vcpu, false);
412 kvm_vcpu_set_dy_eligible(vcpu, false);
413 vcpu->preempted = false;
414 vcpu->ready = false;
415 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
416}
417
418void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
419{
420 kvm_dirty_ring_free(&vcpu->dirty_ring);
421 kvm_arch_vcpu_destroy(vcpu);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
428 put_pid(rcu_dereference_protected(vcpu->pid, 1));
429
430 free_page((unsigned long)vcpu->run);
431 kmem_cache_free(kvm_vcpu_cache, vcpu);
432}
433EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
434
435#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
436static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
437{
438 return container_of(mn, struct kvm, mmu_notifier);
439}
440
441static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
442 struct mm_struct *mm,
443 unsigned long start, unsigned long end)
444{
445 struct kvm *kvm = mmu_notifier_to_kvm(mn);
446 int idx;
447
448 idx = srcu_read_lock(&kvm->srcu);
449 kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
450 srcu_read_unlock(&kvm->srcu, idx);
451}
452
453static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
454 struct mm_struct *mm,
455 unsigned long address,
456 pte_t pte)
457{
458 struct kvm *kvm = mmu_notifier_to_kvm(mn);
459 int idx;
460
461 idx = srcu_read_lock(&kvm->srcu);
462 spin_lock(&kvm->mmu_lock);
463 kvm->mmu_notifier_seq++;
464
465 if (kvm_set_spte_hva(kvm, address, pte))
466 kvm_flush_remote_tlbs(kvm);
467
468 spin_unlock(&kvm->mmu_lock);
469 srcu_read_unlock(&kvm->srcu, idx);
470}
471
472static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
473 const struct mmu_notifier_range *range)
474{
475 struct kvm *kvm = mmu_notifier_to_kvm(mn);
476 int need_tlb_flush = 0, idx;
477
478 idx = srcu_read_lock(&kvm->srcu);
479 spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
485 kvm->mmu_notifier_count++;
486 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
487 range->flags);
488
489 if (need_tlb_flush || kvm->tlbs_dirty)
490 kvm_flush_remote_tlbs(kvm);
491
492 spin_unlock(&kvm->mmu_lock);
493 srcu_read_unlock(&kvm->srcu, idx);
494
495 return 0;
496}
497
498static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
499 const struct mmu_notifier_range *range)
500{
501 struct kvm *kvm = mmu_notifier_to_kvm(mn);
502
503 spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
509 kvm->mmu_notifier_seq++;
510 smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
516 kvm->mmu_notifier_count--;
517 spin_unlock(&kvm->mmu_lock);
518
519 BUG_ON(kvm->mmu_notifier_count < 0);
520}
521
522static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
523 struct mm_struct *mm,
524 unsigned long start,
525 unsigned long end)
526{
527 struct kvm *kvm = mmu_notifier_to_kvm(mn);
528 int young, idx;
529
530 idx = srcu_read_lock(&kvm->srcu);
531 spin_lock(&kvm->mmu_lock);
532
533 young = kvm_age_hva(kvm, start, end);
534 if (young)
535 kvm_flush_remote_tlbs(kvm);
536
537 spin_unlock(&kvm->mmu_lock);
538 srcu_read_unlock(&kvm->srcu, idx);
539
540 return young;
541}
542
543static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
544 struct mm_struct *mm,
545 unsigned long start,
546 unsigned long end)
547{
548 struct kvm *kvm = mmu_notifier_to_kvm(mn);
549 int young, idx;
550
551 idx = srcu_read_lock(&kvm->srcu);
552 spin_lock(&kvm->mmu_lock);
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * lifecycle management.
	 */
566 young = kvm_age_hva(kvm, start, end);
567 spin_unlock(&kvm->mmu_lock);
568 srcu_read_unlock(&kvm->srcu, idx);
569
570 return young;
571}
572
573static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
574 struct mm_struct *mm,
575 unsigned long address)
576{
577 struct kvm *kvm = mmu_notifier_to_kvm(mn);
578 int young, idx;
579
580 idx = srcu_read_lock(&kvm->srcu);
581 spin_lock(&kvm->mmu_lock);
582 young = kvm_test_age_hva(kvm, address);
583 spin_unlock(&kvm->mmu_lock);
584 srcu_read_unlock(&kvm->srcu, idx);
585
586 return young;
587}
588
589static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
590 struct mm_struct *mm)
591{
592 struct kvm *kvm = mmu_notifier_to_kvm(mn);
593 int idx;
594
595 idx = srcu_read_lock(&kvm->srcu);
596 kvm_arch_flush_shadow_all(kvm);
597 srcu_read_unlock(&kvm->srcu, idx);
598}
599
600static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
601 .invalidate_range = kvm_mmu_notifier_invalidate_range,
602 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
603 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
604 .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
605 .clear_young = kvm_mmu_notifier_clear_young,
606 .test_young = kvm_mmu_notifier_test_young,
607 .change_pte = kvm_mmu_notifier_change_pte,
608 .release = kvm_mmu_notifier_release,
609};
610
611static int kvm_init_mmu_notifier(struct kvm *kvm)
612{
613 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
614 return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
615}
616
617#else
618
619static int kvm_init_mmu_notifier(struct kvm *kvm)
620{
621 return 0;
622}
623
624#endif
625
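/*
 * Allocate an empty memslots structure; every id_to_index entry is set to -1
 * to mark the corresponding slot id as unused.
 */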
626static struct kvm_memslots *kvm_alloc_memslots(void)
627{
628 int i;
629 struct kvm_memslots *slots;
630
631 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
632 if (!slots)
633 return NULL;
634
635 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
636 slots->id_to_index[i] = -1;
637
638 return slots;
639}
640
641static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
642{
643 if (!memslot->dirty_bitmap)
644 return;
645
646 kvfree(memslot->dirty_bitmap);
647 memslot->dirty_bitmap = NULL;
648}
649
650static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
651{
652 kvm_destroy_dirty_bitmap(slot);
653
654 kvm_arch_free_memslot(kvm, slot);
655
656 slot->flags = 0;
657 slot->npages = 0;
658}
659
660static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
661{
662 struct kvm_memory_slot *memslot;
663
664 if (!slots)
665 return;
666
667 kvm_for_each_memslot(memslot, slots)
668 kvm_free_memslot(kvm, memslot);
669
670 kvfree(slots);
671}
672
673static void kvm_destroy_vm_debugfs(struct kvm *kvm)
674{
675 int i;
676
677 if (!kvm->debugfs_dentry)
678 return;
679
680 debugfs_remove_recursive(kvm->debugfs_dentry);
681
682 if (kvm->debugfs_stat_data) {
683 for (i = 0; i < kvm_debugfs_num_entries; i++)
684 kfree(kvm->debugfs_stat_data[i]);
685 kfree(kvm->debugfs_stat_data);
686 }
687}
688
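/*
 * Create the per-VM debugfs directory, named "<pid>-<fd>", and one stats file
 * per debugfs_entries[] item.
 */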
689static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
690{
691 char dir_name[ITOA_MAX_LEN * 2];
692 struct kvm_stat_data *stat_data;
693 struct kvm_stats_debugfs_item *p;
694
695 if (!debugfs_initialized())
696 return 0;
697
698 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
699 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
700
701 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
702 sizeof(*kvm->debugfs_stat_data),
703 GFP_KERNEL_ACCOUNT);
704 if (!kvm->debugfs_stat_data)
705 return -ENOMEM;
706
707 for (p = debugfs_entries; p->name; p++) {
708 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
709 if (!stat_data)
710 return -ENOMEM;
711
712 stat_data->kvm = kvm;
713 stat_data->dbgfs_item = p;
714 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
715 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
716 kvm->debugfs_dentry, stat_data,
717 &stat_fops_per_vm);
718 }
719 return 0;
720}

/*
 * Called after the VM is otherwise initialized, but only just before
 * adding it to the vm_list.
 */
726int __weak kvm_arch_post_init_vm(struct kvm *kvm)
727{
728 return 0;
729}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
735void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
736{
737}
738
739static struct kvm *kvm_create_vm(unsigned long type)
740{
741 struct kvm *kvm = kvm_arch_alloc_vm();
742 int r = -ENOMEM;
743 int i;
744
745 if (!kvm)
746 return ERR_PTR(-ENOMEM);
747
748 spin_lock_init(&kvm->mmu_lock);
749 mmgrab(current->mm);
750 kvm->mm = current->mm;
751 kvm_eventfd_init(kvm);
752 mutex_init(&kvm->lock);
753 mutex_init(&kvm->irq_lock);
754 mutex_init(&kvm->slots_lock);
755 INIT_LIST_HEAD(&kvm->devices);
756
757 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
758
759 if (init_srcu_struct(&kvm->srcu))
760 goto out_err_no_srcu;
761 if (init_srcu_struct(&kvm->irq_srcu))
762 goto out_err_no_irq_srcu;
763
764 refcount_set(&kvm->users_count, 1);
765 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
766 struct kvm_memslots *slots = kvm_alloc_memslots();
767
768 if (!slots)
769 goto out_err_no_arch_destroy_vm;
770
771 slots->generation = i;
772 rcu_assign_pointer(kvm->memslots[i], slots);
773 }
774
775 for (i = 0; i < KVM_NR_BUSES; i++) {
776 rcu_assign_pointer(kvm->buses[i],
777 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
778 if (!kvm->buses[i])
779 goto out_err_no_arch_destroy_vm;
780 }
781
782 kvm->max_halt_poll_ns = halt_poll_ns;
783
784 r = kvm_arch_init_vm(kvm, type);
785 if (r)
786 goto out_err_no_arch_destroy_vm;
787
788 r = hardware_enable_all();
789 if (r)
790 goto out_err_no_disable;
791
792#ifdef CONFIG_HAVE_KVM_IRQFD
793 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
794#endif
795
796 r = kvm_init_mmu_notifier(kvm);
797 if (r)
798 goto out_err_no_mmu_notifier;
799
800 r = kvm_arch_post_init_vm(kvm);
801 if (r)
802 goto out_err;
803
804 mutex_lock(&kvm_lock);
805 list_add(&kvm->vm_list, &vm_list);
806 mutex_unlock(&kvm_lock);
807
808 preempt_notifier_inc();
809
810 return kvm;
811
812out_err:
813#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
814 if (kvm->mmu_notifier.ops)
815 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
816#endif
817out_err_no_mmu_notifier:
818 hardware_disable_all();
819out_err_no_disable:
820 kvm_arch_destroy_vm(kvm);
821out_err_no_arch_destroy_vm:
822 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
823 for (i = 0; i < KVM_NR_BUSES; i++)
824 kfree(kvm_get_bus(kvm, i));
825 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
826 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
827 cleanup_srcu_struct(&kvm->irq_srcu);
828out_err_no_irq_srcu:
829 cleanup_srcu_struct(&kvm->srcu);
830out_err_no_srcu:
831 kvm_arch_free_vm(kvm);
832 mmdrop(current->mm);
833 return ERR_PTR(r);
834}
835
836static void kvm_destroy_devices(struct kvm *kvm)
837{
838 struct kvm_device *dev, *tmp;
839
	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
845 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
846 list_del(&dev->vm_node);
847 dev->ops->destroy(dev);
848 }
849}
850
851static void kvm_destroy_vm(struct kvm *kvm)
852{
853 int i;
854 struct mm_struct *mm = kvm->mm;
855
856 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
857 kvm_destroy_vm_debugfs(kvm);
858 kvm_arch_sync_events(kvm);
859 mutex_lock(&kvm_lock);
860 list_del(&kvm->vm_list);
861 mutex_unlock(&kvm_lock);
862 kvm_arch_pre_destroy_vm(kvm);
863
864 kvm_free_irq_routing(kvm);
865 for (i = 0; i < KVM_NR_BUSES; i++) {
866 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
867
868 if (bus)
869 kvm_io_bus_destroy(bus);
870 kvm->buses[i] = NULL;
871 }
872 kvm_coalesced_mmio_free(kvm);
873#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
874 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
875#else
876 kvm_arch_flush_shadow_all(kvm);
877#endif
878 kvm_arch_destroy_vm(kvm);
879 kvm_destroy_devices(kvm);
880 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
881 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
882 cleanup_srcu_struct(&kvm->irq_srcu);
883 cleanup_srcu_struct(&kvm->srcu);
884 kvm_arch_free_vm(kvm);
885 preempt_notifier_dec();
886 hardware_disable_all();
887 mmdrop(mm);
888}
889
890void kvm_get_kvm(struct kvm *kvm)
891{
892 refcount_inc(&kvm->users_count);
893}
894EXPORT_SYMBOL_GPL(kvm_get_kvm);
895
896void kvm_put_kvm(struct kvm *kvm)
897{
898 if (refcount_dec_and_test(&kvm->users_count))
899 kvm_destroy_vm(kvm);
900}
901EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly drops to zero.
 */
910void kvm_put_kvm_no_destroy(struct kvm *kvm)
911{
912 WARN_ON(refcount_dec_and_test(&kvm->users_count));
913}
914EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
915
916static int kvm_vm_release(struct inode *inode, struct file *filp)
917{
918 struct kvm *kvm = filp->private_data;
919
920 kvm_irqfd_release(kvm);
921
922 kvm_put_kvm(kvm);
923 return 0;
924}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
 */
930static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
931{
932 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
933
934 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
935 if (!memslot->dirty_bitmap)
936 return -ENOMEM;
937
938 return 0;
939}
940

/*
 * Delete a memslot by decrementing the number of used slots and shifting all
 * other entries in the array forward one spot.
 */
945static inline void kvm_memslot_delete(struct kvm_memslots *slots,
946 struct kvm_memory_slot *memslot)
947{
948 struct kvm_memory_slot *mslots = slots->memslots;
949 int i;
950
951 if (WARN_ON(slots->id_to_index[memslot->id] == -1))
952 return;
953
954 slots->used_slots--;
955
956 if (atomic_read(&slots->lru_slot) >= slots->used_slots)
957 atomic_set(&slots->lru_slot, 0);
958
959 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
960 mslots[i] = mslots[i + 1];
961 slots->id_to_index[mslots[i].id] = i;
962 }
963 mslots[i] = *memslot;
964 slots->id_to_index[memslot->id] = -1;
965}
966

/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslots array.
 */
971static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
972{
973 return slots->used_slots++;
974}
975

/*
 * Move the changed memslot backwards in the array by shifting existing slots
 * with a higher GFN toward the front of the array.  Note, the changed memslot
 * itself is not preserved in the array, i.e. not swapped at this time, only
 * its new index into the array is tracked.  Returns the changed memslot's
 * current index into the memslots array.
 */
983static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
984 struct kvm_memory_slot *memslot)
985{
986 struct kvm_memory_slot *mslots = slots->memslots;
987 int i;
988
989 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
990 WARN_ON_ONCE(!slots->used_slots))
991 return -1;

	/*
	 * Move the target memslot backward in the array by shifting existing
	 * memslots with a higher GFN (than the target memslot) towards the
	 * front of the array.
	 */
998 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
999 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1000 break;
1001
1002 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);

		/* Shift the next memslot forward one and update its index. */
1005 mslots[i] = mslots[i + 1];
1006 slots->id_to_index[mslots[i].id] = i;
1007 }
1008 return i;
1009}
1010

/*
 * Move the changed memslot forwards in the array by shifting existing slots
 * with a lower GFN toward the back of the array.  Note, the changed memslot
 * itself is not preserved in the array, i.e. not swapped at this time, only
 * its new index into the array is tracked.  Returns the changed memslot's
 * final index into the memslots array.
 */
1018static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1019 struct kvm_memory_slot *memslot,
1020 int start)
1021{
1022 struct kvm_memory_slot *mslots = slots->memslots;
1023 int i;
1024
1025 for (i = start; i > 0; i--) {
1026 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1027 break;
1028
1029 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);

		/* Shift the next memslot back one and update its index. */
1032 mslots[i] = mslots[i - 1];
1033 slots->id_to_index[mslots[i].id] = i;
1034 }
1035 return i;
1036}
1037

/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows using a binary search during
 * memslot lookup.
 *
 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
 * at memslots[0] has the highest GFN.
 *
 * The sorting algorithm takes advantage of having initially sorted memslots
 * and knowing the position of the changed memslot.  Sorting is also optimized
 * by not swapping the updated memslot and instead only shifting other memslots
 * and tracking the new index for the updated memslot.  Only once its final
 * index is known is the updated memslot copied into its position in the array.
 *
 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
 *    the end of the array (kvm_memslot_delete()).
 *
 *  - When creating a memslot, the array is "padded" with the new memslot such
 *    that it will be moved into its correct sorted position one way or another
 *    (kvm_memslot_insert_back() followed by the move helpers below).
 *
 *  - When moving a memslot, the changed slot is shifted backward or forward
 *    to its new position depending on how its base GFN compares to those of
 *    its neighbors (kvm_memslot_move_backward()/kvm_memslot_move_forward()).
 */
1079static void update_memslots(struct kvm_memslots *slots,
1080 struct kvm_memory_slot *memslot,
1081 enum kvm_mr_change change)
1082{
1083 int i;
1084
1085 if (change == KVM_MR_DELETE) {
1086 kvm_memslot_delete(slots, memslot);
1087 } else {
1088 if (change == KVM_MR_CREATE)
1089 i = kvm_memslot_insert_back(slots);
1090 else
1091 i = kvm_memslot_move_backward(slots, memslot);
1092 i = kvm_memslot_move_forward(slots, memslot, i);

		/*
		 * Copy the memslot to its new position in memslots and update
		 * its index accordingly.
		 */
1098 slots->memslots[i] = *memslot;
1099 slots->id_to_index[memslot->id] = i;
1100 }
1101}
1102
1103static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1104{
1105 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1106
1107#ifdef __KVM_HAVE_READONLY_MEM
1108 valid_flags |= KVM_MEM_READONLY;
1109#endif
1110
1111 if (mem->flags & ~valid_flags)
1112 return -EINVAL;
1113
1114 return 0;
1115}
1116
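/*
 * Publish @slots as the active memslots for address space @as_id and wait for
 * all SRCU readers of the old memslots to finish.  The generation is bumped
 * around the switch so that cached translations can detect the change.
 * Returns the previously active memslots, which the caller must free.
 */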
1117static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1118 int as_id, struct kvm_memslots *slots)
1119{
1120 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1121 u64 gen = old_memslots->generation;
1122
1123 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1124 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1125
1126 rcu_assign_pointer(kvm->memslots[as_id], slots);
1127 synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
1135 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Generations must be unique even across address spaces.  We do not
	 * need a global counter for that, instead the generation space is
	 * evenly split across address spaces and the generations of all
	 * address spaces are incremented in lockstep.
	 */
1144 gen += KVM_ADDRESS_SPACE_NUM;
1145
1146 kvm_arch_memslots_updated(kvm, gen);
1147
1148 slots->generation = gen;
1149
1150 return old_memslots;
1151}

/*
 * Duplicate the current memslots for modification.  For KVM_MR_CREATE the
 * copy is sized for one additional slot; for all other changes the size of
 * the existing array is reused.
 */
1158static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1159 enum kvm_mr_change change)
1160{
1161 struct kvm_memslots *slots;
1162 size_t old_size, new_size;
1163
1164 old_size = sizeof(struct kvm_memslots) +
1165 (sizeof(struct kvm_memory_slot) * old->used_slots);
1166
1167 if (change == KVM_MR_CREATE)
1168 new_size = old_size + sizeof(struct kvm_memory_slot);
1169 else
1170 new_size = old_size;
1171
1172 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1173 if (likely(slots))
1174 memcpy(slots, old, old_size);
1175
1176 return slots;
1177}
1178
1179static int kvm_set_memslot(struct kvm *kvm,
1180 const struct kvm_userspace_memory_region *mem,
1181 struct kvm_memory_slot *old,
1182 struct kvm_memory_slot *new, int as_id,
1183 enum kvm_mr_change change)
1184{
1185 struct kvm_memory_slot *slot;
1186 struct kvm_memslots *slots;
1187 int r;
1188
1189 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1190 if (!slots)
1191 return -ENOMEM;
1192
1193 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
		/*
		 * Note, the INVALID flag needs to be in the appropriate entry
		 * in the freshly allocated memslots, not in @old or @new.
		 */
1198 slot = id_to_memslot(slots, old->id);
1199 slot->flags |= KVM_MEMSLOT_INVALID;

		/*
		 * We can re-use the old memslots, the only difference from the
		 * newly installed memslots is the invalid flag, which will get
		 * dropped by update_memslots anyway.  We'll also revert to the
		 * old memslots if preparing the new memory region fails.
		 */
1207 slots = install_new_memslots(kvm, as_id, slots);

		/*
		 * From this point no new shadow pages pointing to a deleted,
		 * or moved, memslot will be created.  Validation of sl->gfn
		 * happens in:
		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 *	- kvm_is_visible_gfn (mmu_check_root)
		 */
1216 kvm_arch_flush_shadow_memslot(kvm, slot);
1217 }
1218
1219 r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1220 if (r)
1221 goto out_slots;
1222
1223 update_memslots(slots, new, change);
1224 slots = install_new_memslots(kvm, as_id, slots);
1225
1226 kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1227
1228 kvfree(slots);
1229 return 0;
1230
1231out_slots:
1232 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
1233 slots = install_new_memslots(kvm, as_id, slots);
1234 kvfree(slots);
1235 return r;
1236}
1237
1238static int kvm_delete_memslot(struct kvm *kvm,
1239 const struct kvm_userspace_memory_region *mem,
1240 struct kvm_memory_slot *old, int as_id)
1241{
1242 struct kvm_memory_slot new;
1243 int r;
1244
1245 if (!old->npages)
1246 return -EINVAL;
1247
1248 memset(&new, 0, sizeof(new));
1249 new.id = old->id;
1250
1251
1252
1253
1254 new.as_id = as_id;
1255
1256 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1257 if (r)
1258 return r;
1259
1260 kvm_free_memslot(kvm, old);
1261 return 0;
1262}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
1272int __kvm_set_memory_region(struct kvm *kvm,
1273 const struct kvm_userspace_memory_region *mem)
1274{
1275 struct kvm_memory_slot old, new;
1276 struct kvm_memory_slot *tmp;
1277 enum kvm_mr_change change;
1278 int as_id, id;
1279 int r;
1280
1281 r = check_memory_region_flags(mem);
1282 if (r)
1283 return r;
1284
1285 as_id = mem->slot >> 16;
1286 id = (u16)mem->slot;

	/* General sanity checks */
1289 if (mem->memory_size & (PAGE_SIZE - 1))
1290 return -EINVAL;
1291 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1292 return -EINVAL;
1293
1294 if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1295 (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1296 !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1297 mem->memory_size))
1298 return -EINVAL;
1299 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1300 return -EINVAL;
1301 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1302 return -EINVAL;

	/*
	 * Make a full copy of the old memslot, the pointer will become stale
	 * when the memslots are re-sorted by update_memslots(), and the old
	 * memslot needs to be referenced after calling update_memslots(), e.g.
	 * to free its resources and for arch specific behavior.
	 */
1310 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1311 if (tmp) {
1312 old = *tmp;
1313 tmp = NULL;
1314 } else {
1315 memset(&old, 0, sizeof(old));
1316 old.id = id;
1317 }
1318
1319 if (!mem->memory_size)
1320 return kvm_delete_memslot(kvm, mem, &old, as_id);
1321
1322 new.as_id = as_id;
1323 new.id = id;
1324 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1325 new.npages = mem->memory_size >> PAGE_SHIFT;
1326 new.flags = mem->flags;
1327 new.userspace_addr = mem->userspace_addr;
1328
1329 if (new.npages > KVM_MEM_MAX_NR_PAGES)
1330 return -EINVAL;
1331
1332 if (!old.npages) {
1333 change = KVM_MR_CREATE;
1334 new.dirty_bitmap = NULL;
1335 memset(&new.arch, 0, sizeof(new.arch));
1336 } else {
1337 if ((new.userspace_addr != old.userspace_addr) ||
1338 (new.npages != old.npages) ||
1339 ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1340 return -EINVAL;
1341
1342 if (new.base_gfn != old.base_gfn)
1343 change = KVM_MR_MOVE;
1344 else if (new.flags != old.flags)
1345 change = KVM_MR_FLAGS_ONLY;
1346 else
1347 return 0;

		/* Copy dirty_bitmap and arch from the current memslot. */
1350 new.dirty_bitmap = old.dirty_bitmap;
1351 memcpy(&new.arch, &old.arch, sizeof(new.arch));
1352 }
1353
1354 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
1356 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1357 if (tmp->id == id)
1358 continue;
1359 if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1360 (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1361 return -EEXIST;
1362 }
1363 }

	/* Allocate/free page dirty bitmap as needed */
1366 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1367 new.dirty_bitmap = NULL;
1368 else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
1369 r = kvm_alloc_dirty_bitmap(&new);
1370 if (r)
1371 return r;
1372
1373 if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1374 bitmap_set(new.dirty_bitmap, 0, new.npages);
1375 }
1376
1377 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
1378 if (r)
1379 goto out_bitmap;
1380
1381 if (old.dirty_bitmap && !new.dirty_bitmap)
1382 kvm_destroy_dirty_bitmap(&old);
1383 return 0;
1384
1385out_bitmap:
1386 if (new.dirty_bitmap && !old.dirty_bitmap)
1387 kvm_destroy_dirty_bitmap(&new);
1388 return r;
1389}
1390EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1391
1392int kvm_set_memory_region(struct kvm *kvm,
1393 const struct kvm_userspace_memory_region *mem)
1394{
1395 int r;
1396
1397 mutex_lock(&kvm->slots_lock);
1398 r = __kvm_set_memory_region(kvm, mem);
1399 mutex_unlock(&kvm->slots_lock);
1400 return r;
1401}
1402EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1403
1404static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1405 struct kvm_userspace_memory_region *mem)
1406{
1407 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1408 return -EINVAL;
1409
1410 return kvm_set_memory_region(kvm, mem);
1411}
1412
1413#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT

/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
1421int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1422 int *is_dirty, struct kvm_memory_slot **memslot)
1423{
1424 struct kvm_memslots *slots;
1425 int i, as_id, id;
1426 unsigned long n;
1427 unsigned long any = 0;

	/* Dirty ring tracking is exclusive to dirty log tracking */
1430 if (kvm->dirty_ring_size)
1431 return -ENXIO;
1432
1433 *memslot = NULL;
1434 *is_dirty = 0;
1435
1436 as_id = log->slot >> 16;
1437 id = (u16)log->slot;
1438 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1439 return -EINVAL;
1440
1441 slots = __kvm_memslots(kvm, as_id);
1442 *memslot = id_to_memslot(slots, id);
1443 if (!(*memslot) || !(*memslot)->dirty_bitmap)
1444 return -ENOENT;
1445
1446 kvm_arch_sync_dirty_log(kvm, *memslot);
1447
1448 n = kvm_dirty_bitmap_bytes(*memslot);
1449
1450 for (i = 0; !any && i < n/sizeof(long); ++i)
1451 any = (*memslot)->dirty_bitmap[i];
1452
1453 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1454 return -EFAULT;
1455
1456 if (any)
1457 *is_dirty = 1;
1458 return 0;
1459}
1460EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1461
1462#else

/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently. So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Upon return caller flushes TLB's if needed.
 *
 * Between 2 and 4, the guest may write to the page using the remaining TLB
 * entry.  This is not a problem because the page is reported dirty using
 * the snapshot taken before and step 4 ensures that writes done after
 * exiting to userspace will be logged for the next call.
 */
1484static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1485{
1486 struct kvm_memslots *slots;
1487 struct kvm_memory_slot *memslot;
1488 int i, as_id, id;
1489 unsigned long n;
1490 unsigned long *dirty_bitmap;
1491 unsigned long *dirty_bitmap_buffer;
1492 bool flush;

	/* Dirty ring tracking is exclusive to dirty log tracking */
1495 if (kvm->dirty_ring_size)
1496 return -ENXIO;
1497
1498 as_id = log->slot >> 16;
1499 id = (u16)log->slot;
1500 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1501 return -EINVAL;
1502
1503 slots = __kvm_memslots(kvm, as_id);
1504 memslot = id_to_memslot(slots, id);
1505 if (!memslot || !memslot->dirty_bitmap)
1506 return -ENOENT;
1507
1508 dirty_bitmap = memslot->dirty_bitmap;
1509
1510 kvm_arch_sync_dirty_log(kvm, memslot);
1511
1512 n = kvm_dirty_bitmap_bytes(memslot);
1513 flush = false;
1514 if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike kvm_get_dirty_log, we always return false in *flush,
		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
		 * is some code duplication between this function and
		 * kvm_get_dirty_log, but hopefully all architectures
		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
		 * can be eliminated.
		 */
1523 dirty_bitmap_buffer = dirty_bitmap;
1524 } else {
1525 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1526 memset(dirty_bitmap_buffer, 0, n);
1527
1528 spin_lock(&kvm->mmu_lock);
1529 for (i = 0; i < n / sizeof(long); i++) {
1530 unsigned long mask;
1531 gfn_t offset;
1532
1533 if (!dirty_bitmap[i])
1534 continue;
1535
1536 flush = true;
1537 mask = xchg(&dirty_bitmap[i], 0);
1538 dirty_bitmap_buffer[i] = mask;
1539
1540 offset = i * BITS_PER_LONG;
1541 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1542 offset, mask);
1543 }
1544 spin_unlock(&kvm->mmu_lock);
1545 }
1546
1547 if (flush)
1548 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1549
1550 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1551 return -EFAULT;
1552 return 0;
1553}
1554

/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm:	kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * Steps 1-4 below provide a general overview of dirty page logging. See
 * kvm_get_dirty_log_protect() function description for additional details.
 *
 * We call kvm_get_dirty_log_protect() to handle steps 1-4, upon return we
 * always flush the TLB (step 4) even if a previous step failed and the dirty
 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 * writes will be marked dirty for next log read.
 *
 *   1. Take a snapshot of the bit and clear it if needed.
 *   2. Write protect the corresponding page.
 *   3. Flush TLB's if needed.
 *   4. Copy the snapshot to the userspace.
 */
1575static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1576 struct kvm_dirty_log *log)
1577{
1578 int r;
1579
1580 mutex_lock(&kvm->slots_lock);
1581
1582 r = kvm_get_dirty_log_protect(kvm, log);
1583
1584 mutex_unlock(&kvm->slots_lock);
1585 return r;
1586}

/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
1594static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1595 struct kvm_clear_dirty_log *log)
1596{
1597 struct kvm_memslots *slots;
1598 struct kvm_memory_slot *memslot;
1599 int as_id, id;
1600 gfn_t offset;
1601 unsigned long i, n;
1602 unsigned long *dirty_bitmap;
1603 unsigned long *dirty_bitmap_buffer;
1604 bool flush;

	/* Dirty ring tracking is exclusive to dirty log tracking */
1607 if (kvm->dirty_ring_size)
1608 return -ENXIO;
1609
1610 as_id = log->slot >> 16;
1611 id = (u16)log->slot;
1612 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1613 return -EINVAL;
1614
1615 if (log->first_page & 63)
1616 return -EINVAL;
1617
1618 slots = __kvm_memslots(kvm, as_id);
1619 memslot = id_to_memslot(slots, id);
1620 if (!memslot || !memslot->dirty_bitmap)
1621 return -ENOENT;
1622
1623 dirty_bitmap = memslot->dirty_bitmap;
1624
1625 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
1626
1627 if (log->first_page > memslot->npages ||
1628 log->num_pages > memslot->npages - log->first_page ||
1629 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
1630 return -EINVAL;
1631
1632 kvm_arch_sync_dirty_log(kvm, memslot);
1633
1634 flush = false;
1635 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1636 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
1637 return -EFAULT;
1638
1639 spin_lock(&kvm->mmu_lock);
1640 for (offset = log->first_page, i = offset / BITS_PER_LONG,
1641 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
1642 i++, offset += BITS_PER_LONG) {
1643 unsigned long mask = *dirty_bitmap_buffer++;
1644 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
1645 if (!mask)
1646 continue;
1647
1648 mask &= atomic_long_fetch_andnot(mask, p);
1649
1650
1651
1652
1653
1654
1655
1656 if (mask) {
1657 flush = true;
1658 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1659 offset, mask);
1660 }
1661 }
1662 spin_unlock(&kvm->mmu_lock);
1663
1664 if (flush)
1665 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1666
1667 return 0;
1668}
1669
1670static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
1671 struct kvm_clear_dirty_log *log)
1672{
1673 int r;
1674
1675 mutex_lock(&kvm->slots_lock);
1676
1677 r = kvm_clear_dirty_log_protect(kvm, log);
1678
1679 mutex_unlock(&kvm->slots_lock);
1680 return r;
1681}
1682#endif
1683
1684struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1685{
1686 return __gfn_to_memslot(kvm_memslots(kvm), gfn);
1687}
1688EXPORT_SYMBOL_GPL(gfn_to_memslot);
1689
1690struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
1691{
1692 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
1693}
1694EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
1695
1696bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
1697{
1698 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
1699
1700 return kvm_is_visible_memslot(memslot);
1701}
1702EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1703
1704bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1705{
1706 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1707
1708 return kvm_is_visible_memslot(memslot);
1709}
1710EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
1711
1712unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
1713{
1714 struct vm_area_struct *vma;
1715 unsigned long addr, size;
1716
1717 size = PAGE_SIZE;
1718
1719 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
1720 if (kvm_is_error_hva(addr))
1721 return PAGE_SIZE;
1722
1723 mmap_read_lock(current->mm);
1724 vma = find_vma(current->mm, addr);
1725 if (!vma)
1726 goto out;
1727
1728 size = vma_kernel_pagesize(vma);
1729
1730out:
1731 mmap_read_unlock(current->mm);
1732
1733 return size;
1734}
1735
1736static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1737{
1738 return slot->flags & KVM_MEM_READONLY;
1739}
1740
1741static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1742 gfn_t *nr_pages, bool write)
1743{
1744 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1745 return KVM_HVA_ERR_BAD;
1746
1747 if (memslot_is_readonly(slot) && write)
1748 return KVM_HVA_ERR_RO_BAD;
1749
1750 if (nr_pages)
1751 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1752
1753 return __gfn_to_hva_memslot(slot, gfn);
1754}
1755
1756static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1757 gfn_t *nr_pages)
1758{
1759 return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1760}
1761
1762unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1763 gfn_t gfn)
1764{
1765 return gfn_to_hva_many(slot, gfn, NULL);
1766}
1767EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1768
1769unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1770{
1771 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1772}
1773EXPORT_SYMBOL_GPL(gfn_to_hva);
1774
1775unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
1776{
1777 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
1778}
1779EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Return the hva of a @gfn and the R/W attribute if possible.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
1789unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
1790 gfn_t gfn, bool *writable)
1791{
1792 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
1793
1794 if (!kvm_is_error_hva(hva) && writable)
1795 *writable = !memslot_is_readonly(slot);
1796
1797 return hva;
1798}
1799
1800unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1801{
1802 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1803
1804 return gfn_to_hva_memslot_prot(slot, gfn, writable);
1805}
1806
1807unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
1808{
1809 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1810
1811 return gfn_to_hva_memslot_prot(slot, gfn, writable);
1812}
1813
1814static inline int check_user_page_hwpoison(unsigned long addr)
1815{
1816 int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
1817
1818 rc = get_user_pages(addr, 1, flags, NULL, NULL);
1819 return rc == -EHWPOISON;
1820}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that runs if we can get the pfn in atomic context.
 */
1827static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
1828 bool *writable, kvm_pfn_t *pfn)
1829{
1830 struct page *page[1];

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
1837 if (!(write_fault || writable))
1838 return false;
1839
1840 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
1841 *pfn = page_to_pfn(page[0]);
1842
1843 if (writable)
1844 *writable = true;
1845 return true;
1846 }
1847
1848 return false;
1849}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
1855static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1856 bool *writable, kvm_pfn_t *pfn)
1857{
1858 unsigned int flags = FOLL_HWPOISON;
1859 struct page *page;
1860 int npages = 0;
1861
1862 might_sleep();
1863
1864 if (writable)
1865 *writable = write_fault;
1866
1867 if (write_fault)
1868 flags |= FOLL_WRITE;
1869 if (async)
1870 flags |= FOLL_NOWAIT;
1871
1872 npages = get_user_pages_unlocked(addr, 1, &page, flags);
1873 if (npages != 1)
1874 return npages;

	/* map read fault as writable if possible */
1877 if (unlikely(!write_fault) && writable) {
1878 struct page *wpage;
1879
1880 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
1881 *writable = true;
1882 put_page(page);
1883 page = wpage;
1884 }
1885 }
1886 *pfn = page_to_pfn(page);
1887 return npages;
1888}
1889
1890static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1891{
1892 if (unlikely(!(vma->vm_flags & VM_READ)))
1893 return false;
1894
1895 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1896 return false;
1897
1898 return true;
1899}
1900
1901static int hva_to_pfn_remapped(struct vm_area_struct *vma,
1902 unsigned long addr, bool *async,
1903 bool write_fault, bool *writable,
1904 kvm_pfn_t *p_pfn)
1905{
1906 unsigned long pfn;
1907 int r;
1908
1909 r = follow_pfn(vma, addr, &pfn);
1910 if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
1915 bool unlocked = false;
1916 r = fixup_user_fault(current->mm, addr,
1917 (write_fault ? FAULT_FLAG_WRITE : 0),
1918 &unlocked);
1919 if (unlocked)
1920 return -EAGAIN;
1921 if (r)
1922 return r;
1923
1924 r = follow_pfn(vma, addr, &pfn);
1925 if (r)
1926 return r;
1927
1928 }
1929
1930 if (writable)
1931 *writable = true;

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
	 * set, not if the unmapped area has VM_PFNMAP: for a reserved
	 * (VM_PFNMAP) pfn, kvm_get_pfn() and the later release degenerate
	 * to no-ops, so the get/put pair is harmless either way.
	 */
1944 kvm_get_pfn(pfn);
1945
1946 *p_pfn = pfn;
1947 return 0;
1948}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether this function can sleep
 * @async: whether this function need to wait IO complete if the
 *         host page is not in the memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 */
1964static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1965 bool write_fault, bool *writable)
1966{
1967 struct vm_area_struct *vma;
1968 kvm_pfn_t pfn = 0;
1969 int npages, r;

	/* we can do it either atomically or asynchronously, not both */
1972 BUG_ON(atomic && async);
1973
1974 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
1975 return pfn;
1976
1977 if (atomic)
1978 return KVM_PFN_ERR_FAULT;
1979
1980 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1981 if (npages == 1)
1982 return pfn;
1983
1984 mmap_read_lock(current->mm);
1985 if (npages == -EHWPOISON ||
1986 (!async && check_user_page_hwpoison(addr))) {
1987 pfn = KVM_PFN_ERR_HWPOISON;
1988 goto exit;
1989 }
1990
1991retry:
1992 vma = find_vma_intersection(current->mm, addr, addr + 1);
1993
1994 if (vma == NULL)
1995 pfn = KVM_PFN_ERR_FAULT;
1996 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
1997 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
1998 if (r == -EAGAIN)
1999 goto retry;
2000 if (r < 0)
2001 pfn = KVM_PFN_ERR_FAULT;
2002 } else {
2003 if (async && vma_is_valid(vma, write_fault))
2004 *async = true;
2005 pfn = KVM_PFN_ERR_FAULT;
2006 }
2007exit:
2008 mmap_read_unlock(current->mm);
2009 return pfn;
2010}
2011
2012kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2013 bool atomic, bool *async, bool write_fault,
2014 bool *writable)
2015{
2016 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2017
2018 if (addr == KVM_HVA_ERR_RO_BAD) {
2019 if (writable)
2020 *writable = false;
2021 return KVM_PFN_ERR_RO_FAULT;
2022 }
2023
2024 if (kvm_is_error_hva(addr)) {
2025 if (writable)
2026 *writable = false;
2027 return KVM_PFN_NOSLOT;
2028 }

	/* Do not map writable pfn in the readonly memslot. */
2031 if (writable && memslot_is_readonly(slot)) {
2032 *writable = false;
2033 writable = NULL;
2034 }
2035
2036 return hva_to_pfn(addr, atomic, async, write_fault,
2037 writable);
2038}
2039EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2040
2041kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2042 bool *writable)
2043{
2044 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2045 write_fault, writable);
2046}
2047EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2048
2049kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2050{
2051 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
2052}
2053EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2054
2055kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2056{
2057 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
2058}
2059EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2060
2061kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2062{
2063 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2064}
2065EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2066
2067kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2068{
2069 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2070}
2071EXPORT_SYMBOL_GPL(gfn_to_pfn);
2072
2073kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2074{
2075 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2076}
2077EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2078
2079int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2080 struct page **pages, int nr_pages)
2081{
2082 unsigned long addr;
2083 gfn_t entry = 0;
2084
2085 addr = gfn_to_hva_many(slot, gfn, &entry);
2086 if (kvm_is_error_hva(addr))
2087 return -1;
2088
2089 if (entry < nr_pages)
2090 return 0;
2091
2092 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2093}
2094EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2095
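/*
 * Translate a pfn to its struct page, returning KVM_ERR_PTR_BAD_PAGE for
 * error pfns and (with a WARN) for reserved pfns that have no usable page.
 */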
2096static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2097{
2098 if (is_error_noslot_pfn(pfn))
2099 return KVM_ERR_PTR_BAD_PAGE;
2100
2101 if (kvm_is_reserved_pfn(pfn)) {
2102 WARN_ON(1);
2103 return KVM_ERR_PTR_BAD_PAGE;
2104 }
2105
2106 return pfn_to_page(pfn);
2107}
2108
2109struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2110{
2111 kvm_pfn_t pfn;
2112
2113 pfn = gfn_to_pfn(kvm, gfn);
2114
2115 return kvm_pfn_to_page(pfn);
2116}
2117EXPORT_SYMBOL_GPL(gfn_to_page);
2118
2119void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
2120{
2121 if (pfn == 0)
2122 return;
2123
2124 if (cache)
2125 cache->pfn = cache->gfn = 0;
2126
2127 if (dirty)
2128 kvm_release_pfn_dirty(pfn);
2129 else
2130 kvm_release_pfn_clean(pfn);
2131}
2132
2133static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
2134 struct gfn_to_pfn_cache *cache, u64 gen)
2135{
2136 kvm_release_pfn(cache->pfn, cache->dirty, cache);
2137
2138 cache->pfn = gfn_to_pfn_memslot(slot, gfn);
2139 cache->gfn = gfn;
2140 cache->dirty = false;
2141 cache->generation = gen;
2142}
2143
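/*
 * Map @gfn into the kernel address space.  Regular pages are mapped through
 * kmap()/kmap_atomic(); with CONFIG_HAS_IOMEM, memory without a struct page
 * is mapped through memremap() in non-atomic context.  When @cache is given,
 * a previously cached pfn for the same gfn and memslot generation is reused.
 */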
2144static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
2145 struct kvm_host_map *map,
2146 struct gfn_to_pfn_cache *cache,
2147 bool atomic)
2148{
2149 kvm_pfn_t pfn;
2150 void *hva = NULL;
2151 struct page *page = KVM_UNMAPPED_PAGE;
2152 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
2153 u64 gen = slots->generation;
2154
2155 if (!map)
2156 return -EINVAL;
2157
2158 if (cache) {
2159 if (!cache->pfn || cache->gfn != gfn ||
2160 cache->generation != gen) {
2161 if (atomic)
2162 return -EAGAIN;
2163 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
2164 }
2165 pfn = cache->pfn;
2166 } else {
2167 if (atomic)
2168 return -EAGAIN;
2169 pfn = gfn_to_pfn_memslot(slot, gfn);
2170 }
2171 if (is_error_noslot_pfn(pfn))
2172 return -EINVAL;
2173
2174 if (pfn_valid(pfn)) {
2175 page = pfn_to_page(pfn);
2176 if (atomic)
2177 hva = kmap_atomic(page);
2178 else
2179 hva = kmap(page);
2180#ifdef CONFIG_HAS_IOMEM
2181 } else if (!atomic) {
2182 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2183 } else {
2184 return -EINVAL;
2185#endif
2186 }
2187
2188 if (!hva)
2189 return -EFAULT;
2190
2191 map->page = page;
2192 map->hva = hva;
2193 map->pfn = pfn;
2194 map->gfn = gfn;
2195
2196 return 0;
2197}
2198
2199int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
2200 struct gfn_to_pfn_cache *cache, bool atomic)
2201{
2202 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
2203 cache, atomic);
2204}
2205EXPORT_SYMBOL_GPL(kvm_map_gfn);
2206
2207int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2208{
2209 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
2210 NULL, false);
2211}
2212EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2213
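/*
 * Illustrative pairing of kvm_vcpu_map()/kvm_vcpu_unmap(); this is a sketch,
 * not code taken from an in-tree caller, and "data"/"len" are placeholders:
 *
 *	struct kvm_host_map map;
 *
 *	if (!kvm_vcpu_map(vcpu, gfn, &map)) {
 *		memcpy(map.hva, data, len);		// access the guest page
 *		kvm_vcpu_unmap(vcpu, &map, true);	// true: mark page dirty
 *	}
 */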
2214static void __kvm_unmap_gfn(struct kvm *kvm,
2215 struct kvm_memory_slot *memslot,
2216 struct kvm_host_map *map,
2217 struct gfn_to_pfn_cache *cache,
2218 bool dirty, bool atomic)
2219{
2220 if (!map)
2221 return;
2222
2223 if (!map->hva)
2224 return;
2225
2226 if (map->page != KVM_UNMAPPED_PAGE) {
2227 if (atomic)
2228 kunmap_atomic(map->hva);
2229 else
2230 kunmap(map->page);
2231 }
2232#ifdef CONFIG_HAS_IOMEM
2233 else if (!atomic)
2234 memunmap(map->hva);
2235 else
2236 WARN_ONCE(1, "Unexpected unmapping in atomic context");
2237#endif
2238
2239 if (dirty)
2240 mark_page_dirty_in_slot(kvm, memslot, map->gfn);
2241
2242 if (cache)
2243 cache->dirty |= dirty;
2244 else
2245 kvm_release_pfn(map->pfn, dirty, NULL);
2246
2247 map->hva = NULL;
2248 map->page = NULL;
2249}
2250
2251int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
2252 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
2253{
2254 __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
2255 cache, dirty, atomic);
2256 return 0;
2257}
2258EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2259
2260void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2261{
2262 __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
2263 map, NULL, dirty, false);
2264}
2265EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2266
2267struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2268{
2269 kvm_pfn_t pfn;
2270
2271 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2272
2273 return kvm_pfn_to_page(pfn);
2274}
2275EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2276
2277void kvm_release_page_clean(struct page *page)
2278{
2279 WARN_ON(is_error_page(page));
2280
2281 kvm_release_pfn_clean(page_to_pfn(page));
2282}
2283EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2284
2285void kvm_release_pfn_clean(kvm_pfn_t pfn)
2286{
2287 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2288 put_page(pfn_to_page(pfn));
2289}
2290EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2291
2292void kvm_release_page_dirty(struct page *page)
2293{
2294 WARN_ON(is_error_page(page));
2295
2296 kvm_release_pfn_dirty(page_to_pfn(page));
2297}
2298EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2299
2300void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2301{
2302 kvm_set_pfn_dirty(pfn);
2303 kvm_release_pfn_clean(pfn);
2304}
2305EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2306
2307void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2308{
2309 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2310 SetPageDirty(pfn_to_page(pfn));
2311}
2312EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2313
2314void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2315{
2316 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2317 mark_page_accessed(pfn_to_page(pfn));
2318}
2319EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2320
2321void kvm_get_pfn(kvm_pfn_t pfn)
2322{
2323 if (!kvm_is_reserved_pfn(pfn))
2324 get_page(pfn_to_page(pfn));
2325}
2326EXPORT_SYMBOL_GPL(kvm_get_pfn);
2327
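/* Number of bytes of @len that fit in the page starting at @offset. */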
2328static int next_segment(unsigned long len, int offset)
2329{
2330 if (len > PAGE_SIZE - offset)
2331 return PAGE_SIZE - offset;
2332 else
2333 return len;
2334}
2335
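/*
 * Read @len bytes at @offset within guest frame @gfn into @data, going
 * through the memslot's host virtual address.
 */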
2336static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2337 void *data, int offset, int len)
2338{
2339 int r;
2340 unsigned long addr;
2341
2342 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2343 if (kvm_is_error_hva(addr))
2344 return -EFAULT;
2345 r = __copy_from_user(data, (void __user *)addr + offset, len);
2346 if (r)
2347 return -EFAULT;
2348 return 0;
2349}
2350
2351int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2352 int len)
2353{
2354 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2355
2356 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2357}
2358EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2359
2360int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2361 int offset, int len)
2362{
2363 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2364
2365 return __kvm_read_guest_page(slot, gfn, data, offset, len);
2366}
2367EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2368
2369int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2370{
2371 gfn_t gfn = gpa >> PAGE_SHIFT;
2372 int seg;
2373 int offset = offset_in_page(gpa);
2374 int ret;
2375
2376 while ((seg = next_segment(len, offset)) != 0) {
2377 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2378 if (ret < 0)
2379 return ret;
2380 offset = 0;
2381 len -= seg;
2382 data += seg;
2383 ++gfn;
2384 }
2385 return 0;
2386}
2387EXPORT_SYMBOL_GPL(kvm_read_guest);
2388
2389int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2390{
2391 gfn_t gfn = gpa >> PAGE_SHIFT;
2392 int seg;
2393 int offset = offset_in_page(gpa);
2394 int ret;
2395
2396 while ((seg = next_segment(len, offset)) != 0) {
2397 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2398 if (ret < 0)
2399 return ret;
2400 offset = 0;
2401 len -= seg;
2402 data += seg;
2403 ++gfn;
2404 }
2405 return 0;
2406}
2407EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2408
2409static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2410 void *data, int offset, unsigned long len)
2411{
2412 int r;
2413 unsigned long addr;
2414
2415 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2416 if (kvm_is_error_hva(addr))
2417 return -EFAULT;
2418 pagefault_disable();
2419 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2420 pagefault_enable();
2421 if (r)
2422 return -EFAULT;
2423 return 0;
2424}
2425
2426int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2427 void *data, unsigned long len)
2428{
2429 gfn_t gfn = gpa >> PAGE_SHIFT;
2430 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2431 int offset = offset_in_page(gpa);
2432
2433 return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2434}
2435EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2436
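/*
 * Write @len bytes from @data at @offset within guest frame @gfn and mark
 * the page dirty in its memslot.
 */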
2437static int __kvm_write_guest_page(struct kvm *kvm,
2438 struct kvm_memory_slot *memslot, gfn_t gfn,
2439 const void *data, int offset, int len)
2440{
2441 int r;
2442 unsigned long addr;
2443
2444 addr = gfn_to_hva_memslot(memslot, gfn);
2445 if (kvm_is_error_hva(addr))
2446 return -EFAULT;
2447 r = __copy_to_user((void __user *)addr + offset, data, len);
2448 if (r)
2449 return -EFAULT;
2450 mark_page_dirty_in_slot(kvm, memslot, gfn);
2451 return 0;
2452}
2453
2454int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2455 const void *data, int offset, int len)
2456{
2457 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2458
2459 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2460}
2461EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2462
2463int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2464 const void *data, int offset, int len)
2465{
2466 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2467
2468 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2469}
2470EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2471
2472int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2473 unsigned long len)
2474{
2475 gfn_t gfn = gpa >> PAGE_SHIFT;
2476 int seg;
2477 int offset = offset_in_page(gpa);
2478 int ret;
2479
2480 while ((seg = next_segment(len, offset)) != 0) {
2481 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2482 if (ret < 0)
2483 return ret;
2484 offset = 0;
2485 len -= seg;
2486 data += seg;
2487 ++gfn;
2488 }
2489 return 0;
2490}
2491EXPORT_SYMBOL_GPL(kvm_write_guest);
2492
2493int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2494 unsigned long len)
2495{
2496 gfn_t gfn = gpa >> PAGE_SHIFT;
2497 int seg;
2498 int offset = offset_in_page(gpa);
2499 int ret;
2500
2501 while ((seg = next_segment(len, offset)) != 0) {
2502 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2503 if (ret < 0)
2504 return ret;
2505 offset = 0;
2506 len -= seg;
2507 data += seg;
2508 ++gfn;
2509 }
2510 return 0;
2511}
2512EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2513
2514static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2515 struct gfn_to_hva_cache *ghc,
2516 gpa_t gpa, unsigned long len)
2517{
2518 int offset = offset_in_page(gpa);
2519 gfn_t start_gfn = gpa >> PAGE_SHIFT;
2520 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2521 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2522 gfn_t nr_pages_avail;
2523
2524
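        /* Update ghc->generation before performing any error checks. */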
2525 ghc->generation = slots->generation;
2526
2527 if (start_gfn > end_gfn) {
2528 ghc->hva = KVM_HVA_ERR_BAD;
2529 return -EINVAL;
2530 }
2531
2532
2533
2534
2535
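        /*
         * If the requested region crosses two memslots, we still
         * verify that the entire region is valid here.
         */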
2536 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2537 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2538 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2539 &nr_pages_avail);
2540 if (kvm_is_error_hva(ghc->hva))
2541 return -EFAULT;
2542 }
2543
2544
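        /* Use the slow path for cross page reads and writes. */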
2545 if (nr_pages_needed == 1)
2546 ghc->hva += offset;
2547 else
2548 ghc->memslot = NULL;
2549
2550 ghc->gpa = gpa;
2551 ghc->len = len;
2552 return 0;
2553}
2554
2555int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2556 gpa_t gpa, unsigned long len)
2557{
2558 struct kvm_memslots *slots = kvm_memslots(kvm);
2559 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2560}
2561EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2562
2563int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2564 void *data, unsigned int offset,
2565 unsigned long len)
2566{
2567 struct kvm_memslots *slots = kvm_memslots(kvm);
2568 int r;
2569 gpa_t gpa = ghc->gpa + offset;
2570
2571 BUG_ON(len + offset > ghc->len);
2572
2573 if (slots->generation != ghc->generation) {
2574 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2575 return -EFAULT;
2576 }
2577
2578 if (kvm_is_error_hva(ghc->hva))
2579 return -EFAULT;
2580
2581 if (unlikely(!ghc->memslot))
2582 return kvm_write_guest(kvm, gpa, data, len);
2583
2584 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2585 if (r)
2586 return -EFAULT;
2587 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
2588
2589 return 0;
2590}
2591EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2592
2593int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2594 void *data, unsigned long len)
2595{
2596 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2597}
2598EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
2599
2600int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2601 void *data, unsigned int offset,
2602 unsigned long len)
2603{
2604 struct kvm_memslots *slots = kvm_memslots(kvm);
2605 int r;
2606 gpa_t gpa = ghc->gpa + offset;
2607
2608 BUG_ON(len + offset > ghc->len);
2609
2610 if (slots->generation != ghc->generation) {
2611 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2612 return -EFAULT;
2613 }
2614
2615 if (kvm_is_error_hva(ghc->hva))
2616 return -EFAULT;
2617
2618 if (unlikely(!ghc->memslot))
2619 return kvm_read_guest(kvm, gpa, data, len);
2620
2621 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
2622 if (r)
2623 return -EFAULT;
2624
2625 return 0;
2626}
2627EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2628
2629int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2630 void *data, unsigned long len)
2631{
2632 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
2633}
2634EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
2635
2636int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
2637{
2638 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
2639 gfn_t gfn = gpa >> PAGE_SHIFT;
2640 int seg;
2641 int offset = offset_in_page(gpa);
2642 int ret;
2643
2644 while ((seg = next_segment(len, offset)) != 0) {
                ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
2646 if (ret < 0)
2647 return ret;
2648 offset = 0;
2649 len -= seg;
2650 ++gfn;
2651 }
2652 return 0;
2653}
2654EXPORT_SYMBOL_GPL(kvm_clear_guest);
2655
2656void mark_page_dirty_in_slot(struct kvm *kvm,
2657 struct kvm_memory_slot *memslot,
2658 gfn_t gfn)
2659{
2660 if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
2661 unsigned long rel_gfn = gfn - memslot->base_gfn;
2662 u32 slot = (memslot->as_id << 16) | memslot->id;
2663
2664 if (kvm->dirty_ring_size)
2665 kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
2666 slot, rel_gfn);
2667 else
2668 set_bit_le(rel_gfn, memslot->dirty_bitmap);
2669 }
2670}
2671EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
2672
2673void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
2674{
2675 struct kvm_memory_slot *memslot;
2676
2677 memslot = gfn_to_memslot(kvm, gfn);
2678 mark_page_dirty_in_slot(kvm, memslot, gfn);
2679}
2680EXPORT_SYMBOL_GPL(mark_page_dirty);
2681
2682void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
2683{
2684 struct kvm_memory_slot *memslot;
2685
2686 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2687 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
2688}
2689EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
2690
2691void kvm_sigset_activate(struct kvm_vcpu *vcpu)
2692{
2693 if (!vcpu->sigset_active)
2694 return;
2695
2696
2697
2698
2699
2700
2701
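        /*
         * This is a lockless modification of ->real_blocked, which is fine:
         * only current can change ->real_blocked, and readers do not care as
         * long as ->real_blocked stays a subset of ->blocked.
         */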
        sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
2703}
2704
2705void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
2706{
2707 if (!vcpu->sigset_active)
2708 return;
2709
        sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
        sigemptyset(&current->real_blocked);
2712}
2713
2714static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
2715{
2716 unsigned int old, val, grow, grow_start;
2717
2718 old = val = vcpu->halt_poll_ns;
2719 grow_start = READ_ONCE(halt_poll_ns_grow_start);
2720 grow = READ_ONCE(halt_poll_ns_grow);
2721 if (!grow)
2722 goto out;
2723
2724 val *= grow;
2725 if (val < grow_start)
2726 val = grow_start;
2727
        /* Cap at the per-VM limit configurable via KVM_CAP_HALT_POLL. */
        if (val > vcpu->kvm->max_halt_poll_ns)
                val = vcpu->kvm->max_halt_poll_ns;
2730
2731 vcpu->halt_poll_ns = val;
2732out:
2733 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
2734}
2735
2736static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
2737{
2738 unsigned int old, val, shrink;
2739
2740 old = val = vcpu->halt_poll_ns;
2741 shrink = READ_ONCE(halt_poll_ns_shrink);
2742 if (shrink == 0)
2743 val = 0;
2744 else
2745 val /= shrink;
2746
2747 vcpu->halt_poll_ns = val;
2748 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
2749}
2750
2751static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
2752{
2753 int ret = -EINTR;
2754 int idx = srcu_read_lock(&vcpu->kvm->srcu);
2755
2756 if (kvm_arch_vcpu_runnable(vcpu)) {
2757 kvm_make_request(KVM_REQ_UNHALT, vcpu);
2758 goto out;
2759 }
2760 if (kvm_cpu_has_pending_timer(vcpu))
2761 goto out;
2762 if (signal_pending(current))
2763 goto out;
2764
2765 ret = 0;
2766out:
2767 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2768 return ret;
2769}
2770
2771static inline void
2772update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
2773{
2774 if (waited)
2775 vcpu->stat.halt_poll_fail_ns += poll_ns;
2776 else
2777 vcpu->stat.halt_poll_success_ns += poll_ns;
2778}
2779
2780
2781
2782
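/*
 * Block the vCPU until it is runnable again, typically after the guest
 * executed a HLT or equivalent wait instruction.
 */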
2783void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2784{
2785 ktime_t start, cur, poll_end;
2786 bool waited = false;
2787 u64 block_ns;
2788
2789 kvm_arch_vcpu_blocking(vcpu);
2790
2791 start = cur = poll_end = ktime_get();
2792 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
2793 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
2794
2795 ++vcpu->stat.halt_attempted_poll;
2796 do {
2797
2798
2799
2800
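                        /*
                         * kvm_vcpu_check_block() returns < 0 (and sets
                         * KVM_REQ_UNHALT when the vCPU became runnable)
                         * once polling should stop.
                         */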
2801 if (kvm_vcpu_check_block(vcpu) < 0) {
2802 ++vcpu->stat.halt_successful_poll;
2803 if (!vcpu_valid_wakeup(vcpu))
2804 ++vcpu->stat.halt_poll_invalid;
2805 goto out;
2806 }
2807 poll_end = cur = ktime_get();
2808 } while (single_task_running() && ktime_before(cur, stop));
2809 }
2810
2811 prepare_to_rcuwait(&vcpu->wait);
2812 for (;;) {
2813 set_current_state(TASK_INTERRUPTIBLE);
2814
2815 if (kvm_vcpu_check_block(vcpu) < 0)
2816 break;
2817
2818 waited = true;
2819 schedule();
2820 }
2821 finish_rcuwait(&vcpu->wait);
2822 cur = ktime_get();
2823out:
2824 kvm_arch_vcpu_unblocking(vcpu);
2825 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
2826
2827 update_halt_poll_stats(
2828 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
2829
2830 if (!kvm_arch_no_poll(vcpu)) {
2831 if (!vcpu_valid_wakeup(vcpu)) {
2832 shrink_halt_poll_ns(vcpu);
2833 } else if (vcpu->kvm->max_halt_poll_ns) {
2834 if (block_ns <= vcpu->halt_poll_ns)
2835 ;
2836
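                        /* we had a long block, shrink polling */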
2837 else if (vcpu->halt_poll_ns &&
2838 block_ns > vcpu->kvm->max_halt_poll_ns)
2839 shrink_halt_poll_ns(vcpu);
2840
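                        /* we had a short halt and our poll time is too small */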
2841 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
2842 block_ns < vcpu->kvm->max_halt_poll_ns)
2843 grow_halt_poll_ns(vcpu);
2844 } else {
2845 vcpu->halt_poll_ns = 0;
2846 }
2847 }
2848
2849 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
2850 kvm_arch_vcpu_block_finish(vcpu);
2851}
2852EXPORT_SYMBOL_GPL(kvm_vcpu_block);
2853
2854bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
2855{
2856 struct rcuwait *waitp;
2857
2858 waitp = kvm_arch_vcpu_get_wait(vcpu);
2859 if (rcuwait_wake_up(waitp)) {
2860 WRITE_ONCE(vcpu->ready, true);
2861 ++vcpu->stat.halt_wakeup;
2862 return true;
2863 }
2864
2865 return false;
2866}
2867EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
2868
2869#ifndef CONFIG_S390
2870
2871
2872
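/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 */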
2873void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
2874{
2875 int me;
2876 int cpu = vcpu->cpu;
2877
2878 if (kvm_vcpu_wake_up(vcpu))
2879 return;
2880
2881 me = get_cpu();
2882 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
2883 if (kvm_arch_vcpu_should_kick(vcpu))
2884 smp_send_reschedule(cpu);
2885 put_cpu();
2886}
2887EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
2888#endif
2889
2890int kvm_vcpu_yield_to(struct kvm_vcpu *target)
2891{
2892 struct pid *pid;
2893 struct task_struct *task = NULL;
2894 int ret = 0;
2895
2896 rcu_read_lock();
2897 pid = rcu_dereference(target->pid);
2898 if (pid)
2899 task = get_pid_task(pid, PIDTYPE_PID);
2900 rcu_read_unlock();
2901 if (!task)
2902 return ret;
2903 ret = yield_to(task, 1);
2904 put_task_struct(task);
2905
2906 return ret;
2907}
2908EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
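/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * The most eligible candidate is decided by the following heuristics:
 *
 *  (a) A VCPU which has not done a PLE exit or cpu-relax intercept recently
 *      (a probable lock holder), indicated by @in_spin_loop.
 *
 *  (b) A VCPU which did spin-exit but was skipped last time, indicated by
 *      @dy_eligible, which is toggled on every eligibility check.
 *
 * The algorithm is a heuristic, so reading another VCPU's data without
 * locking does no harm; at worst we yield to the same VCPU, fail, and move
 * on to the next one.
 */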
2932static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
2933{
2934#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
2935 bool eligible;
2936
2937 eligible = !vcpu->spin_loop.in_spin_loop ||
2938 vcpu->spin_loop.dy_eligible;
2939
2940 if (vcpu->spin_loop.in_spin_loop)
2941 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
2942
2943 return eligible;
2944#else
2945 return true;
2946#endif
2947}
2948
2949
2950
2951
2952
2953
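/*
 * Default implementation: a vCPU is considered runnable for directed yield
 * iff the architecture reports it runnable.
 */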
2954bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
2955{
2956 return kvm_arch_vcpu_runnable(vcpu);
2957}
2958
2959static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
2960{
2961 if (kvm_arch_dy_runnable(vcpu))
2962 return true;
2963
2964#ifdef CONFIG_KVM_ASYNC_PF
2965 if (!list_empty_careful(&vcpu->async_pf.done))
2966 return true;
2967#endif
2968
2969 return false;
2970}
2971
2972void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
2973{
2974 struct kvm *kvm = me->kvm;
2975 struct kvm_vcpu *vcpu;
2976 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
2977 int yielded = 0;
2978 int try = 3;
2979 int pass;
2980 int i;
2981
2982 kvm_vcpu_set_in_spin_loop(me, true);
2983
2984
2985
2986
2987
2988
2989
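        /*
         * We boost the priority of a VCPU that is runnable but not currently
         * running, because it got preempted and hopefully holds a lock that
         * the spinning VCPU needs.  Round-robin is approximated by starting
         * at the last boosted VCPU.
         */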
2990 for (pass = 0; pass < 2 && !yielded && try; pass++) {
2991 kvm_for_each_vcpu(i, vcpu, kvm) {
2992 if (!pass && i <= last_boosted_vcpu) {
2993 i = last_boosted_vcpu;
2994 continue;
2995 } else if (pass && i > last_boosted_vcpu)
2996 break;
2997 if (!READ_ONCE(vcpu->ready))
2998 continue;
2999 if (vcpu == me)
3000 continue;
3001 if (rcuwait_active(&vcpu->wait) &&
3002 !vcpu_dy_runnable(vcpu))
3003 continue;
3004 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3005 !kvm_arch_vcpu_in_kernel(vcpu))
3006 continue;
3007 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3008 continue;
3009
3010 yielded = kvm_vcpu_yield_to(vcpu);
3011 if (yielded > 0) {
3012 kvm->last_boosted_vcpu = i;
3013 break;
3014 } else if (yielded < 0) {
3015 try--;
3016 if (!try)
3017 break;
3018 }
3019 }
3020 }
3021 kvm_vcpu_set_in_spin_loop(me, false);
3022
3023
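        /* Ensure vcpu is not eligible during the next spin loop. */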
3024 kvm_vcpu_set_dy_eligible(me, false);
3025}
3026EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3027
3028static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3029{
3030#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3031 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3032 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3033 kvm->dirty_ring_size / PAGE_SIZE);
3034#else
3035 return false;
3036#endif
3037}
3038
3039static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3040{
3041 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3042 struct page *page;
3043
3044 if (vmf->pgoff == 0)
3045 page = virt_to_page(vcpu->run);
3046#ifdef CONFIG_X86
3047 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3048 page = virt_to_page(vcpu->arch.pio_data);
3049#endif
3050#ifdef CONFIG_KVM_MMIO
3051 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3052 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3053#endif
3054 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3055 page = kvm_dirty_ring_get_page(
3056 &vcpu->dirty_ring,
3057 vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3058 else
3059 return kvm_arch_vcpu_fault(vcpu, vmf);
3060 get_page(page);
3061 vmf->page = page;
3062 return 0;
3063}
3064
3065static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3066 .fault = kvm_vcpu_fault,
3067};
3068
3069static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3070{
3071 struct kvm_vcpu *vcpu = file->private_data;
3072 unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3073
3074 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3075 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3076 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3077 return -EINVAL;
3078
3079 vma->vm_ops = &kvm_vcpu_vm_ops;
3080 return 0;
3081}
3082
3083static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3084{
3085 struct kvm_vcpu *vcpu = filp->private_data;
3086
3087 kvm_put_kvm(vcpu->kvm);
3088 return 0;
3089}
3090
3091static struct file_operations kvm_vcpu_fops = {
3092 .release = kvm_vcpu_release,
3093 .unlocked_ioctl = kvm_vcpu_ioctl,
3094 .mmap = kvm_vcpu_mmap,
3095 .llseek = noop_llseek,
3096 KVM_COMPAT(kvm_vcpu_compat_ioctl),
3097};
3098
3099
3100
3101
3102static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3103{
3104 char name[8 + 1 + ITOA_MAX_LEN + 1];
3105
3106 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3107 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3108}
3109
3110static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3111{
3112#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3113 struct dentry *debugfs_dentry;
3114 char dir_name[ITOA_MAX_LEN * 2];
3115
3116 if (!debugfs_initialized())
3117 return;
3118
3119 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3120 debugfs_dentry = debugfs_create_dir(dir_name,
3121 vcpu->kvm->debugfs_dentry);
3122
3123 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3124#endif
3125}
3126
3127
3128
3129
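/*
 * Create a vcpu with the requested id for this VM and expose it to userspace
 * through an anonymous inode fd.
 */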
3130static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3131{
3132 int r;
3133 struct kvm_vcpu *vcpu;
3134 struct page *page;
3135
3136 if (id >= KVM_MAX_VCPU_ID)
3137 return -EINVAL;
3138
3139 mutex_lock(&kvm->lock);
3140 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3141 mutex_unlock(&kvm->lock);
3142 return -EINVAL;
3143 }
3144
3145 kvm->created_vcpus++;
3146 mutex_unlock(&kvm->lock);
3147
3148 r = kvm_arch_vcpu_precreate(kvm, id);
3149 if (r)
3150 goto vcpu_decrement;
3151
        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3153 if (!vcpu) {
3154 r = -ENOMEM;
3155 goto vcpu_decrement;
3156 }
3157
3158 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3159 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3160 if (!page) {
3161 r = -ENOMEM;
3162 goto vcpu_free;
3163 }
3164 vcpu->run = page_address(page);
3165
3166 kvm_vcpu_init(vcpu, kvm, id);
3167
3168 r = kvm_arch_vcpu_create(vcpu);
3169 if (r)
3170 goto vcpu_free_run_page;
3171
3172 if (kvm->dirty_ring_size) {
3173 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3174 id, kvm->dirty_ring_size);
3175 if (r)
3176 goto arch_vcpu_destroy;
3177 }
3178
3179 mutex_lock(&kvm->lock);
3180 if (kvm_get_vcpu_by_id(kvm, id)) {
3181 r = -EEXIST;
3182 goto unlock_vcpu_destroy;
3183 }
3184
3185 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3186 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3187
3188
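        /* Now it's all set up, let userspace reach it. */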
3189 kvm_get_kvm(kvm);
3190 r = create_vcpu_fd(vcpu);
3191 if (r < 0) {
3192 kvm_put_kvm_no_destroy(kvm);
3193 goto unlock_vcpu_destroy;
3194 }
3195
3196 kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3197
3198
3199
3200
3201
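        /*
         * Pairs with smp_rmb() in kvm_get_vcpu(): publish kvm->vcpus[] before
         * the incremented kvm->online_vcpus becomes visible.
         */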
3202 smp_wmb();
3203 atomic_inc(&kvm->online_vcpus);
3204
3205 mutex_unlock(&kvm->lock);
3206 kvm_arch_vcpu_postcreate(vcpu);
3207 kvm_create_vcpu_debugfs(vcpu);
3208 return r;
3209
3210unlock_vcpu_destroy:
3211 mutex_unlock(&kvm->lock);
3212 kvm_dirty_ring_free(&vcpu->dirty_ring);
3213arch_vcpu_destroy:
3214 kvm_arch_vcpu_destroy(vcpu);
3215vcpu_free_run_page:
3216 free_page((unsigned long)vcpu->run);
3217vcpu_free:
3218 kmem_cache_free(kvm_vcpu_cache, vcpu);
3219vcpu_decrement:
3220 mutex_lock(&kvm->lock);
3221 kvm->created_vcpus--;
3222 mutex_unlock(&kvm->lock);
3223 return r;
3224}
3225
3226static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3227{
3228 if (sigset) {
3229 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3230 vcpu->sigset_active = 1;
3231 vcpu->sigset = *sigset;
3232 } else
3233 vcpu->sigset_active = 0;
3234 return 0;
3235}
3236
3237static long kvm_vcpu_ioctl(struct file *filp,
3238 unsigned int ioctl, unsigned long arg)
3239{
3240 struct kvm_vcpu *vcpu = filp->private_data;
3241 void __user *argp = (void __user *)arg;
3242 int r;
3243 struct kvm_fpu *fpu = NULL;
3244 struct kvm_sregs *kvm_sregs = NULL;
3245
3246 if (vcpu->kvm->mm != current->mm)
3247 return -EIO;
3248
3249 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3250 return -EINVAL;
3251
3252
3253
3254
3255
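        /*
         * Some architectures have vcpu ioctls that are asynchronous to vcpu
         * execution; mutex_lock() would break them.
         */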
3256 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3257 if (r != -ENOIOCTLCMD)
3258 return r;
3259
3260 if (mutex_lock_killable(&vcpu->mutex))
3261 return -EINTR;
3262 switch (ioctl) {
3263 case KVM_RUN: {
3264 struct pid *oldpid;
3265 r = -EINVAL;
3266 if (arg)
3267 goto out;
3268 oldpid = rcu_access_pointer(vcpu->pid);
3269 if (unlikely(oldpid != task_pid(current))) {
3270
3271 struct pid *newpid;
3272
3273 r = kvm_arch_vcpu_run_pid_change(vcpu);
3274 if (r)
3275 break;
3276
3277 newpid = get_task_pid(current, PIDTYPE_PID);
3278 rcu_assign_pointer(vcpu->pid, newpid);
3279 if (oldpid)
3280 synchronize_rcu();
3281 put_pid(oldpid);
3282 }
3283 r = kvm_arch_vcpu_ioctl_run(vcpu);
3284 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3285 break;
3286 }
3287 case KVM_GET_REGS: {
3288 struct kvm_regs *kvm_regs;
3289
3290 r = -ENOMEM;
3291 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3292 if (!kvm_regs)
3293 goto out;
3294 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3295 if (r)
3296 goto out_free1;
3297 r = -EFAULT;
3298 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3299 goto out_free1;
3300 r = 0;
3301out_free1:
3302 kfree(kvm_regs);
3303 break;
3304 }
3305 case KVM_SET_REGS: {
3306 struct kvm_regs *kvm_regs;
3307
3308 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3309 if (IS_ERR(kvm_regs)) {
3310 r = PTR_ERR(kvm_regs);
3311 goto out;
3312 }
3313 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3314 kfree(kvm_regs);
3315 break;
3316 }
3317 case KVM_GET_SREGS: {
3318 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3319 GFP_KERNEL_ACCOUNT);
3320 r = -ENOMEM;
3321 if (!kvm_sregs)
3322 goto out;
3323 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3324 if (r)
3325 goto out;
3326 r = -EFAULT;
3327 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3328 goto out;
3329 r = 0;
3330 break;
3331 }
3332 case KVM_SET_SREGS: {
3333 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3334 if (IS_ERR(kvm_sregs)) {
3335 r = PTR_ERR(kvm_sregs);
3336 kvm_sregs = NULL;
3337 goto out;
3338 }
3339 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3340 break;
3341 }
3342 case KVM_GET_MP_STATE: {
3343 struct kvm_mp_state mp_state;
3344
3345 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3346 if (r)
3347 goto out;
3348 r = -EFAULT;
3349 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3350 goto out;
3351 r = 0;
3352 break;
3353 }
3354 case KVM_SET_MP_STATE: {
3355 struct kvm_mp_state mp_state;
3356
3357 r = -EFAULT;
3358 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3359 goto out;
3360 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3361 break;
3362 }
3363 case KVM_TRANSLATE: {
3364 struct kvm_translation tr;
3365
3366 r = -EFAULT;
3367 if (copy_from_user(&tr, argp, sizeof(tr)))
3368 goto out;
3369 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3370 if (r)
3371 goto out;
3372 r = -EFAULT;
3373 if (copy_to_user(argp, &tr, sizeof(tr)))
3374 goto out;
3375 r = 0;
3376 break;
3377 }
3378 case KVM_SET_GUEST_DEBUG: {
3379 struct kvm_guest_debug dbg;
3380
3381 r = -EFAULT;
3382 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3383 goto out;
3384 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3385 break;
3386 }
3387 case KVM_SET_SIGNAL_MASK: {
3388 struct kvm_signal_mask __user *sigmask_arg = argp;
3389 struct kvm_signal_mask kvm_sigmask;
3390 sigset_t sigset, *p;
3391
3392 p = NULL;
3393 if (argp) {
3394 r = -EFAULT;
3395 if (copy_from_user(&kvm_sigmask, argp,
3396 sizeof(kvm_sigmask)))
3397 goto out;
3398 r = -EINVAL;
3399 if (kvm_sigmask.len != sizeof(sigset))
3400 goto out;
3401 r = -EFAULT;
3402 if (copy_from_user(&sigset, sigmask_arg->sigset,
3403 sizeof(sigset)))
3404 goto out;
3405 p = &sigset;
3406 }
3407 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3408 break;
3409 }
3410 case KVM_GET_FPU: {
3411 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3412 r = -ENOMEM;
3413 if (!fpu)
3414 goto out;
3415 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3416 if (r)
3417 goto out;
3418 r = -EFAULT;
3419 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3420 goto out;
3421 r = 0;
3422 break;
3423 }
3424 case KVM_SET_FPU: {
3425 fpu = memdup_user(argp, sizeof(*fpu));
3426 if (IS_ERR(fpu)) {
3427 r = PTR_ERR(fpu);
3428 fpu = NULL;
3429 goto out;
3430 }
3431 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3432 break;
3433 }
3434 default:
3435 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3436 }
3437out:
3438 mutex_unlock(&vcpu->mutex);
3439 kfree(fpu);
3440 kfree(kvm_sregs);
3441 return r;
3442}
3443
3444#ifdef CONFIG_KVM_COMPAT
3445static long kvm_vcpu_compat_ioctl(struct file *filp,
3446 unsigned int ioctl, unsigned long arg)
3447{
3448 struct kvm_vcpu *vcpu = filp->private_data;
3449 void __user *argp = compat_ptr(arg);
3450 int r;
3451
3452 if (vcpu->kvm->mm != current->mm)
3453 return -EIO;
3454
3455 switch (ioctl) {
3456 case KVM_SET_SIGNAL_MASK: {
3457 struct kvm_signal_mask __user *sigmask_arg = argp;
3458 struct kvm_signal_mask kvm_sigmask;
3459 sigset_t sigset;
3460
3461 if (argp) {
3462 r = -EFAULT;
3463 if (copy_from_user(&kvm_sigmask, argp,
3464 sizeof(kvm_sigmask)))
3465 goto out;
3466 r = -EINVAL;
3467 if (kvm_sigmask.len != sizeof(compat_sigset_t))
3468 goto out;
3469 r = -EFAULT;
3470 if (get_compat_sigset(&sigset,
3471 (compat_sigset_t __user *)sigmask_arg->sigset))
3472 goto out;
3473 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3474 } else
3475 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3476 break;
3477 }
3478 default:
3479 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3480 }
3481
3482out:
3483 return r;
3484}
3485#endif
3486
3487static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3488{
3489 struct kvm_device *dev = filp->private_data;
3490
3491 if (dev->ops->mmap)
3492 return dev->ops->mmap(dev, vma);
3493
3494 return -ENODEV;
3495}
3496
3497static int kvm_device_ioctl_attr(struct kvm_device *dev,
3498 int (*accessor)(struct kvm_device *dev,
3499 struct kvm_device_attr *attr),
3500 unsigned long arg)
3501{
3502 struct kvm_device_attr attr;
3503
3504 if (!accessor)
3505 return -EPERM;
3506
3507 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3508 return -EFAULT;
3509
3510 return accessor(dev, &attr);
3511}
3512
3513static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3514 unsigned long arg)
3515{
3516 struct kvm_device *dev = filp->private_data;
3517
3518 if (dev->kvm->mm != current->mm)
3519 return -EIO;
3520
3521 switch (ioctl) {
3522 case KVM_SET_DEVICE_ATTR:
3523 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3524 case KVM_GET_DEVICE_ATTR:
3525 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3526 case KVM_HAS_DEVICE_ATTR:
3527 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3528 default:
3529 if (dev->ops->ioctl)
3530 return dev->ops->ioctl(dev, ioctl, arg);
3531
3532 return -ENOTTY;
3533 }
3534}
3535
3536static int kvm_device_release(struct inode *inode, struct file *filp)
3537{
3538 struct kvm_device *dev = filp->private_data;
3539 struct kvm *kvm = dev->kvm;
3540
3541 if (dev->ops->release) {
3542 mutex_lock(&kvm->lock);
3543 list_del(&dev->vm_node);
3544 dev->ops->release(dev);
3545 mutex_unlock(&kvm->lock);
3546 }
3547
3548 kvm_put_kvm(kvm);
3549 return 0;
3550}
3551
3552static const struct file_operations kvm_device_fops = {
3553 .unlocked_ioctl = kvm_device_ioctl,
3554 .release = kvm_device_release,
3555 KVM_COMPAT(kvm_device_ioctl),
3556 .mmap = kvm_device_mmap,
3557};
3558
3559struct kvm_device *kvm_device_from_filp(struct file *filp)
3560{
3561 if (filp->f_op != &kvm_device_fops)
3562 return NULL;
3563
3564 return filp->private_data;
3565}
3566
3567static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3568#ifdef CONFIG_KVM_MPIC
3569 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
3570 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
3571#endif
3572};
3573
3574int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
3575{
3576 if (type >= ARRAY_SIZE(kvm_device_ops_table))
3577 return -ENOSPC;
3578
3579 if (kvm_device_ops_table[type] != NULL)
3580 return -EEXIST;
3581
3582 kvm_device_ops_table[type] = ops;
3583 return 0;
3584}
3585
3586void kvm_unregister_device_ops(u32 type)
3587{
3588 if (kvm_device_ops_table[type] != NULL)
3589 kvm_device_ops_table[type] = NULL;
3590}
3591
3592static int kvm_ioctl_create_device(struct kvm *kvm,
3593 struct kvm_create_device *cd)
3594{
3595 const struct kvm_device_ops *ops = NULL;
3596 struct kvm_device *dev;
3597 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
3598 int type;
3599 int ret;
3600
3601 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
3602 return -ENODEV;
3603
3604 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
3605 ops = kvm_device_ops_table[type];
3606 if (ops == NULL)
3607 return -ENODEV;
3608
3609 if (test)
3610 return 0;
3611
3612 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
3613 if (!dev)
3614 return -ENOMEM;
3615
3616 dev->ops = ops;
3617 dev->kvm = kvm;
3618
3619 mutex_lock(&kvm->lock);
3620 ret = ops->create(dev, type);
3621 if (ret < 0) {
3622 mutex_unlock(&kvm->lock);
3623 kfree(dev);
3624 return ret;
3625 }
3626 list_add(&dev->vm_node, &kvm->devices);
3627 mutex_unlock(&kvm->lock);
3628
3629 if (ops->init)
3630 ops->init(dev);
3631
3632 kvm_get_kvm(kvm);
3633 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
3634 if (ret < 0) {
3635 kvm_put_kvm_no_destroy(kvm);
3636 mutex_lock(&kvm->lock);
3637 list_del(&dev->vm_node);
3638 mutex_unlock(&kvm->lock);
3639 ops->destroy(dev);
3640 return ret;
3641 }
3642
3643 cd->fd = ret;
3644 return 0;
3645}
3646
3647static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
3648{
3649 switch (arg) {
3650 case KVM_CAP_USER_MEMORY:
3651 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
3652 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
3653 case KVM_CAP_INTERNAL_ERROR_DATA:
3654#ifdef CONFIG_HAVE_KVM_MSI
3655 case KVM_CAP_SIGNAL_MSI:
3656#endif
3657#ifdef CONFIG_HAVE_KVM_IRQFD
3658 case KVM_CAP_IRQFD:
3659 case KVM_CAP_IRQFD_RESAMPLE:
3660#endif
3661 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
3662 case KVM_CAP_CHECK_EXTENSION_VM:
3663 case KVM_CAP_ENABLE_CAP_VM:
3664 case KVM_CAP_HALT_POLL:
3665 return 1;
3666#ifdef CONFIG_KVM_MMIO
3667 case KVM_CAP_COALESCED_MMIO:
3668 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3669 case KVM_CAP_COALESCED_PIO:
3670 return 1;
3671#endif
3672#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3673 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3674 return KVM_DIRTY_LOG_MANUAL_CAPS;
3675#endif
3676#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3677 case KVM_CAP_IRQ_ROUTING:
3678 return KVM_MAX_IRQ_ROUTES;
3679#endif
3680#if KVM_ADDRESS_SPACE_NUM > 1
3681 case KVM_CAP_MULTI_ADDRESS_SPACE:
3682 return KVM_ADDRESS_SPACE_NUM;
3683#endif
3684 case KVM_CAP_NR_MEMSLOTS:
3685 return KVM_USER_MEM_SLOTS;
3686 case KVM_CAP_DIRTY_LOG_RING:
3687#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3688 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
3689#else
3690 return 0;
3691#endif
3692 default:
3693 break;
3694 }
3695 return kvm_vm_ioctl_check_extension(kvm, arg);
3696}
3697
3698static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
3699{
3700 int r;
3701
3702 if (!KVM_DIRTY_LOG_PAGE_OFFSET)
3703 return -EINVAL;
3704
3705
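        /* The size must be a power of 2. */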
3706 if (!size || (size & (size - 1)))
3707 return -EINVAL;
3708
3709
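        /* It must hold the reserved entries and be at least one page. */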
3710 if (size < kvm_dirty_ring_get_rsvd_entries() *
3711 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
3712 return -EINVAL;
3713
3714 if (size > KVM_DIRTY_RING_MAX_ENTRIES *
3715 sizeof(struct kvm_dirty_gfn))
3716 return -E2BIG;
3717
3718
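        /* The ring size may only be set once. */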
3719 if (kvm->dirty_ring_size)
3720 return -EINVAL;
3721
3722 mutex_lock(&kvm->lock);
3723
3724 if (kvm->created_vcpus) {
3725
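                /* The ring size cannot change once vCPUs have been created. */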
3726 r = -EINVAL;
3727 } else {
3728 kvm->dirty_ring_size = size;
3729 r = 0;
3730 }
3731
3732 mutex_unlock(&kvm->lock);
3733 return r;
3734}
3735
3736static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
3737{
3738 int i;
3739 struct kvm_vcpu *vcpu;
3740 int cleared = 0;
3741
3742 if (!kvm->dirty_ring_size)
3743 return -EINVAL;
3744
3745 mutex_lock(&kvm->slots_lock);
3746
3747 kvm_for_each_vcpu(i, vcpu, kvm)
3748 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
3749
3750 mutex_unlock(&kvm->slots_lock);
3751
3752 if (cleared)
3753 kvm_flush_remote_tlbs(kvm);
3754
3755 return cleared;
3756}
3757
3758int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3759 struct kvm_enable_cap *cap)
3760{
3761 return -EINVAL;
3762}
3763
3764static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3765 struct kvm_enable_cap *cap)
3766{
3767 switch (cap->cap) {
3768#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3769 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
3770 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
3771
3772 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
3773 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
3774
3775 if (cap->flags || (cap->args[0] & ~allowed_options))
3776 return -EINVAL;
3777 kvm->manual_dirty_log_protect = cap->args[0];
3778 return 0;
3779 }
3780#endif
3781 case KVM_CAP_HALT_POLL: {
3782 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
3783 return -EINVAL;
3784
3785 kvm->max_halt_poll_ns = cap->args[0];
3786 return 0;
3787 }
3788 case KVM_CAP_DIRTY_LOG_RING:
3789 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
3790 default:
3791 return kvm_vm_ioctl_enable_cap(kvm, cap);
3792 }
3793}
3794
3795static long kvm_vm_ioctl(struct file *filp,
3796 unsigned int ioctl, unsigned long arg)
3797{
3798 struct kvm *kvm = filp->private_data;
3799 void __user *argp = (void __user *)arg;
3800 int r;
3801
3802 if (kvm->mm != current->mm)
3803 return -EIO;
3804 switch (ioctl) {
3805 case KVM_CREATE_VCPU:
3806 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3807 break;
3808 case KVM_ENABLE_CAP: {
3809 struct kvm_enable_cap cap;
3810
3811 r = -EFAULT;
3812 if (copy_from_user(&cap, argp, sizeof(cap)))
3813 goto out;
3814 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3815 break;
3816 }
3817 case KVM_SET_USER_MEMORY_REGION: {
3818 struct kvm_userspace_memory_region kvm_userspace_mem;
3819
3820 r = -EFAULT;
3821 if (copy_from_user(&kvm_userspace_mem, argp,
3822 sizeof(kvm_userspace_mem)))
3823 goto out;
3824
3825 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
3826 break;
3827 }
3828 case KVM_GET_DIRTY_LOG: {
3829 struct kvm_dirty_log log;
3830
3831 r = -EFAULT;
3832 if (copy_from_user(&log, argp, sizeof(log)))
3833 goto out;
3834 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3835 break;
3836 }
3837#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3838 case KVM_CLEAR_DIRTY_LOG: {
3839 struct kvm_clear_dirty_log log;
3840
3841 r = -EFAULT;
3842 if (copy_from_user(&log, argp, sizeof(log)))
3843 goto out;
3844 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3845 break;
3846 }
3847#endif
3848#ifdef CONFIG_KVM_MMIO
3849 case KVM_REGISTER_COALESCED_MMIO: {
3850 struct kvm_coalesced_mmio_zone zone;
3851
3852 r = -EFAULT;
3853 if (copy_from_user(&zone, argp, sizeof(zone)))
3854 goto out;
3855 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
3856 break;
3857 }
3858 case KVM_UNREGISTER_COALESCED_MMIO: {
3859 struct kvm_coalesced_mmio_zone zone;
3860
3861 r = -EFAULT;
3862 if (copy_from_user(&zone, argp, sizeof(zone)))
3863 goto out;
3864 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
3865 break;
3866 }
3867#endif
3868 case KVM_IRQFD: {
3869 struct kvm_irqfd data;
3870
3871 r = -EFAULT;
3872 if (copy_from_user(&data, argp, sizeof(data)))
3873 goto out;
3874 r = kvm_irqfd(kvm, &data);
3875 break;
3876 }
3877 case KVM_IOEVENTFD: {
3878 struct kvm_ioeventfd data;
3879
3880 r = -EFAULT;
3881 if (copy_from_user(&data, argp, sizeof(data)))
3882 goto out;
3883 r = kvm_ioeventfd(kvm, &data);
3884 break;
3885 }
3886#ifdef CONFIG_HAVE_KVM_MSI
3887 case KVM_SIGNAL_MSI: {
3888 struct kvm_msi msi;
3889
3890 r = -EFAULT;
3891 if (copy_from_user(&msi, argp, sizeof(msi)))
3892 goto out;
3893 r = kvm_send_userspace_msi(kvm, &msi);
3894 break;
3895 }
3896#endif
3897#ifdef __KVM_HAVE_IRQ_LINE
3898 case KVM_IRQ_LINE_STATUS:
3899 case KVM_IRQ_LINE: {
3900 struct kvm_irq_level irq_event;
3901
3902 r = -EFAULT;
3903 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
3904 goto out;
3905
3906 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
3907 ioctl == KVM_IRQ_LINE_STATUS);
3908 if (r)
3909 goto out;
3910
3911 r = -EFAULT;
3912 if (ioctl == KVM_IRQ_LINE_STATUS) {
3913 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
3914 goto out;
3915 }
3916
3917 r = 0;
3918 break;
3919 }
3920#endif
3921#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3922 case KVM_SET_GSI_ROUTING: {
3923 struct kvm_irq_routing routing;
3924 struct kvm_irq_routing __user *urouting;
3925 struct kvm_irq_routing_entry *entries = NULL;
3926
3927 r = -EFAULT;
3928 if (copy_from_user(&routing, argp, sizeof(routing)))
3929 goto out;
3930 r = -EINVAL;
3931 if (!kvm_arch_can_set_irq_routing(kvm))
3932 goto out;
3933 if (routing.nr > KVM_MAX_IRQ_ROUTES)
3934 goto out;
3935 if (routing.flags)
3936 goto out;
3937 if (routing.nr) {
3938 urouting = argp;
3939 entries = vmemdup_user(urouting->entries,
3940 array_size(sizeof(*entries),
3941 routing.nr));
3942 if (IS_ERR(entries)) {
3943 r = PTR_ERR(entries);
3944 goto out;
3945 }
3946 }
3947 r = kvm_set_irq_routing(kvm, entries, routing.nr,
3948 routing.flags);
3949 kvfree(entries);
3950 break;
3951 }
3952#endif
3953 case KVM_CREATE_DEVICE: {
3954 struct kvm_create_device cd;
3955
3956 r = -EFAULT;
3957 if (copy_from_user(&cd, argp, sizeof(cd)))
3958 goto out;
3959
3960 r = kvm_ioctl_create_device(kvm, &cd);
3961 if (r)
3962 goto out;
3963
3964 r = -EFAULT;
3965 if (copy_to_user(argp, &cd, sizeof(cd)))
3966 goto out;
3967
3968 r = 0;
3969 break;
3970 }
3971 case KVM_CHECK_EXTENSION:
3972 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
3973 break;
3974 case KVM_RESET_DIRTY_RINGS:
3975 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
3976 break;
3977 default:
3978 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
3979 }
3980out:
3981 return r;
3982}
3983
3984#ifdef CONFIG_KVM_COMPAT
3985struct compat_kvm_dirty_log {
3986 __u32 slot;
3987 __u32 padding1;
3988 union {
3989 compat_uptr_t dirty_bitmap;
3990 __u64 padding2;
3991 };
3992};
3993
3994static long kvm_vm_compat_ioctl(struct file *filp,
3995 unsigned int ioctl, unsigned long arg)
3996{
3997 struct kvm *kvm = filp->private_data;
3998 int r;
3999
4000 if (kvm->mm != current->mm)
4001 return -EIO;
4002 switch (ioctl) {
4003 case KVM_GET_DIRTY_LOG: {
4004 struct compat_kvm_dirty_log compat_log;
4005 struct kvm_dirty_log log;
4006
4007 if (copy_from_user(&compat_log, (void __user *)arg,
4008 sizeof(compat_log)))
4009 return -EFAULT;
4010 log.slot = compat_log.slot;
4011 log.padding1 = compat_log.padding1;
4012 log.padding2 = compat_log.padding2;
4013 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4014
4015 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4016 break;
4017 }
4018 default:
4019 r = kvm_vm_ioctl(filp, ioctl, arg);
4020 }
4021 return r;
4022}
4023#endif
4024
4025static struct file_operations kvm_vm_fops = {
4026 .release = kvm_vm_release,
4027 .unlocked_ioctl = kvm_vm_ioctl,
4028 .llseek = noop_llseek,
4029 KVM_COMPAT(kvm_vm_compat_ioctl),
4030};
4031
4032static int kvm_dev_ioctl_create_vm(unsigned long type)
4033{
4034 int r;
4035 struct kvm *kvm;
4036 struct file *file;
4037
4038 kvm = kvm_create_vm(type);
4039 if (IS_ERR(kvm))
4040 return PTR_ERR(kvm);
4041#ifdef CONFIG_KVM_MMIO
4042 r = kvm_coalesced_mmio_init(kvm);
4043 if (r < 0)
4044 goto put_kvm;
4045#endif
4046 r = get_unused_fd_flags(O_CLOEXEC);
4047 if (r < 0)
4048 goto put_kvm;
4049
4050 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4051 if (IS_ERR(file)) {
4052 put_unused_fd(r);
4053 r = PTR_ERR(file);
4054 goto put_kvm;
4055 }
4056
4057
4058
4059
4060
4061
4062
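        /*
         * Don't call kvm_put_kvm() from here on; file->f_op is already set,
         * so the final fput(file) will invoke kvm_vm_release() and drop the
         * reference for us, including in the error path below.
         */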
4063 if (kvm_create_vm_debugfs(kvm, r) < 0) {
4064 put_unused_fd(r);
4065 fput(file);
4066 return -ENOMEM;
4067 }
4068 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4069
4070 fd_install(r, file);
4071 return r;
4072
4073put_kvm:
4074 kvm_put_kvm(kvm);
4075 return r;
4076}
4077
4078static long kvm_dev_ioctl(struct file *filp,
4079 unsigned int ioctl, unsigned long arg)
4080{
4081 long r = -EINVAL;
4082
4083 switch (ioctl) {
4084 case KVM_GET_API_VERSION:
4085 if (arg)
4086 goto out;
4087 r = KVM_API_VERSION;
4088 break;
4089 case KVM_CREATE_VM:
4090 r = kvm_dev_ioctl_create_vm(arg);
4091 break;
4092 case KVM_CHECK_EXTENSION:
4093 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4094 break;
4095 case KVM_GET_VCPU_MMAP_SIZE:
4096 if (arg)
4097 goto out;
4098 r = PAGE_SIZE;
4099#ifdef CONFIG_X86
4100 r += PAGE_SIZE;
4101#endif
4102#ifdef CONFIG_KVM_MMIO
4103 r += PAGE_SIZE;
4104#endif
4105 break;
4106 case KVM_TRACE_ENABLE:
4107 case KVM_TRACE_PAUSE:
4108 case KVM_TRACE_DISABLE:
4109 r = -EOPNOTSUPP;
4110 break;
4111 default:
4112 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4113 }
4114out:
4115 return r;
4116}
4117
4118static struct file_operations kvm_chardev_ops = {
4119 .unlocked_ioctl = kvm_dev_ioctl,
4120 .llseek = noop_llseek,
4121 KVM_COMPAT(kvm_dev_ioctl),
4122};
4123
4124static struct miscdevice kvm_dev = {
4125 KVM_MINOR,
4126 "kvm",
4127 &kvm_chardev_ops,
4128};
4129
4130static void hardware_enable_nolock(void *junk)
4131{
4132 int cpu = raw_smp_processor_id();
4133 int r;
4134
4135 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4136 return;
4137
4138 cpumask_set_cpu(cpu, cpus_hardware_enabled);
4139
4140 r = kvm_arch_hardware_enable();
4141
4142 if (r) {
4143 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4144 atomic_inc(&hardware_enable_failed);
4145 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4146 }
4147}
4148
4149static int kvm_starting_cpu(unsigned int cpu)
4150{
4151 raw_spin_lock(&kvm_count_lock);
4152 if (kvm_usage_count)
4153 hardware_enable_nolock(NULL);
4154 raw_spin_unlock(&kvm_count_lock);
4155 return 0;
4156}
4157
4158static void hardware_disable_nolock(void *junk)
4159{
4160 int cpu = raw_smp_processor_id();
4161
4162 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4163 return;
4164 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4165 kvm_arch_hardware_disable();
4166}
4167
4168static int kvm_dying_cpu(unsigned int cpu)
4169{
4170 raw_spin_lock(&kvm_count_lock);
4171 if (kvm_usage_count)
4172 hardware_disable_nolock(NULL);
4173 raw_spin_unlock(&kvm_count_lock);
4174 return 0;
4175}
4176
4177static void hardware_disable_all_nolock(void)
4178{
4179 BUG_ON(!kvm_usage_count);
4180
4181 kvm_usage_count--;
4182 if (!kvm_usage_count)
4183 on_each_cpu(hardware_disable_nolock, NULL, 1);
4184}
4185
4186static void hardware_disable_all(void)
4187{
4188 raw_spin_lock(&kvm_count_lock);
4189 hardware_disable_all_nolock();
4190 raw_spin_unlock(&kvm_count_lock);
4191}
4192
4193static int hardware_enable_all(void)
4194{
4195 int r = 0;
4196
4197 raw_spin_lock(&kvm_count_lock);
4198
4199 kvm_usage_count++;
4200 if (kvm_usage_count == 1) {
4201 atomic_set(&hardware_enable_failed, 0);
4202 on_each_cpu(hardware_enable_nolock, NULL, 1);
4203
4204 if (atomic_read(&hardware_enable_failed)) {
4205 hardware_disable_all_nolock();
4206 r = -EBUSY;
4207 }
4208 }
4209
4210 raw_spin_unlock(&kvm_count_lock);
4211
4212 return r;
4213}
4214
4215static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4216 void *v)
4217{
4218
4219
4220
4221
4222
4223
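        /*
         * Turn off hardware virtualization on every CPU before the reboot
         * proceeds; some firmware hangs if a CPU is still in VMX root mode.
         */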
4224 pr_info("kvm: exiting hardware virtualization\n");
4225 kvm_rebooting = true;
4226 on_each_cpu(hardware_disable_nolock, NULL, 1);
4227 return NOTIFY_OK;
4228}
4229
4230static struct notifier_block kvm_reboot_notifier = {
4231 .notifier_call = kvm_reboot,
4232 .priority = 0,
4233};
4234
4235static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4236{
4237 int i;
4238
4239 for (i = 0; i < bus->dev_count; i++) {
4240 struct kvm_io_device *pos = bus->range[i].dev;
4241
4242 kvm_iodevice_destructor(pos);
4243 }
4244 kfree(bus);
4245}
4246
4247static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4248 const struct kvm_io_range *r2)
4249{
4250 gpa_t addr1 = r1->addr;
4251 gpa_t addr2 = r2->addr;
4252
4253 if (addr1 < addr2)
4254 return -1;
4255
4256
4257
4258
4259
4260
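        /*
         * If r2->len == 0, match the exact address.  If r2->len != 0, accept
         * any overlapping range.
         */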
4261 if (r2->len) {
4262 addr1 += r1->len;
4263 addr2 += r2->len;
4264 }
4265
4266 if (addr1 > addr2)
4267 return 1;
4268
4269 return 0;
4270}
4271
4272static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4273{
4274 return kvm_io_bus_cmp(p1, p2);
4275}
4276
4277static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4278 gpa_t addr, int len)
4279{
4280 struct kvm_io_range *range, key;
4281 int off;
4282
4283 key = (struct kvm_io_range) {
4284 .addr = addr,
4285 .len = len,
4286 };
4287
4288 range = bsearch(&key, bus->range, bus->dev_count,
4289 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4290 if (range == NULL)
4291 return -ENOENT;
4292
4293 off = range - bus->range;
4294
4295 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4296 off--;
4297
4298 return off;
4299}
4300
4301static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4302 struct kvm_io_range *range, const void *val)
4303{
4304 int idx;
4305
4306 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4307 if (idx < 0)
4308 return -EOPNOTSUPP;
4309
4310 while (idx < bus->dev_count &&
4311 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4312 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4313 range->len, val))
4314 return idx;
4315 idx++;
4316 }
4317
4318 return -EOPNOTSUPP;
4319}
4320
4321
4322int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4323 int len, const void *val)
4324{
4325 struct kvm_io_bus *bus;
4326 struct kvm_io_range range;
4327 int r;
4328
4329 range = (struct kvm_io_range) {
4330 .addr = addr,
4331 .len = len,
4332 };
4333
4334 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4335 if (!bus)
4336 return -ENOMEM;
4337 r = __kvm_io_bus_write(vcpu, bus, &range, val);
4338 return r < 0 ? r : 0;
4339}
4340EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4341
4342
4343int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4344 gpa_t addr, int len, const void *val, long cookie)
4345{
4346 struct kvm_io_bus *bus;
4347 struct kvm_io_range range;
4348
4349 range = (struct kvm_io_range) {
4350 .addr = addr,
4351 .len = len,
4352 };
4353
4354 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4355 if (!bus)
4356 return -ENOMEM;
4357
4358
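        /* First try the device referenced by cookie. */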
4359 if ((cookie >= 0) && (cookie < bus->dev_count) &&
4360 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4361 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4362 val))
4363 return cookie;
4364
4365
4366
4367
4368
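        /*
         * The cookie contained garbage; fall back to the search and return
         * the correct cookie value.
         */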
4369 return __kvm_io_bus_write(vcpu, bus, &range, val);
4370}
4371
4372static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4373 struct kvm_io_range *range, void *val)
4374{
4375 int idx;
4376
4377 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4378 if (idx < 0)
4379 return -EOPNOTSUPP;
4380
4381 while (idx < bus->dev_count &&
4382 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4383 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4384 range->len, val))
4385 return idx;
4386 idx++;
4387 }
4388
4389 return -EOPNOTSUPP;
4390}
4391
4392
4393int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4394 int len, void *val)
4395{
4396 struct kvm_io_bus *bus;
4397 struct kvm_io_range range;
4398 int r;
4399
4400 range = (struct kvm_io_range) {
4401 .addr = addr,
4402 .len = len,
4403 };
4404
4405 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4406 if (!bus)
4407 return -ENOMEM;
4408 r = __kvm_io_bus_read(vcpu, bus, &range, val);
4409 return r < 0 ? r : 0;
4410}
4411
4412
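/* Caller must hold slots_lock. */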
4413int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4414 int len, struct kvm_io_device *dev)
4415{
4416 int i;
4417 struct kvm_io_bus *new_bus, *bus;
4418 struct kvm_io_range range;
4419
4420 bus = kvm_get_bus(kvm, bus_idx);
4421 if (!bus)
4422 return -ENOMEM;
4423
4424
4425 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
4426 return -ENOSPC;
4427
4428 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4429 GFP_KERNEL_ACCOUNT);
4430 if (!new_bus)
4431 return -ENOMEM;
4432
4433 range = (struct kvm_io_range) {
4434 .addr = addr,
4435 .len = len,
4436 .dev = dev,
4437 };
4438
4439 for (i = 0; i < bus->dev_count; i++)
4440 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
4441 break;
4442
4443 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4444 new_bus->dev_count++;
4445 new_bus->range[i] = range;
4446 memcpy(new_bus->range + i + 1, bus->range + i,
4447 (bus->dev_count - i) * sizeof(struct kvm_io_range));
4448 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4449 synchronize_srcu_expedited(&kvm->srcu);
4450 kfree(bus);
4451
4452 return 0;
4453}
4454
4455
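/* Caller must hold slots_lock. */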
4456void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4457 struct kvm_io_device *dev)
4458{
4459 int i, j;
4460 struct kvm_io_bus *new_bus, *bus;
4461
4462 bus = kvm_get_bus(kvm, bus_idx);
4463 if (!bus)
4464 return;
4465
4466 for (i = 0; i < bus->dev_count; i++)
4467 if (bus->range[i].dev == dev) {
4468 break;
4469 }
4470
4471 if (i == bus->dev_count)
4472 return;
4473
4474 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4475 GFP_KERNEL_ACCOUNT);
4476 if (new_bus) {
4477 memcpy(new_bus, bus, struct_size(bus, range, i));
4478 new_bus->dev_count--;
4479 memcpy(new_bus->range + i, bus->range + i + 1,
4480 flex_array_size(new_bus, range, new_bus->dev_count - i));
4481 } else {
4482 pr_err("kvm: failed to shrink bus, removing it completely\n");
4483 for (j = 0; j < bus->dev_count; j++) {
4484 if (j == i)
4485 continue;
4486 kvm_iodevice_destructor(bus->range[j].dev);
4487 }
4488 }
4489
4490 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4491 synchronize_srcu_expedited(&kvm->srcu);
4492 kfree(bus);
4493 return;
4494}
4495
4496struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4497 gpa_t addr)
4498{
4499 struct kvm_io_bus *bus;
4500 int dev_idx, srcu_idx;
4501 struct kvm_io_device *iodev = NULL;
4502
4503 srcu_idx = srcu_read_lock(&kvm->srcu);
4504
4505 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
4506 if (!bus)
4507 goto out_unlock;
4508
4509 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
4510 if (dev_idx < 0)
4511 goto out_unlock;
4512
4513 iodev = bus->range[dev_idx].dev;
4514
4515out_unlock:
4516 srcu_read_unlock(&kvm->srcu, srcu_idx);
4517
4518 return iodev;
4519}
4520EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
4521
4522static int kvm_debugfs_open(struct inode *inode, struct file *file,
4523 int (*get)(void *, u64 *), int (*set)(void *, u64),
4524 const char *fmt)
4525{
4526 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4527 inode->i_private;
4528
4529
4530
4531
4532
4533
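        /*
         * The debugfs files hold a reference to the kvm struct. Refuse to
         * open them if the VM is already going away (users_count reached
         * zero), to avoid racing with the removal of the debugfs directory.
         */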
4534 if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
4535 return -ENOENT;
4536
4537 if (simple_attr_open(inode, file, get,
4538 KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
4539 ? set : NULL,
4540 fmt)) {
4541 kvm_put_kvm(stat_data->kvm);
4542 return -ENOMEM;
4543 }
4544
4545 return 0;
4546}
4547
4548static int kvm_debugfs_release(struct inode *inode, struct file *file)
4549{
4550 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4551 inode->i_private;
4552
4553 simple_attr_release(inode, file);
4554 kvm_put_kvm(stat_data->kvm);
4555
4556 return 0;
4557}
4558
4559static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
4560{
4561 *val = *(ulong *)((void *)kvm + offset);
4562
4563 return 0;
4564}
4565
4566static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
4567{
4568 *(ulong *)((void *)kvm + offset) = 0;
4569
4570 return 0;
4571}
4572
4573static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
4574{
4575 int i;
4576 struct kvm_vcpu *vcpu;
4577
4578 *val = 0;
4579
4580 kvm_for_each_vcpu(i, vcpu, kvm)
4581 *val += *(u64 *)((void *)vcpu + offset);
4582
4583 return 0;
4584}
4585
4586static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
4587{
4588 int i;
4589 struct kvm_vcpu *vcpu;
4590
4591 kvm_for_each_vcpu(i, vcpu, kvm)
4592 *(u64 *)((void *)vcpu + offset) = 0;
4593
4594 return 0;
4595}
4596
4597static int kvm_stat_data_get(void *data, u64 *val)
4598{
4599 int r = -EFAULT;
4600 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4601
4602 switch (stat_data->dbgfs_item->kind) {
4603 case KVM_STAT_VM:
4604 r = kvm_get_stat_per_vm(stat_data->kvm,
4605 stat_data->dbgfs_item->offset, val);
4606 break;
4607 case KVM_STAT_VCPU:
4608 r = kvm_get_stat_per_vcpu(stat_data->kvm,
4609 stat_data->dbgfs_item->offset, val);
4610 break;
4611 }
4612
4613 return r;
4614}
4615
4616static int kvm_stat_data_clear(void *data, u64 val)
4617{
4618 int r = -EFAULT;
4619 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4620
4621 if (val)
4622 return -EINVAL;
4623
4624 switch (stat_data->dbgfs_item->kind) {
4625 case KVM_STAT_VM:
4626 r = kvm_clear_stat_per_vm(stat_data->kvm,
4627 stat_data->dbgfs_item->offset);
4628 break;
4629 case KVM_STAT_VCPU:
4630 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
4631 stat_data->dbgfs_item->offset);
4632 break;
4633 }
4634
4635 return r;
4636}
4637
4638static int kvm_stat_data_open(struct inode *inode, struct file *file)
4639{
4640 __simple_attr_check_format("%llu\n", 0ull);
4641 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
4642 kvm_stat_data_clear, "%llu\n");
4643}
4644
4645static const struct file_operations stat_fops_per_vm = {
4646 .owner = THIS_MODULE,
4647 .open = kvm_stat_data_open,
4648 .release = kvm_debugfs_release,
4649 .read = simple_attr_read,
4650 .write = simple_attr_write,
4651 .llseek = no_llseek,
4652};
4653
4654static int vm_stat_get(void *_offset, u64 *val)
4655{
4656 unsigned offset = (long)_offset;
4657 struct kvm *kvm;
4658 u64 tmp_val;
4659
4660 *val = 0;
4661 mutex_lock(&kvm_lock);
4662 list_for_each_entry(kvm, &vm_list, vm_list) {
4663 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
4664 *val += tmp_val;
4665 }
4666 mutex_unlock(&kvm_lock);
4667 return 0;
4668}
4669
4670static int vm_stat_clear(void *_offset, u64 val)
4671{
4672 unsigned offset = (long)_offset;
4673 struct kvm *kvm;
4674
4675 if (val)
4676 return -EINVAL;
4677
4678 mutex_lock(&kvm_lock);
4679 list_for_each_entry(kvm, &vm_list, vm_list) {
4680 kvm_clear_stat_per_vm(kvm, offset);
4681 }
4682 mutex_unlock(&kvm_lock);
4683
4684 return 0;
4685}
4686
4687DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
4688
4689static int vcpu_stat_get(void *_offset, u64 *val)
4690{
4691 unsigned offset = (long)_offset;
4692 struct kvm *kvm;
4693 u64 tmp_val;
4694
4695 *val = 0;
4696 mutex_lock(&kvm_lock);
4697 list_for_each_entry(kvm, &vm_list, vm_list) {
4698 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
4699 *val += tmp_val;
4700 }
4701 mutex_unlock(&kvm_lock);
4702 return 0;
4703}
4704
4705static int vcpu_stat_clear(void *_offset, u64 val)
4706{
4707 unsigned offset = (long)_offset;
4708 struct kvm *kvm;
4709
4710 if (val)
4711 return -EINVAL;
4712
4713 mutex_lock(&kvm_lock);
4714 list_for_each_entry(kvm, &vm_list, vm_list) {
4715 kvm_clear_stat_per_vcpu(kvm, offset);
4716 }
4717 mutex_unlock(&kvm_lock);
4718
4719 return 0;
4720}
4721
4722DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
4723 "%llu\n");
4724
4725static const struct file_operations *stat_fops[] = {
4726 [KVM_STAT_VCPU] = &vcpu_stat_fops,
4727 [KVM_STAT_VM] = &vm_stat_fops,
4728};
4729
4730static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
4731{
4732 struct kobj_uevent_env *env;
4733 unsigned long long created, active;
4734
4735 if (!kvm_dev.this_device || !kvm)
4736 return;
4737
4738 mutex_lock(&kvm_lock);
4739 if (type == KVM_EVENT_CREATE_VM) {
4740 kvm_createvm_count++;
4741 kvm_active_vms++;
4742 } else if (type == KVM_EVENT_DESTROY_VM) {
4743 kvm_active_vms--;
4744 }
4745 created = kvm_createvm_count;
4746 active = kvm_active_vms;
4747 mutex_unlock(&kvm_lock);
4748
4749 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
4750 if (!env)
4751 return;
4752
4753 add_uevent_var(env, "CREATED=%llu", created);
4754 add_uevent_var(env, "COUNT=%llu", active);
4755
4756 if (type == KVM_EVENT_CREATE_VM) {
4757 add_uevent_var(env, "EVENT=create");
4758 kvm->userspace_pid = task_pid_nr(current);
4759 } else if (type == KVM_EVENT_DESTROY_VM) {
4760 add_uevent_var(env, "EVENT=destroy");
4761 }
4762 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
4763
4764 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4765 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
4766
4767 if (p) {
4768 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
4769 if (!IS_ERR(tmp))
4770 add_uevent_var(env, "STATS_PATH=%s", tmp);
4771 kfree(p);
4772 }
4773 }
4774
4775 env->envp[env->envp_idx++] = NULL;
4776 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
4777 kfree(env);
4778}
4779
4780static void kvm_init_debug(void)
4781{
4782 struct kvm_stats_debugfs_item *p;
4783
4784 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4785
4786 kvm_debugfs_num_entries = 0;
4787 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4788 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
4789 kvm_debugfs_dir, (void *)(long)p->offset,
4790 stat_fops[p->kind]);
4791 }
4792}
4793
4794static int kvm_suspend(void)
4795{
4796 if (kvm_usage_count)
4797 hardware_disable_nolock(NULL);
4798 return 0;
4799}
4800
static void kvm_resume(void)
{
	if (kvm_usage_count) {
#ifdef CONFIG_LOCKDEP
		WARN_ON(lockdep_is_held(&kvm_count_lock));
#endif
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

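/*
 * Preempt notifier hooks: kvm_sched_in()/kvm_sched_out() keep the per-CPU
 * kvm_running_vcpu pointer up to date and let the architecture load and put
 * vCPU state as the vCPU task is scheduled in and out.
 */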
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	WRITE_ONCE(vcpu->preempted, false);
	WRITE_ONCE(vcpu->ready, false);

	__this_cpu_write(kvm_running_vcpu, vcpu);
	kvm_arch_sched_in(vcpu, cpu);
	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	/*
	 * Only an involuntary preemption leaves the task in TASK_RUNNING;
	 * a vCPU that blocked voluntarily is not marked preempted/ready.
	 */
	if (current->state == TASK_RUNNING) {
		WRITE_ONCE(vcpu->preempted, true);
		WRITE_ONCE(vcpu->ready, true);
	}
	kvm_arch_vcpu_put(vcpu);
	__this_cpu_write(kvm_running_vcpu, NULL);
}
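/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * The snapshot is taken with preemption disabled, but it stays meaningful
 * for a caller that is itself the vCPU task: the preempt notifiers update
 * the per-CPU variable on every migration, so the task keeps reading its
 * own vcpu pointer even after preempt_enable().
 */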
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
	struct kvm_vcpu *vcpu;

	preempt_disable();
	vcpu = __this_cpu_read(kvm_running_vcpu);
	preempt_enable();

	return vcpu;
}
EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
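/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */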
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
	return &kvm_running_vcpu;
}

struct kvm_cpu_compat_check {
	void *opaque;
	int *ret;
};

static void check_processor_compat(void *data)
{
	struct kvm_cpu_compat_check *c = data;

	*c->ret = kvm_arch_check_processor_compat(c->opaque);
}

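/*
 * Arch-independent module initialization: initialize the architecture,
 * verify every online CPU is compatible, wire up CPU hotplug, reboot and
 * syscore hooks, create the vcpu slab cache, and finally register /dev/kvm.
 */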
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	struct kvm_cpu_compat_check c;
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	/*
	 * kvm_arch_init() rejects a second, conflicting implementation (e.g.
	 * a second vendor module on x86), so it must run before
	 * kvm_irqfd_init() to avoid setting up irqfd state that would then
	 * have to be torn down again.
	 */
	r = kvm_irqfd_init();
	if (r)
		goto out_irqfd;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup(opaque);
	if (r < 0)
		goto out_free_1;

	c.ret = &r;
	c.opaque = opaque;
	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu, check_processor_compat, &c, 1);
		if (r < 0)
			goto out_free_2;
	}

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				      kvm_starting_cpu, kvm_dying_cpu);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A dedicated slab cache honors the arch's vcpu size and alignment. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache =
		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
					   SLAB_ACCOUNT,
					   offsetof(struct kvm_vcpu, arch),
					   sizeof_field(struct kvm_vcpu, arch),
					   NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		pr_err("kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	r = kvm_vfio_ops_init();
	WARN_ON(r);

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
	kvm_arch_hardware_unsetup();
out_free_1:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

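/* Tear down everything set up by kvm_init() and disable virtualization on all CPUs. */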
void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;
	struct completion init_done;
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;
};

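/*
 * Body of a per-VM worker kthread.  It marks itself to be parked, joins the
 * spawner's cgroups and inherits its nice level, reports the setup result
 * through init_done, then parks until the spawner unparks it and finally
 * runs thread_fn().
 */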
static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context lives on the spawning thread's stack, so copy out
	 * everything that is needed beyond initialization before signalling
	 * init_done.
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is not expected to fail for a live kthread. */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	init_context = NULL;

	if (err)
		return err;

	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}

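/*
 * Spawn a VM worker kthread named "<name>-<pid>".  The init context lives on
 * this stack, so wait for the worker to finish its setup (init_done) before
 * returning; on success the parked task is handed back via @thread_ptr.
 */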
int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run() returns either a valid task or ERR_PTR(), never NULL. */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}