// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */
#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from. */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default behavior on shrink is to reset per-vcpu halt_poll_ns to 0. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
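
/*
 * Together these parameters implement adaptive halt polling: on a
 * successful (short) halt, a vCPU's halt_poll_ns grows by a factor of
 * halt_poll_ns_grow, starting from halt_poll_ns_grow_start; on an
 * unsuccessful (long) halt it shrinks by a factor of halt_poll_ns_shrink,
 * where 0 means "reset polling to 0".  halt_poll_ns itself supplies the
 * default per-VM cap (see kvm->max_halt_poll_ns in kvm_create_vm() below).
 */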

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static int kvm_debugfs_num_entries;
static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}

bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
		return false;

	return is_zone_device_page(pfn_to_page(pfn));
}

bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn)) &&
		       !is_zero_pfn(pfn) &&
		       !kvm_is_zone_device_pfn(pfn);

	return true;
}

bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);

	if (!PageTransCompoundMap(page))
		return false;

	return is_transparent_hugepage(compound_head(page));
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * Requests that wait for the target vCPU to actually leave guest
	 * mode (KVM_REQUEST_WAIT) need an IPI whenever the vCPU is not fully
	 * outside guest mode, i.e. also while it is reading shadow page
	 * tables with interrupts disabled.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * All other requests only need to kick a vCPU that is currently
	 * running in guest mode.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_flush(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
{
	if (unlikely(!cpus))
		cpus = cpu_online_mask;

	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_flush, NULL, wait);
	return true;
}
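
/*
 * A NULL @cpus above means the caller's cpumask allocation failed (see
 * kvm_make_all_cpus_request_except() below, which passes its
 * possibly-unallocated mask straight through).  Falling back to
 * cpu_online_mask trades precision for safety: every online CPU is kicked,
 * not just those currently running targeted vCPUs.
 */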

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 struct kvm_vcpu *except,
				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
{
	int i, cpu, me;
	struct kvm_vcpu *vcpu;
	bool called;

	me = get_cpu();

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
		    vcpu == except)
			continue;

		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
			continue;

		if (tmp != NULL && cpu != -1 && cpu != me &&
		    kvm_request_needs_ipi(vcpu, req))
			__cpumask_set_cpu(cpu, tmp);
	}

	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	cpumask_var_t cpus;
	bool called;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);

	free_cpumask_var(cpus);
	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
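
/*
 * Requests posted by the helpers above are consumed on the vCPU side with
 * kvm_check_request(), which pairs with kvm_make_request() via an atomic
 * test-and-clear of the request bit plus the required memory barriers.
 * A typical arch-side consumer looks roughly like this (sketch, not code
 * from this file):
 *
 *	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 *		flush_local_tlb(vcpu);
 */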

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	/*
	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
	 * kvm_make_all_cpus_request.
	 */
	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode. Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 * and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	void *obj;

	if (mc->nobjs >= min)
		return 0;
	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif
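
/*
 * The memory caches above exist so that arch MMU code can pre-allocate
 * ("top up") objects with GFP_KERNEL while it may still sleep, and then
 * consume them from kvm_mmu_memory_cache_alloc() under mmu_lock, where
 * sleeping is not allowed; the GFP_ATOMIC path there is a last-ditch
 * fallback for a cache that was not topped up far enough.
 */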

static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	rcuwait_init(&vcpu->wait);
	kvm_async_pf_vcpu_init(vcpu);

	vcpu->pre_pcpu = -1;
	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
}

void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;

	if (kvm_set_spte_hva(kvm, address, pte))
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
					     range->flags);
	need_tlb_flush |= kvm->tlbs_dirty;
	/* we've to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return 0;
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, start, end);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively oopses
	 * out of such situations.
	 */
	young = kvm_age_hva(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
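
/*
 * How arch page-fault code is expected to consume the pair of counters
 * updated above (a sketch of the usual pattern, not code from this file):
 * sample kvm->mmu_notifier_seq before translating an hva to a pfn, then,
 * under mmu_lock and before installing the spte, bail out and retry if
 * either mmu_notifier_count is nonzero or the sampled sequence number has
 * changed.  The mmu_notifier_retry() helper in kvm_host.h encapsulates
 * that check.
 */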

static struct kvm_memslots *kvm_alloc_memslots(void)
{
	int i;
	struct kvm_memslots *slots;

	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
	if (!slots)
		return NULL;

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = -1;

	return slots;
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	slot->flags = 0;
	slot->npages = 0;
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct kvm_memory_slot *memslot;

	if (!slots)
		return;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_memslot(kvm, memslot);

	kvfree(slots);
}

static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;

	if (!kvm->debugfs_dentry)
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	struct kvm_stats_debugfs_item *p;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
	kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);

	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		return -ENOMEM;

	for (p = debugfs_entries; p->name; p++) {
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->dbgfs_item = p;
		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}
	return 0;
}
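
/*
 * The per-VM debugfs directory created above is named "<pid>-<fd>", e.g.
 * "1234-11" for the VM whose file descriptor 11 was created by task 1234,
 * and lives under kvm_debugfs_dir (normally <debugfs>/kvm).  ITOA_MAX_LEN * 2
 * sizes the buffer for two worst-case decimal integers plus the separator.
 */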

/*
 * Called after the VM is otherwise initialized, but only just before
 * adding it to the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

static struct kvm *kvm_create_vm(unsigned long type)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	int r = -ENOMEM;
	int i;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kvm->mmu_lock);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	INIT_LIST_HEAD(&kvm->devices);

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct kvm_memslots *slots = kvm_alloc_memslots();

		if (!slots)
			goto out_err_no_arch_destroy_vm;
		/* Generations must be different for each address space. */
		slots->generation = i;
		rcu_assign_pointer(kvm->memslots[i], slots);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	kvm->max_halt_poll_ns = halt_poll_ns;

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();

	return kvm;

out_err:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vfio device.  This must never
 * drop the last reference, i.e. kvm_destroy_vm() must not be reachable from
 * here: the caller is expected to still hold the reference that pins the VM
 * for the lifetime of the file descriptor itself, hence the WARN_ON.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * This makes it possible to do double buffering: see the second half
 * used via kvm_second_dirty_bitmap() in the dirty-log code below.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}

/*
 * Delete a memslot by decrementing the number of used slots and shifting all
 * other entries in the array forward one spot.
 */
static inline void kvm_memslot_delete(struct kvm_memslots *slots,
				      struct kvm_memory_slot *memslot)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	if (WARN_ON(slots->id_to_index[memslot->id] == -1))
		return;

	slots->used_slots--;

	if (atomic_read(&slots->lru_slot) >= slots->used_slots)
		atomic_set(&slots->lru_slot, 0);

	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
		mslots[i] = mslots[i + 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	mslots[i] = *memslot;
	slots->id_to_index[memslot->id] = -1;
}

/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslots array.
 */
static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
{
	return slots->used_slots++;
}

/*
 * Move a changed memslot backwards in the array by shifting existing slots
 * with a higher GFN toward the front of the array.  Note, the changed memslot
 * itself is not preserved in the array, i.e. not swapped at this time, only
 * its new index into the array is tracked.  Returns the changed memslot's
 * current index into the memslots array.
 */
static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
					    struct kvm_memory_slot *memslot)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
	    WARN_ON_ONCE(!slots->used_slots))
		return -1;

	/*
	 * Move the target memslot backward in the array by shifting existing
	 * memslots with a higher GFN (than the target memslot) towards the
	 * front of the array.
	 */
	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
		if (memslot->base_gfn > mslots[i + 1].base_gfn)
			break;

		WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);

		/* Shift the next memslot forward one and update its index. */
		mslots[i] = mslots[i + 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	return i;
}

/*
 * Move a changed memslot forwards in the array by shifting existing slots of
 * lower GFNs toward the back of the array.  Note, the changed memslot itself
 * is not preserved in the array, i.e. not swapped at this time, only its new
 * index into the array is tracked.  Returns the changed memslot's final index
 * into the memslots array.
 */
static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
					   struct kvm_memory_slot *memslot,
					   int start)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	for (i = start; i > 0; i--) {
		if (memslot->base_gfn < mslots[i - 1].base_gfn)
			break;

		WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);

		/* Shift the next memslot back one and update its index. */
		mslots[i] = mslots[i - 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	return i;
}

/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows using a binary search during
 * memslot lookup.
 *
 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
 * at memslots[0] has the highest GFN.
 *
 * The sorting algorithm takes advantage of having initially sorted memslots
 * and knowing the position of the changed memslot.  Sorting is also optimized
 * by not swapping the updated memslot and instead only shifting other
 * memslots and tracking the new index for the updated memslot.  Only once its
 * final index is known is the updated memslot copied into its position in the
 * array.
 *
 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
 *    the end of the array.
 *
 *  - When creating a memslot, the array is "grown" at the end and the new
 *    entry is moved forward to its correct, GFN-sorted position.
 *
 *  - When moving a memslot, the sort order can change in either direction,
 *    so the slot is moved backward and then forward as needed.
 *
 *  - When changing only flags, the memslot's position does not change and
 *    both moves degenerate to no-ops.
 */
static void update_memslots(struct kvm_memslots *slots,
			    struct kvm_memory_slot *memslot,
			    enum kvm_mr_change change)
{
	int i;

	if (change == KVM_MR_DELETE) {
		kvm_memslot_delete(slots, memslot);
	} else {
		if (change == KVM_MR_CREATE)
			i = kvm_memslot_insert_back(slots);
		else
			i = kvm_memslot_move_backward(slots, memslot);
		i = kvm_memslot_move_forward(slots, memslot, i);

		/*
		 * Copy the memslot to its new position in memslots and update
		 * its index accordingly.
		 */
		slots->memslots[i] = *memslot;
		slots->id_to_index[memslot->id] = i;
	}
}
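
/*
 * Worked example (illustrative only): with three slots at base_gfn 0x300,
 * 0x200 and 0x100 the array reads, highest GFN first,
 *
 *	memslots[] = { 0x300, 0x200, 0x100 }
 *
 * Moving the 0x100 slot to 0x250 first "moves it backward" past nothing
 * (no slot to its right), then "moves it forward" past the 0x200 slot,
 * yielding { 0x300, 0x250, 0x200 } with id_to_index[] updated at each shift.
 */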

static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef __KVM_HAVE_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
		int as_id, struct kvm_memslots *slots)
{
	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
	u64 gen = old_memslots->generation;

	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	rcu_assign_pointer(kvm->memslots[as_id], slots);
	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag.  This allows the generation to be unique
	 * and easily identifiable while the memslots were in flux.
	 */
	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Generations must be unique even across address spaces.  We do not
	 * need a global counter for that, instead the generation of a memslot
	 * is incremented by the number of address spaces, so that each address
	 * space draws generations from a disjoint sequence.
	 */
	gen += KVM_ADDRESS_SPACE_NUM;

	kvm_arch_memslots_updated(kvm, gen);

	slots->generation = gen;

	return old_memslots;
}
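
/*
 * Numeric sketch of the generation dance above (illustrative, assuming
 * KVM_ADDRESS_SPACE_NUM == 2): an address space at generation 4 publishes
 * the new memslots as 4 | UPDATE_IN_PROGRESS, waits out SRCU readers, then
 * settles on 4 + 2 = 6.  Neither the in-flux value nor the retired value 4
 * can ever be observed as current again, which is what lets generation-
 * tagged caches (e.g. struct gfn_to_hva_cache below) detect stale memslots
 * with a single comparison.
 */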

/*
 * Note, at a minimum, the current number of used slots must be allocated,
 * even when deleting a memslot, as we need a complete duplicate of the
 * memslots for use when invalidating a memslot prior to deleting/moving it.
 */
static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
					     enum kvm_mr_change change)
{
	struct kvm_memslots *slots;
	size_t old_size, new_size;

	old_size = sizeof(struct kvm_memslots) +
		   (sizeof(struct kvm_memory_slot) * old->used_slots);

	if (change == KVM_MR_CREATE)
		new_size = old_size + sizeof(struct kvm_memory_slot);
	else
		new_size = old_size;

	slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
	if (likely(slots))
		memcpy(slots, old, old_size);

	return slots;
}

static int kvm_set_memslot(struct kvm *kvm,
			   const struct kvm_userspace_memory_region *mem,
			   struct kvm_memory_slot *old,
			   struct kvm_memory_slot *new, int as_id,
			   enum kvm_mr_change change)
{
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int r;

	slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
	if (!slots)
		return -ENOMEM;

	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
		/*
		 * Note, the INVALID flag needs to be in the appropriate entry
		 * in the freshly allocated memslots, not in @old or @new.
		 */
		slot = id_to_memslot(slots, old->id);
		slot->flags |= KVM_MEMSLOT_INVALID;

		/*
		 * We can re-use the old memslots, the only difference from the
		 * newly installed memslots is the invalid flag, which will get
		 * dropped by update_memslots anyway.  We'll also revert to the
		 * old memslots if preparing the new memory region fails.
		 */
		slots = install_new_memslots(kvm, as_id, slots);

		/*
		 * From this point no new shadow pages pointing to a deleted,
		 * or moved, memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 *	- kvm_is_visible_gfn (mmu_check_root)
		 */
		kvm_arch_flush_shadow_memslot(kvm, slot);
	}

	r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
	if (r)
		goto out_slots;

	update_memslots(slots, new, change);
	slots = install_new_memslots(kvm, as_id, slots);

	kvm_arch_commit_memory_region(kvm, mem, old, new, change);

	kvfree(slots);
	return 0;

out_slots:
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
		slots = install_new_memslots(kvm, as_id, slots);
	kvfree(slots);
	return r;
}

static int kvm_delete_memslot(struct kvm *kvm,
			      const struct kvm_userspace_memory_region *mem,
			      struct kvm_memory_slot *old, int as_id)
{
	struct kvm_memory_slot new;
	int r;

	if (!old->npages)
		return -EINVAL;

	memset(&new, 0, sizeof(new));
	new.id = old->id;

	r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
	if (r)
		return r;

	kvm_free_memslot(kvm, old);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    const struct kvm_userspace_memory_region *mem)
{
	struct kvm_memory_slot old, new;
	struct kvm_memory_slot *tmp;
	enum kvm_mr_change change;
	int as_id, id;
	int r;

	r = check_memory_region_flags(mem);
	if (r)
		return r;

	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		return -EINVAL;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		return -EINVAL;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size))
		return -EINVAL;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
		return -EINVAL;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		return -EINVAL;

	/*
	 * Make a full copy of the old memslot, the pointer will become stale
	 * when the memslots are re-sorted by update_memslots(), and the old
	 * memslot needs to be referenced after calling update_memslots(), e.g.
	 * to free its resources and for arch specific memory region setup.
	 */
	tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
	if (tmp) {
		old = *tmp;
		tmp = NULL;
	} else {
		memset(&old, 0, sizeof(old));
		old.id = id;
	}

	if (!mem->memory_size)
		return kvm_delete_memslot(kvm, mem, &old, as_id);

	new.id = id;
	new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	new.npages = mem->memory_size >> PAGE_SHIFT;
	new.flags = mem->flags;
	new.userspace_addr = mem->userspace_addr;

	if (new.npages > KVM_MEM_MAX_NR_PAGES)
		return -EINVAL;

	if (!old.npages) {
		change = KVM_MR_CREATE;
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	} else { /* Modify an existing slot. */
		if ((new.userspace_addr != old.userspace_addr) ||
		    (new.npages != old.npages) ||
		    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
			return -EINVAL;

		if (new.base_gfn != old.base_gfn)
			change = KVM_MR_MOVE;
		else if (new.flags != old.flags)
			change = KVM_MR_FLAGS_ONLY;
		else /* Nothing to change. */
			return 0;

		/* Copy dirty_bitmap and arch from the current memslot. */
		new.dirty_bitmap = old.dirty_bitmap;
		memcpy(&new.arch, &old.arch, sizeof(new.arch));
	}

	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
		kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
			if (tmp->id == id)
				continue;
			if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
			      (new.base_gfn >= tmp->base_gfn + tmp->npages)))
				return -EEXIST;
		}
	}

	/* Allocate/free page dirty bitmap as needed */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;
	else if (!new.dirty_bitmap) {
		r = kvm_alloc_dirty_bitmap(&new);
		if (r)
			return r;

		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			bitmap_set(new.dirty_bitmap, 0, new.npages);
	}

	r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
	if (r)
		goto out_bitmap;

	if (old.dirty_bitmap && !new.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&old);
	return 0;

out_bitmap:
	if (new.dirty_bitmap && !old.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&new);
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  const struct kvm_userspace_memory_region *mem)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
					  struct kvm_userspace_memory_region *mem)
{
	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	return kvm_set_memory_region(kvm, mem);
}
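
/*
 * On the "slot" field encoding used above: userspace packs an address space
 * id into bits 31:16 and the slot id into bits 15:0, hence the
 * "mem->slot >> 16" / "(u16)mem->slot" split in __kvm_set_memory_region()
 * and in the dirty-log ioctls below.  Only ids below KVM_USER_MEM_SLOTS are
 * settable from userspace; higher ids are reserved for internal memslots.
 */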

#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
		      int *is_dirty, struct kvm_memory_slot **memslot)
{
	struct kvm_memslots *slots;
	int i, as_id, id;
	unsigned long n;
	unsigned long any = 0;

	*memslot = NULL;
	*is_dirty = 0;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	*memslot = id_to_memslot(slots, id);
	if (!(*memslot) || !(*memslot)->dirty_bitmap)
		return -ENOENT;

	kvm_arch_sync_dirty_log(kvm, *memslot);

	n = kvm_dirty_bitmap_bytes(*memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = (*memslot)->dirty_bitmap[i];

	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
		return -EFAULT;

	if (any)
		*is_dirty = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);

#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently. So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Upon return caller flushes TLB's if needed.
 *
 * Between 2 and 4, the guest may write to the page using the remaining TLB
 * entry.  This is not a problem because the page is reported dirty using
 * the snapshot taken before and step 4 ensures that writes done after
 * exiting to userspace will be logged for the next call.
 */
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, as_id, id;
	unsigned long n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	kvm_arch_sync_dirty_log(kvm, memslot);

	n = kvm_dirty_bitmap_bytes(memslot);
	flush = false;
	if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike below, we do not clear the bitmap or write-protect
		 * anything: with manual protection enabled, no flush is
		 * needed until userspace issues KVM_CLEAR_DIRTY_LOG, so the
		 * live bitmap can be copied out as-is.
		 */
		dirty_bitmap_buffer = dirty_bitmap;
	} else {
		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
		memset(dirty_bitmap_buffer, 0, n);

		spin_lock(&kvm->mmu_lock);
		for (i = 0; i < n / sizeof(long); i++) {
			unsigned long mask;
			gfn_t offset;

			if (!dirty_bitmap[i])
				continue;

			flush = true;
			mask = xchg(&dirty_bitmap[i], 0);
			dirty_bitmap_buffer[i] = mask;

			offset = i * BITS_PER_LONG;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
		spin_unlock(&kvm->mmu_lock);
	}

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
		return -EFAULT;
	return 0;
}

/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm:	kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * Steps 1-4 below provide a general overview of dirty page logging. See
 * kvm_get_dirty_log_protect() function description for additional details.
 *
 * We call kvm_get_dirty_log_protect() to handle steps 1-4; on detection of
 * dirty pages the remote TLBs are flushed for the memslot:
 *
 *   1. Take a snapshot of the bit and clear it if needed.
 *   2. Write protect the corresponding page.
 *   3. Flush TLB's if needed.
 *   4. Copy the snapshot to the userspace.
 */
static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
				      struct kvm_dirty_log *log)
{
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_get_dirty_log_protect(kvm, log);

	mutex_unlock(&kvm->slots_lock);
	return r;
}

/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
static int kvm_clear_dirty_log_protect(struct kvm *kvm,
				       struct kvm_clear_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int as_id, id;
	gfn_t offset;
	unsigned long i, n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	if (log->first_page & 63)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;

	if (log->first_page > memslot->npages ||
	    log->num_pages > memslot->npages - log->first_page ||
	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
		return -EINVAL;

	kvm_arch_sync_dirty_log(kvm, memslot);

	flush = false;
	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
		return -EFAULT;

	spin_lock(&kvm->mmu_lock);
	for (offset = log->first_page, i = offset / BITS_PER_LONG,
		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
	     i++, offset += BITS_PER_LONG) {
		unsigned long mask = *dirty_bitmap_buffer++;
		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];

		if (!mask)
			continue;

		mask &= atomic_long_fetch_andnot(mask, p);

		/*
		 * mask contains the bits that really have been cleared.  This
		 * never includes any bits beyond the length of the memslot (if
		 * the length is not aligned to 64 pages), therefore it is not
		 * a problem if userspace sets them in log->dirty_bitmap.
		 */
		if (mask) {
			flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}
	spin_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	return 0;
}
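
/*
 * Why the double-sized bitmap allocated in kvm_alloc_dirty_bitmap() matters
 * here: the second half (kvm_second_dirty_bitmap()) serves as a scratch
 * buffer, either as the snapshot destination in kvm_get_dirty_log_protect()
 * or, above, as the staging area for the userspace-supplied mask, so the
 * live bitmap written by vCPU threads is only ever touched with atomic ops.
 */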

static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
					struct kvm_clear_dirty_log *log)
{
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_clear_dirty_log_protect(kvm, log);

	mutex_unlock(&kvm->slots_lock);
	return r;
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	return kvm_is_visible_memslot(memslot);
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return kvm_is_visible_memslot(memslot);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	mmap_read_lock(current->mm);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	mmap_read_unlock(current->mm);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				 gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Return the hva of a @gfn and the R/W attribute if possible.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				      gfn_t gfn, bool *writable)
{
	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

	if (!kvm_is_error_hva(hva) && writable)
		*writable = !memslot_is_readonly(slot);

	return hva;
}

unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

	rc = get_user_pages(addr, 1, flags, NULL, NULL);
	return rc == -EHWPOISON;
}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that runs if we can in atomic context.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
			    bool *writable, kvm_pfn_t *pfn)
{
	struct page *page[1];

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
	if (!(write_fault || writable))
		return false;

	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
		*pfn = page_to_pfn(page[0]);

		if (writable)
			*writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
			   bool *writable, kvm_pfn_t *pfn)
{
	unsigned int flags = FOLL_HWPOISON;
	struct page *page;
	int npages = 0;

	might_sleep();

	if (writable)
		*writable = write_fault;

	if (write_fault)
		flags |= FOLL_WRITE;
	if (async)
		flags |= FOLL_NOWAIT;

	npages = get_user_pages_unlocked(addr, 1, &page, flags);
	if (npages != 1)
		return npages;

	/* map read fault as writable if possible */
	if (unlikely(!write_fault) && writable) {
		struct page *wpage;

		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
			*writable = true;
			put_page(page);
			page = wpage;
		}
	}
	*pfn = page_to_pfn(page);
	return npages;
}

static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
	if (unlikely(!(vma->vm_flags & VM_READ)))
		return false;

	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
		return false;

	return true;
}

static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       unsigned long addr, bool *async,
			       bool write_fault, bool *writable,
			       kvm_pfn_t *p_pfn)
{
	unsigned long pfn;
	int r;

	r = follow_pfn(vma, addr, &pfn);
	if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
		bool unlocked = false;

		r = fixup_user_fault(current->mm, addr,
				     (write_fault ? FAULT_FLAG_WRITE : 0),
				     &unlocked);
		if (unlocked)
			return -EAGAIN;
		if (r)
			return r;

		r = follow_pfn(vma, addr, &pfn);
		if (r)
			return r;
	}

	if (writable)
		*writable = true;

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
	 * simply do nothing for reserved pfns.
	 *
	 * Whoever called remap_pfn_range is also going to call e.g.
	 * unmap_mapping_range before the underlying pages are freed,
	 * causing a call to our MMU notifier.
	 */
	kvm_get_pfn(pfn);

	*p_pfn = pfn;
	return 0;
}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether this function can sleep
 * @async: whether this function need to wait IO complete if the
 *         host page is not in the memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			    bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn = 0;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	mmap_read_lock(current->mm);
	if (npages == -EHWPOISON ||
	    (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = find_vma_intersection(current->mm, addr, addr + 1);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	mmap_read_unlock(current->mm);
	return pfn;
}
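
/*
 * Summary of the resolution order implemented above: try the lockless fast
 * path first (already-mapped, writable pages); in atomic context that is
 * the only attempt made.  Otherwise fall back to the sleeping slow path
 * via get_user_pages, and finally, for VM_IO/VM_PFNMAP mappings that GUP
 * cannot handle, walk the VMA and translate the pfn directly.  An @async
 * caller gets KVM_PFN_ERR_FAULT plus *async = true instead of waiting for
 * I/O, allowing arch code to set up an async page fault.
 */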

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
			       bool atomic, bool *async, bool write_fault,
			       bool *writable)
{
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

	if (addr == KVM_HVA_ERR_RO_BAD) {
		if (writable)
			*writable = false;
		return KVM_PFN_ERR_RO_FAULT;
	}

	if (kvm_is_error_hva(addr)) {
		if (writable)
			*writable = false;
		return KVM_PFN_NOSLOT;
	}

	/* Do not map writable pfn in the readonly memslot. */
	if (writable && memslot_is_readonly(slot)) {
		*writable = false;
		writable = NULL;
	}

	return hva_to_pfn(addr, atomic, async, write_fault,
			  writable);
}
EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);

kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
			  bool *writable)
{
	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
				    write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);

kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);

kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);

kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);

int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
			    struct page **pages, int nr_pages)
{
	unsigned long addr;
	gfn_t entry = 0;

	addr = gfn_to_hva_many(slot, gfn, &entry);
	if (kvm_is_error_hva(addr))
		return -1;

	if (entry < nr_pages)
		return 0;

	return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);

static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
{
	if (is_error_noslot_pfn(pfn))
		return KVM_ERR_PTR_BAD_PAGE;

	if (kvm_is_reserved_pfn(pfn)) {
		WARN_ON(1);
		return KVM_ERR_PTR_BAD_PAGE;
	}

	return pfn_to_page(pfn);
}

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	kvm_pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);

	return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
{
	if (pfn == 0)
		return;

	if (cache)
		cache->pfn = cache->gfn = 0;

	if (dirty)
		kvm_release_pfn_dirty(pfn);
	else
		kvm_release_pfn_clean(pfn);
}

static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
				 struct gfn_to_pfn_cache *cache, u64 gen)
{
	kvm_release_pfn(cache->pfn, cache->dirty, cache);

	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
	cache->gfn = gfn;
	cache->dirty = false;
	cache->generation = gen;
}

static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
			 struct kvm_host_map *map,
			 struct gfn_to_pfn_cache *cache,
			 bool atomic)
{
	kvm_pfn_t pfn;
	void *hva = NULL;
	struct page *page = KVM_UNMAPPED_PAGE;
	struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
	u64 gen = slots->generation;

	if (!map)
		return -EINVAL;

	if (cache) {
		if (!cache->pfn || cache->gfn != gfn ||
		    cache->generation != gen) {
			if (atomic)
				return -EAGAIN;
			kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
		}
		pfn = cache->pfn;
	} else {
		if (atomic)
			return -EAGAIN;
		pfn = gfn_to_pfn_memslot(slot, gfn);
	}
	if (is_error_noslot_pfn(pfn))
		return -EINVAL;

	if (pfn_valid(pfn)) {
		page = pfn_to_page(pfn);
		if (atomic)
			hva = kmap_atomic(page);
		else
			hva = kmap(page);
#ifdef CONFIG_HAS_IOMEM
	} else if (!atomic) {
		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
	} else {
		return -EINVAL;
#endif
	}

	if (!hva)
		return -EFAULT;

	map->page = page;
	map->hva = hva;
	map->pfn = pfn;
	map->gfn = gfn;

	return 0;
}
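
/*
 * The optional @cache lets callers that map the same gfn repeatedly (e.g.
 * frequently updated guest records) skip the gfn->pfn translation on every
 * access: the cached pfn is reused until the gfn changes or the memslot
 * generation moves on, and kvm_cache_gfn_to_pfn() releases the old pfn,
 * propagating accumulated dirtiness, before caching a new one.  Atomic
 * callers never refill the cache; they return -EAGAIN and let a sleeping
 * caller do it.
 */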

int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
		struct gfn_to_pfn_cache *cache, bool atomic)
{
	return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
			cache, atomic);
}
EXPORT_SYMBOL_GPL(kvm_map_gfn);

int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
{
	return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
		NULL, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_map);

static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
			struct kvm_host_map *map,
			struct gfn_to_pfn_cache *cache,
			bool dirty, bool atomic)
{
	if (!map)
		return;

	if (!map->hva)
		return;

	if (map->page != KVM_UNMAPPED_PAGE) {
		if (atomic)
			kunmap_atomic(map->hva);
		else
			kunmap(map->page);
	}
#ifdef CONFIG_HAS_IOMEM
	else if (!atomic)
		memunmap(map->hva);
	else
		WARN_ONCE(1, "Unexpected unmapping in atomic context");
#endif

	if (dirty)
		mark_page_dirty_in_slot(memslot, map->gfn);

	if (cache)
		cache->dirty |= dirty;
	else
		kvm_release_pfn(map->pfn, dirty, NULL);

	map->hva = NULL;
	map->page = NULL;
}

int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
{
	__kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
			cache, dirty, atomic);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_unmap_gfn);

void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
{
	__kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
			dirty, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);

struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	kvm_pfn_t pfn;

	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);

	return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	WARN_ON(is_error_page(page));

	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(kvm_pfn_t pfn)
{
	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	WARN_ON(is_error_page(page));

	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(kvm_pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_pfn_dirty(kvm_pfn_t pfn)
{
	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
		SetPageDirty(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(kvm_pfn_t pfn)
{
	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(kvm_pfn_t pfn)
{
	if (!kvm_is_reserved_pfn(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}
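
/*
 * next_segment() drives the page-by-page loops below.  Example: a 5000-byte
 * access starting at offset 3000 within a 4096-byte page is split into a
 * 1096-byte segment (the rest of the first page) followed by a 3904-byte
 * segment at offset 0 of the next gfn.
 */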

static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
				 void *data, int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return __kvm_read_guest_page(slot, gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
			     int offset, int len)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return __kvm_read_guest_page(slot, gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);

static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
				   void *data, int offset, unsigned long len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}

int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
			       void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	int offset = offset_in_page(gpa);

	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);

static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
				  const void *data, int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva_memslot(memslot, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(memslot, gfn);
	return 0;
}

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
			 const void *data, int offset, int len)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return __kvm_write_guest_page(slot, gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
			      const void *data, int offset, int len)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return __kvm_write_guest_page(slot, gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest);

int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
			 unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2493static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2494 struct gfn_to_hva_cache *ghc,
2495 gpa_t gpa, unsigned long len)
2496{
2497 int offset = offset_in_page(gpa);
2498 gfn_t start_gfn = gpa >> PAGE_SHIFT;
2499 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2500 gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2501 gfn_t nr_pages_avail;
2502
 /* Update ghc->generation before performing any error checks. */
2504 ghc->generation = slots->generation;
2505
2506 if (start_gfn > end_gfn) {
2507 ghc->hva = KVM_HVA_ERR_BAD;
2508 return -EINVAL;
2509 }
2510
 /*
 * If the requested region crosses two memslots, we still
 * verify that the entire region is valid here.
 */
2515 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2516 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2517 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2518 &nr_pages_avail);
2519 if (kvm_is_error_hva(ghc->hva))
2520 return -EFAULT;
2521 }
2522
 /* Use the slow path for cross page reads and writes. */
2524 if (nr_pages_needed == 1)
2525 ghc->hva += offset;
2526 else
2527 ghc->memslot = NULL;
2528
2529 ghc->gpa = gpa;
2530 ghc->len = len;
2531 return 0;
2532}
2533
2534int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2535 gpa_t gpa, unsigned long len)
2536{
2537 struct kvm_memslots *slots = kvm_memslots(kvm);
2538 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2539}
2540EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
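
/*
 * Usage sketch (hypothetical caller; 'gpa' and the error handling are
 * placeholders, not taken from this file): the cache is typically
 * initialized once, when the guest registers a shared page, and the
 * *_cached() accessors below are used on the hot path.  They revalidate
 * automatically when the memslot generation changes:
 *
 *	struct gfn_to_hva_cache ghc;
 *	u32 val = 1;
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(val)))
 *		return -EFAULT;
 *	if (kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val)))
 *		return -EFAULT;
 *
 * The first call fails if gpa is not covered by a memslot; the second falls
 * back to kvm_write_guest() when the cached region spans multiple pages.
 */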
2541
2542int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2543 void *data, unsigned int offset,
2544 unsigned long len)
2545{
2546 struct kvm_memslots *slots = kvm_memslots(kvm);
2547 int r;
2548 gpa_t gpa = ghc->gpa + offset;
2549
2550 BUG_ON(len + offset > ghc->len);
2551
2552 if (slots->generation != ghc->generation) {
2553 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2554 return -EFAULT;
2555 }
2556
2557 if (kvm_is_error_hva(ghc->hva))
2558 return -EFAULT;
2559
2560 if (unlikely(!ghc->memslot))
2561 return kvm_write_guest(kvm, gpa, data, len);
2562
2563 r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2564 if (r)
2565 return -EFAULT;
2566 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
2567
2568 return 0;
2569}
2570EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2571
2572int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2573 void *data, unsigned long len)
2574{
2575 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2576}
2577EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
2578
2579int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2580 void *data, unsigned int offset,
2581 unsigned long len)
2582{
2583 struct kvm_memslots *slots = kvm_memslots(kvm);
2584 int r;
2585 gpa_t gpa = ghc->gpa + offset;
2586
2587 BUG_ON(len + offset > ghc->len);
2588
2589 if (slots->generation != ghc->generation) {
2590 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2591 return -EFAULT;
2592 }
2593
2594 if (kvm_is_error_hva(ghc->hva))
2595 return -EFAULT;
2596
2597 if (unlikely(!ghc->memslot))
2598 return kvm_read_guest(kvm, gpa, data, len);
2599
2600 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
2601 if (r)
2602 return -EFAULT;
2603
2604 return 0;
2605}
2606EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2607
2608int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2609 void *data, unsigned long len)
2610{
2611 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
2612}
2613EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
2614
2615int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
2616{
2617 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
2618
2619 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
2620}
2621EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
2622
2623int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
2624{
2625 gfn_t gfn = gpa >> PAGE_SHIFT;
2626 int seg;
2627 int offset = offset_in_page(gpa);
2628 int ret;
2629
2630 while ((seg = next_segment(len, offset)) != 0) {
2631 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
2632 if (ret < 0)
2633 return ret;
2634 offset = 0;
2635 len -= seg;
2636 ++gfn;
2637 }
2638 return 0;
2639}
2640EXPORT_SYMBOL_GPL(kvm_clear_guest);
2641
2642static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
2643 gfn_t gfn)
2644{
2645 if (memslot && memslot->dirty_bitmap) {
2646 unsigned long rel_gfn = gfn - memslot->base_gfn;
2647
2648 set_bit_le(rel_gfn, memslot->dirty_bitmap);
2649 }
2650}
2651
2652void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
2653{
2654 struct kvm_memory_slot *memslot;
2655
2656 memslot = gfn_to_memslot(kvm, gfn);
2657 mark_page_dirty_in_slot(memslot, gfn);
2658}
2659EXPORT_SYMBOL_GPL(mark_page_dirty);
2660
2661void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
2662{
2663 struct kvm_memory_slot *memslot;
2664
2665 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2666 mark_page_dirty_in_slot(memslot, gfn);
2667}
2668EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
2669
2670void kvm_sigset_activate(struct kvm_vcpu *vcpu)
2671{
2672 if (!vcpu->sigset_active)
2673 return;
2674
 /*
 * This does a lockless modification of ->real_blocked, which is fine
 * because, only current can change ->real_blocked and all readers of
 * ->real_blocked don't care as long ->real_blocked is always a subset
 * of ->blocked.
 */
 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
2682}
2683
2684void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
2685{
2686 if (!vcpu->sigset_active)
2687 return;
2688
 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
 sigemptyset(&current->real_blocked);
2691}
2692
2693static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
2694{
2695 unsigned int old, val, grow, grow_start;
2696
2697 old = val = vcpu->halt_poll_ns;
2698 grow_start = READ_ONCE(halt_poll_ns_grow_start);
2699 grow = READ_ONCE(halt_poll_ns_grow);
2700 if (!grow)
2701 goto out;
2702
2703 val *= grow;
2704 if (val < grow_start)
2705 val = grow_start;
2706
2707 if (val > halt_poll_ns)
2708 val = halt_poll_ns;
2709
2710 vcpu->halt_poll_ns = val;
2711out:
2712 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
2713}
2714
2715static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
2716{
2717 unsigned int old, val, shrink;
2718
2719 old = val = vcpu->halt_poll_ns;
2720 shrink = READ_ONCE(halt_poll_ns_shrink);
2721 if (shrink == 0)
2722 val = 0;
2723 else
2724 val /= shrink;
2725
2726 vcpu->halt_poll_ns = val;
2727 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
2728}
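
/*
 * Worked example (using the module parameter defaults declared near the top
 * of this file): with halt_poll_ns_grow == 2 and halt_poll_ns_grow_start ==
 * 10000, a vcpu's poll window grows 0 -> 10000 -> 20000 -> 40000 ns and so
 * on, capped at halt_poll_ns.  With halt_poll_ns_shrink == 0 (the default),
 * one unsuccessful poll resets the window straight back to 0 instead of
 * dividing it.
 */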
2729
2730static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
2731{
2732 int ret = -EINTR;
2733 int idx = srcu_read_lock(&vcpu->kvm->srcu);
2734
2735 if (kvm_arch_vcpu_runnable(vcpu)) {
2736 kvm_make_request(KVM_REQ_UNHALT, vcpu);
2737 goto out;
2738 }
2739 if (kvm_cpu_has_pending_timer(vcpu))
2740 goto out;
2741 if (signal_pending(current))
2742 goto out;
2743
2744 ret = 0;
2745out:
2746 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2747 return ret;
2748}
2749
2750static inline void
2751update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
2752{
2753 if (waited)
2754 vcpu->stat.halt_poll_fail_ns += poll_ns;
2755 else
2756 vcpu->stat.halt_poll_success_ns += poll_ns;
2757}
2758
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
2762void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2763{
2764 ktime_t start, cur, poll_end;
2765 bool waited = false;
2766 u64 block_ns;
2767
2768 kvm_arch_vcpu_blocking(vcpu);
2769
2770 start = cur = poll_end = ktime_get();
2771 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
2772 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
2773
2774 ++vcpu->stat.halt_attempted_poll;
2775 do {
 /*
 * This sets KVM_REQ_UNHALT if an interrupt
 * arrives.
 */
2780 if (kvm_vcpu_check_block(vcpu) < 0) {
2781 ++vcpu->stat.halt_successful_poll;
2782 if (!vcpu_valid_wakeup(vcpu))
2783 ++vcpu->stat.halt_poll_invalid;
2784 goto out;
2785 }
2786 poll_end = cur = ktime_get();
2787 } while (single_task_running() && ktime_before(cur, stop));
2788 }
2789
2790 prepare_to_rcuwait(&vcpu->wait);
2791 for (;;) {
2792 set_current_state(TASK_INTERRUPTIBLE);
2793
2794 if (kvm_vcpu_check_block(vcpu) < 0)
2795 break;
2796
2797 waited = true;
2798 schedule();
2799 }
2800 finish_rcuwait(&vcpu->wait);
2801 cur = ktime_get();
2802out:
2803 kvm_arch_vcpu_unblocking(vcpu);
2804 block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
2805
2806 update_halt_poll_stats(
2807 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
2808
2809 if (!kvm_arch_no_poll(vcpu)) {
2810 if (!vcpu_valid_wakeup(vcpu)) {
2811 shrink_halt_poll_ns(vcpu);
2812 } else if (vcpu->kvm->max_halt_poll_ns) {
2813 if (block_ns <= vcpu->halt_poll_ns)
2814 ;
 /* we had a long block, shrink polling */
2816 else if (vcpu->halt_poll_ns &&
2817 block_ns > vcpu->kvm->max_halt_poll_ns)
2818 shrink_halt_poll_ns(vcpu);
 /* we had a short halt and our poll time is too small */
2820 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
2821 block_ns < vcpu->kvm->max_halt_poll_ns)
2822 grow_halt_poll_ns(vcpu);
2823 } else {
2824 vcpu->halt_poll_ns = 0;
2825 }
2826 }
2827
2828 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
2829 kvm_arch_vcpu_block_finish(vcpu);
2830}
2831EXPORT_SYMBOL_GPL(kvm_vcpu_block);
2832
2833bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
2834{
2835 struct rcuwait *waitp;
2836
2837 waitp = kvm_arch_vcpu_get_wait(vcpu);
2838 if (rcuwait_wake_up(waitp)) {
2839 WRITE_ONCE(vcpu->ready, true);
2840 ++vcpu->stat.halt_wakeup;
2841 return true;
2842 }
2843
2844 return false;
2845}
2846EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
2847
2848#ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 */
2852void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
2853{
2854 int me;
2855 int cpu = vcpu->cpu;
2856
2857 if (kvm_vcpu_wake_up(vcpu))
2858 return;
2859
2860 me = get_cpu();
2861 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
2862 if (kvm_arch_vcpu_should_kick(vcpu))
2863 smp_send_reschedule(cpu);
2864 put_cpu();
2865}
2866EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
2867#endif
2868
2869int kvm_vcpu_yield_to(struct kvm_vcpu *target)
2870{
2871 struct pid *pid;
2872 struct task_struct *task = NULL;
2873 int ret = 0;
2874
2875 rcu_read_lock();
2876 pid = rcu_dereference(target->pid);
2877 if (pid)
2878 task = get_pid_task(pid, PIDTYPE_PID);
2879 rcu_read_unlock();
2880 if (!task)
2881 return ret;
2882 ret = yield_to(task, 1);
2883 put_task_struct(task);
2884
2885 return ret;
2886}
2887EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
2888
/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * Most eligible candidate to yield is decided by following heuristics:
 *
 *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
 *  (preempted lock holder), indicated by @in_spin_loop.
 *  Set at the beginning and cleared at the end of interception/PLE handler.
 *
 *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
 *  chance last time (mostly it has become eligible now since we have probably
 *  yielded to lockholder in last iteration. This is done by toggling
 *  @dy_eligible each time a VCPU checked for eligibility.)
 *
 *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
 *  to preempted lock-holder could result in uneligible VCPU spinning in the
 *  guest context, avoiding swait_active() or sched_yield() mechanisms.
 */
2911static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
2912{
2913#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
2914 bool eligible;
2915
2916 eligible = !vcpu->spin_loop.in_spin_loop ||
2917 vcpu->spin_loop.dy_eligible;
2918
2919 if (vcpu->spin_loop.in_spin_loop)
2920 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
2921
2922 return eligible;
2923#else
2924 return true;
2925#endif
2926}
2927
/*
 * Unlike kvm_arch_vcpu_runnable, this function is called outside
 * a vcpu_load/vcpu_put pair.  However, for most architectures
 * kvm_arch_dy_runnable can handle that.
 */
2933bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
2934{
2935 return kvm_arch_vcpu_runnable(vcpu);
2936}
2937
2938static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
2939{
2940 if (kvm_arch_dy_runnable(vcpu))
2941 return true;
2942
2943#ifdef CONFIG_KVM_ASYNC_PF
2944 if (!list_empty_careful(&vcpu->async_pf.done))
2945 return true;
2946#endif
2947
2948 return false;
2949}
2950
2951void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
2952{
2953 struct kvm *kvm = me->kvm;
2954 struct kvm_vcpu *vcpu;
2955 int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
2956 int yielded = 0;
2957 int try = 3;
2958 int pass;
2959 int i;
2960
2961 kvm_vcpu_set_in_spin_loop(me, true);
2962
 /*
 * We boost the priority of a VCPU that is runnable but not
 * currently running, because it got preempted by something
 * else and called schedule in __vcpu_run.  Hopefully that
 * VCPU is holding the lock that we need and will release it.
 * We approximate round-robin by starting at the last boosted VCPU.
 */
2969 for (pass = 0; pass < 2 && !yielded && try; pass++) {
2970 kvm_for_each_vcpu(i, vcpu, kvm) {
2971 if (!pass && i <= last_boosted_vcpu) {
2972 i = last_boosted_vcpu;
2973 continue;
2974 } else if (pass && i > last_boosted_vcpu)
2975 break;
2976 if (!READ_ONCE(vcpu->ready))
2977 continue;
2978 if (vcpu == me)
2979 continue;
2980 if (rcuwait_active(&vcpu->wait) &&
2981 !vcpu_dy_runnable(vcpu))
2982 continue;
2983 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
2984 !kvm_arch_vcpu_in_kernel(vcpu))
2985 continue;
2986 if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
2987 continue;
2988
2989 yielded = kvm_vcpu_yield_to(vcpu);
2990 if (yielded > 0) {
2991 kvm->last_boosted_vcpu = i;
2992 break;
2993 } else if (yielded < 0) {
2994 try--;
2995 if (!try)
2996 break;
2997 }
2998 }
2999 }
3000 kvm_vcpu_set_in_spin_loop(me, false);
3001
 /* Ensure vcpu is not eligible during next spinloop */
3003 kvm_vcpu_set_dy_eligible(me, false);
3004}
3005EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3006
3007static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3008{
3009 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3010 struct page *page;
3011
3012 if (vmf->pgoff == 0)
3013 page = virt_to_page(vcpu->run);
3014#ifdef CONFIG_X86
3015 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3016 page = virt_to_page(vcpu->arch.pio_data);
3017#endif
3018#ifdef CONFIG_KVM_MMIO
3019 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3020 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3021#endif
3022 else
3023 return kvm_arch_vcpu_fault(vcpu, vmf);
3024 get_page(page);
3025 vmf->page = page;
3026 return 0;
3027}
3028
3029static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3030 .fault = kvm_vcpu_fault,
3031};
3032
3033static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3034{
3035 vma->vm_ops = &kvm_vcpu_vm_ops;
3036 return 0;
3037}
3038
3039static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3040{
3041 struct kvm_vcpu *vcpu = filp->private_data;
3042
3043 kvm_put_kvm(vcpu->kvm);
3044 return 0;
3045}
3046
3047static struct file_operations kvm_vcpu_fops = {
3048 .release = kvm_vcpu_release,
3049 .unlocked_ioctl = kvm_vcpu_ioctl,
3050 .mmap = kvm_vcpu_mmap,
3051 .llseek = noop_llseek,
3052 KVM_COMPAT(kvm_vcpu_compat_ioctl),
3053};
3054
/*
 * Allocates an inode for the vcpu.
 */
3058static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3059{
3060 char name[8 + 1 + ITOA_MAX_LEN + 1];
3061
3062 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3063 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3064}
3065
3066static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3067{
3068#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3069 struct dentry *debugfs_dentry;
3070 char dir_name[ITOA_MAX_LEN * 2];
3071
3072 if (!debugfs_initialized())
3073 return;
3074
3075 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3076 debugfs_dentry = debugfs_create_dir(dir_name,
3077 vcpu->kvm->debugfs_dentry);
3078
3079 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3080#endif
3081}
3082
/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
3086static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3087{
3088 int r;
3089 struct kvm_vcpu *vcpu;
3090 struct page *page;
3091
3092 if (id >= KVM_MAX_VCPU_ID)
3093 return -EINVAL;
3094
3095 mutex_lock(&kvm->lock);
3096 if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3097 mutex_unlock(&kvm->lock);
3098 return -EINVAL;
3099 }
3100
3101 kvm->created_vcpus++;
3102 mutex_unlock(&kvm->lock);
3103
3104 r = kvm_arch_vcpu_precreate(kvm, id);
3105 if (r)
3106 goto vcpu_decrement;
3107
3108 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3109 if (!vcpu) {
3110 r = -ENOMEM;
3111 goto vcpu_decrement;
3112 }
3113
3114 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3115 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3116 if (!page) {
3117 r = -ENOMEM;
3118 goto vcpu_free;
3119 }
3120 vcpu->run = page_address(page);
3121
3122 kvm_vcpu_init(vcpu, kvm, id);
3123
3124 r = kvm_arch_vcpu_create(vcpu);
3125 if (r)
3126 goto vcpu_free_run_page;
3127
3128 mutex_lock(&kvm->lock);
3129 if (kvm_get_vcpu_by_id(kvm, id)) {
3130 r = -EEXIST;
3131 goto unlock_vcpu_destroy;
3132 }
3133
3134 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3135 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3136
 /* Now it's all set up, let userspace reach it */
3138 kvm_get_kvm(kvm);
3139 r = create_vcpu_fd(vcpu);
3140 if (r < 0) {
3141 kvm_put_kvm_no_destroy(kvm);
3142 goto unlock_vcpu_destroy;
3143 }
3144
3145 kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3146
 /*
 * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
 * pointer before kvm->online_vcpu's incremented value.
 */
3151 smp_wmb();
3152 atomic_inc(&kvm->online_vcpus);
3153
3154 mutex_unlock(&kvm->lock);
3155 kvm_arch_vcpu_postcreate(vcpu);
3156 kvm_create_vcpu_debugfs(vcpu);
3157 return r;
3158
3159unlock_vcpu_destroy:
3160 mutex_unlock(&kvm->lock);
3161 kvm_arch_vcpu_destroy(vcpu);
3162vcpu_free_run_page:
3163 free_page((unsigned long)vcpu->run);
3164vcpu_free:
3165 kmem_cache_free(kvm_vcpu_cache, vcpu);
3166vcpu_decrement:
3167 mutex_lock(&kvm->lock);
3168 kvm->created_vcpus--;
3169 mutex_unlock(&kvm->lock);
3170 return r;
3171}
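
/*
 * Userspace view (illustrative sketch; error handling elided): the new vcpu
 * fd returned above exposes struct kvm_run via mmap, with the size obtained
 * from KVM_GET_VCPU_MMAP_SIZE on the /dev/kvm fd:
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 */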
3172
3173static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3174{
3175 if (sigset) {
3176 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3177 vcpu->sigset_active = 1;
3178 vcpu->sigset = *sigset;
3179 } else
3180 vcpu->sigset_active = 0;
3181 return 0;
3182}
3183
3184static long kvm_vcpu_ioctl(struct file *filp,
3185 unsigned int ioctl, unsigned long arg)
3186{
3187 struct kvm_vcpu *vcpu = filp->private_data;
3188 void __user *argp = (void __user *)arg;
3189 int r;
3190 struct kvm_fpu *fpu = NULL;
3191 struct kvm_sregs *kvm_sregs = NULL;
3192
3193 if (vcpu->kvm->mm != current->mm)
3194 return -EIO;
3195
3196 if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3197 return -EINVAL;
3198
 /*
 * Some architectures have vcpu ioctls that are asynchronous to vcpu
 * execution; mutex_lock() would break them.
 */
3203 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3204 if (r != -ENOIOCTLCMD)
3205 return r;
3206
3207 if (mutex_lock_killable(&vcpu->mutex))
3208 return -EINTR;
3209 switch (ioctl) {
3210 case KVM_RUN: {
3211 struct pid *oldpid;
3212 r = -EINVAL;
3213 if (arg)
3214 goto out;
3215 oldpid = rcu_access_pointer(vcpu->pid);
3216 if (unlikely(oldpid != task_pid(current))) {
 /* The thread running this VCPU changed. */
3218 struct pid *newpid;
3219
3220 r = kvm_arch_vcpu_run_pid_change(vcpu);
3221 if (r)
3222 break;
3223
3224 newpid = get_task_pid(current, PIDTYPE_PID);
3225 rcu_assign_pointer(vcpu->pid, newpid);
3226 if (oldpid)
3227 synchronize_rcu();
3228 put_pid(oldpid);
3229 }
3230 r = kvm_arch_vcpu_ioctl_run(vcpu);
3231 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3232 break;
3233 }
3234 case KVM_GET_REGS: {
3235 struct kvm_regs *kvm_regs;
3236
3237 r = -ENOMEM;
3238 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3239 if (!kvm_regs)
3240 goto out;
3241 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3242 if (r)
3243 goto out_free1;
3244 r = -EFAULT;
3245 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3246 goto out_free1;
3247 r = 0;
3248out_free1:
3249 kfree(kvm_regs);
3250 break;
3251 }
3252 case KVM_SET_REGS: {
3253 struct kvm_regs *kvm_regs;
3254
3255 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3256 if (IS_ERR(kvm_regs)) {
3257 r = PTR_ERR(kvm_regs);
3258 goto out;
3259 }
3260 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3261 kfree(kvm_regs);
3262 break;
3263 }
3264 case KVM_GET_SREGS: {
3265 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3266 GFP_KERNEL_ACCOUNT);
3267 r = -ENOMEM;
3268 if (!kvm_sregs)
3269 goto out;
3270 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3271 if (r)
3272 goto out;
3273 r = -EFAULT;
3274 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3275 goto out;
3276 r = 0;
3277 break;
3278 }
3279 case KVM_SET_SREGS: {
3280 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3281 if (IS_ERR(kvm_sregs)) {
3282 r = PTR_ERR(kvm_sregs);
3283 kvm_sregs = NULL;
3284 goto out;
3285 }
3286 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3287 break;
3288 }
3289 case KVM_GET_MP_STATE: {
3290 struct kvm_mp_state mp_state;
3291
3292 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3293 if (r)
3294 goto out;
3295 r = -EFAULT;
3296 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3297 goto out;
3298 r = 0;
3299 break;
3300 }
3301 case KVM_SET_MP_STATE: {
3302 struct kvm_mp_state mp_state;
3303
3304 r = -EFAULT;
3305 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3306 goto out;
3307 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3308 break;
3309 }
3310 case KVM_TRANSLATE: {
3311 struct kvm_translation tr;
3312
3313 r = -EFAULT;
3314 if (copy_from_user(&tr, argp, sizeof(tr)))
3315 goto out;
3316 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3317 if (r)
3318 goto out;
3319 r = -EFAULT;
3320 if (copy_to_user(argp, &tr, sizeof(tr)))
3321 goto out;
3322 r = 0;
3323 break;
3324 }
3325 case KVM_SET_GUEST_DEBUG: {
3326 struct kvm_guest_debug dbg;
3327
3328 r = -EFAULT;
3329 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3330 goto out;
3331 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3332 break;
3333 }
3334 case KVM_SET_SIGNAL_MASK: {
3335 struct kvm_signal_mask __user *sigmask_arg = argp;
3336 struct kvm_signal_mask kvm_sigmask;
3337 sigset_t sigset, *p;
3338
3339 p = NULL;
3340 if (argp) {
3341 r = -EFAULT;
3342 if (copy_from_user(&kvm_sigmask, argp,
3343 sizeof(kvm_sigmask)))
3344 goto out;
3345 r = -EINVAL;
3346 if (kvm_sigmask.len != sizeof(sigset))
3347 goto out;
3348 r = -EFAULT;
3349 if (copy_from_user(&sigset, sigmask_arg->sigset,
3350 sizeof(sigset)))
3351 goto out;
3352 p = &sigset;
3353 }
3354 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3355 break;
3356 }
3357 case KVM_GET_FPU: {
3358 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3359 r = -ENOMEM;
3360 if (!fpu)
3361 goto out;
3362 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3363 if (r)
3364 goto out;
3365 r = -EFAULT;
3366 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3367 goto out;
3368 r = 0;
3369 break;
3370 }
3371 case KVM_SET_FPU: {
3372 fpu = memdup_user(argp, sizeof(*fpu));
3373 if (IS_ERR(fpu)) {
3374 r = PTR_ERR(fpu);
3375 fpu = NULL;
3376 goto out;
3377 }
3378 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3379 break;
3380 }
3381 default:
3382 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3383 }
3384out:
3385 mutex_unlock(&vcpu->mutex);
3386 kfree(fpu);
3387 kfree(kvm_sregs);
3388 return r;
3389}
3390
3391#ifdef CONFIG_KVM_COMPAT
3392static long kvm_vcpu_compat_ioctl(struct file *filp,
3393 unsigned int ioctl, unsigned long arg)
3394{
3395 struct kvm_vcpu *vcpu = filp->private_data;
3396 void __user *argp = compat_ptr(arg);
3397 int r;
3398
3399 if (vcpu->kvm->mm != current->mm)
3400 return -EIO;
3401
3402 switch (ioctl) {
3403 case KVM_SET_SIGNAL_MASK: {
3404 struct kvm_signal_mask __user *sigmask_arg = argp;
3405 struct kvm_signal_mask kvm_sigmask;
3406 sigset_t sigset;
3407
3408 if (argp) {
3409 r = -EFAULT;
3410 if (copy_from_user(&kvm_sigmask, argp,
3411 sizeof(kvm_sigmask)))
3412 goto out;
3413 r = -EINVAL;
3414 if (kvm_sigmask.len != sizeof(compat_sigset_t))
3415 goto out;
3416 r = -EFAULT;
3417 if (get_compat_sigset(&sigset,
3418 (compat_sigset_t __user *)sigmask_arg->sigset))
3419 goto out;
3420 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3421 } else
3422 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3423 break;
3424 }
3425 default:
3426 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3427 }
3428
3429out:
3430 return r;
3431}
3432#endif
3433
3434static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3435{
3436 struct kvm_device *dev = filp->private_data;
3437
3438 if (dev->ops->mmap)
3439 return dev->ops->mmap(dev, vma);
3440
3441 return -ENODEV;
3442}
3443
3444static int kvm_device_ioctl_attr(struct kvm_device *dev,
3445 int (*accessor)(struct kvm_device *dev,
3446 struct kvm_device_attr *attr),
3447 unsigned long arg)
3448{
3449 struct kvm_device_attr attr;
3450
3451 if (!accessor)
3452 return -EPERM;
3453
3454 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3455 return -EFAULT;
3456
3457 return accessor(dev, &attr);
3458}
3459
3460static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3461 unsigned long arg)
3462{
3463 struct kvm_device *dev = filp->private_data;
3464
3465 if (dev->kvm->mm != current->mm)
3466 return -EIO;
3467
3468 switch (ioctl) {
3469 case KVM_SET_DEVICE_ATTR:
3470 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3471 case KVM_GET_DEVICE_ATTR:
3472 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3473 case KVM_HAS_DEVICE_ATTR:
3474 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3475 default:
3476 if (dev->ops->ioctl)
3477 return dev->ops->ioctl(dev, ioctl, arg);
3478
3479 return -ENOTTY;
3480 }
3481}
3482
3483static int kvm_device_release(struct inode *inode, struct file *filp)
3484{
3485 struct kvm_device *dev = filp->private_data;
3486 struct kvm *kvm = dev->kvm;
3487
3488 if (dev->ops->release) {
3489 mutex_lock(&kvm->lock);
3490 list_del(&dev->vm_node);
3491 dev->ops->release(dev);
3492 mutex_unlock(&kvm->lock);
3493 }
3494
3495 kvm_put_kvm(kvm);
3496 return 0;
3497}
3498
3499static const struct file_operations kvm_device_fops = {
3500 .unlocked_ioctl = kvm_device_ioctl,
3501 .release = kvm_device_release,
3502 KVM_COMPAT(kvm_device_ioctl),
3503 .mmap = kvm_device_mmap,
3504};
3505
3506struct kvm_device *kvm_device_from_filp(struct file *filp)
3507{
3508 if (filp->f_op != &kvm_device_fops)
3509 return NULL;
3510
3511 return filp->private_data;
3512}
3513
3514static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3515#ifdef CONFIG_KVM_MPIC
3516 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
3517 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
3518#endif
3519};
3520
3521int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
3522{
3523 if (type >= ARRAY_SIZE(kvm_device_ops_table))
3524 return -ENOSPC;
3525
3526 if (kvm_device_ops_table[type] != NULL)
3527 return -EEXIST;
3528
3529 kvm_device_ops_table[type] = ops;
3530 return 0;
3531}
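
/*
 * Usage sketch (hypothetical device; the 'my_dev_*' names are illustrative,
 * and real callers pass a fixed enum kvm_device_type value): a backend
 * registers a const ops table once at init time, after which userspace can
 * instantiate the device with KVM_CREATE_DEVICE:
 *
 *	static const struct kvm_device_ops my_dev_ops = {
 *		.name	  = "my-dev",
 *		.create	  = my_dev_create,
 *		.destroy  = my_dev_destroy,
 *		.set_attr = my_dev_set_attr,
 *	};
 *
 *	r = kvm_register_device_ops(&my_dev_ops, KVM_DEV_TYPE_MY_DEV);
 */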
3532
3533void kvm_unregister_device_ops(u32 type)
3534{
3535 if (kvm_device_ops_table[type] != NULL)
3536 kvm_device_ops_table[type] = NULL;
3537}
3538
3539static int kvm_ioctl_create_device(struct kvm *kvm,
3540 struct kvm_create_device *cd)
3541{
3542 const struct kvm_device_ops *ops = NULL;
3543 struct kvm_device *dev;
3544 bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
3545 int type;
3546 int ret;
3547
3548 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
3549 return -ENODEV;
3550
3551 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
3552 ops = kvm_device_ops_table[type];
3553 if (ops == NULL)
3554 return -ENODEV;
3555
3556 if (test)
3557 return 0;
3558
3559 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
3560 if (!dev)
3561 return -ENOMEM;
3562
3563 dev->ops = ops;
3564 dev->kvm = kvm;
3565
3566 mutex_lock(&kvm->lock);
3567 ret = ops->create(dev, type);
3568 if (ret < 0) {
3569 mutex_unlock(&kvm->lock);
3570 kfree(dev);
3571 return ret;
3572 }
3573 list_add(&dev->vm_node, &kvm->devices);
3574 mutex_unlock(&kvm->lock);
3575
3576 if (ops->init)
3577 ops->init(dev);
3578
3579 kvm_get_kvm(kvm);
3580 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
3581 if (ret < 0) {
3582 kvm_put_kvm_no_destroy(kvm);
3583 mutex_lock(&kvm->lock);
3584 list_del(&dev->vm_node);
3585 mutex_unlock(&kvm->lock);
3586 ops->destroy(dev);
3587 return ret;
3588 }
3589
3590 cd->fd = ret;
3591 return 0;
3592}
3593
3594static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
3595{
3596 switch (arg) {
3597 case KVM_CAP_USER_MEMORY:
3598 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
3599 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
3600 case KVM_CAP_INTERNAL_ERROR_DATA:
3601#ifdef CONFIG_HAVE_KVM_MSI
3602 case KVM_CAP_SIGNAL_MSI:
3603#endif
3604#ifdef CONFIG_HAVE_KVM_IRQFD
3605 case KVM_CAP_IRQFD:
3606 case KVM_CAP_IRQFD_RESAMPLE:
3607#endif
3608 case KVM_CAP_IOEVENTFD_ANY_LENGTH:
3609 case KVM_CAP_CHECK_EXTENSION_VM:
3610 case KVM_CAP_ENABLE_CAP_VM:
3611 case KVM_CAP_HALT_POLL:
3612 return 1;
3613#ifdef CONFIG_KVM_MMIO
3614 case KVM_CAP_COALESCED_MMIO:
3615 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3616 case KVM_CAP_COALESCED_PIO:
3617 return 1;
3618#endif
3619#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3620 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
3621 return KVM_DIRTY_LOG_MANUAL_CAPS;
3622#endif
3623#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3624 case KVM_CAP_IRQ_ROUTING:
3625 return KVM_MAX_IRQ_ROUTES;
3626#endif
3627#if KVM_ADDRESS_SPACE_NUM > 1
3628 case KVM_CAP_MULTI_ADDRESS_SPACE:
3629 return KVM_ADDRESS_SPACE_NUM;
3630#endif
3631 case KVM_CAP_NR_MEMSLOTS:
3632 return KVM_USER_MEM_SLOTS;
3633 default:
3634 break;
3635 }
3636 return kvm_vm_ioctl_check_extension(kvm, arg);
3637}
3638
3639int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
3640 struct kvm_enable_cap *cap)
3641{
3642 return -EINVAL;
3643}
3644
3645static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
3646 struct kvm_enable_cap *cap)
3647{
3648 switch (cap->cap) {
3649#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3650 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
3651 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
3652
3653 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
3654 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
3655
3656 if (cap->flags || (cap->args[0] & ~allowed_options))
3657 return -EINVAL;
3658 kvm->manual_dirty_log_protect = cap->args[0];
3659 return 0;
3660 }
3661#endif
3662 case KVM_CAP_HALT_POLL: {
3663 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
3664 return -EINVAL;
3665
3666 kvm->max_halt_poll_ns = cap->args[0];
3667 return 0;
3668 }
3669 default:
3670 return kvm_vm_ioctl_enable_cap(kvm, cap);
3671 }
3672}
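
/*
 * Userspace view (illustrative): KVM_CAP_HALT_POLL is enabled per VM via
 * KVM_ENABLE_CAP, with args[0] holding the new max poll time in ns; as
 * checked above, it must fit in an unsigned int and flags must be zero:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap	 = KVM_CAP_HALT_POLL,
 *		.args[0] = 100000,
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */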
3673
3674static long kvm_vm_ioctl(struct file *filp,
3675 unsigned int ioctl, unsigned long arg)
3676{
3677 struct kvm *kvm = filp->private_data;
3678 void __user *argp = (void __user *)arg;
3679 int r;
3680
3681 if (kvm->mm != current->mm)
3682 return -EIO;
3683 switch (ioctl) {
3684 case KVM_CREATE_VCPU:
3685 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3686 break;
3687 case KVM_ENABLE_CAP: {
3688 struct kvm_enable_cap cap;
3689
3690 r = -EFAULT;
3691 if (copy_from_user(&cap, argp, sizeof(cap)))
3692 goto out;
3693 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
3694 break;
3695 }
3696 case KVM_SET_USER_MEMORY_REGION: {
3697 struct kvm_userspace_memory_region kvm_userspace_mem;
3698
3699 r = -EFAULT;
3700 if (copy_from_user(&kvm_userspace_mem, argp,
3701 sizeof(kvm_userspace_mem)))
3702 goto out;
3703
3704 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
3705 break;
3706 }
3707 case KVM_GET_DIRTY_LOG: {
3708 struct kvm_dirty_log log;
3709
3710 r = -EFAULT;
3711 if (copy_from_user(&log, argp, sizeof(log)))
3712 goto out;
3713 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3714 break;
3715 }
3716#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
3717 case KVM_CLEAR_DIRTY_LOG: {
3718 struct kvm_clear_dirty_log log;
3719
3720 r = -EFAULT;
3721 if (copy_from_user(&log, argp, sizeof(log)))
3722 goto out;
3723 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
3724 break;
3725 }
3726#endif
3727#ifdef CONFIG_KVM_MMIO
3728 case KVM_REGISTER_COALESCED_MMIO: {
3729 struct kvm_coalesced_mmio_zone zone;
3730
3731 r = -EFAULT;
3732 if (copy_from_user(&zone, argp, sizeof(zone)))
3733 goto out;
3734 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
3735 break;
3736 }
3737 case KVM_UNREGISTER_COALESCED_MMIO: {
3738 struct kvm_coalesced_mmio_zone zone;
3739
3740 r = -EFAULT;
3741 if (copy_from_user(&zone, argp, sizeof(zone)))
3742 goto out;
3743 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
3744 break;
3745 }
3746#endif
3747 case KVM_IRQFD: {
3748 struct kvm_irqfd data;
3749
3750 r = -EFAULT;
3751 if (copy_from_user(&data, argp, sizeof(data)))
3752 goto out;
3753 r = kvm_irqfd(kvm, &data);
3754 break;
3755 }
3756 case KVM_IOEVENTFD: {
3757 struct kvm_ioeventfd data;
3758
3759 r = -EFAULT;
3760 if (copy_from_user(&data, argp, sizeof(data)))
3761 goto out;
3762 r = kvm_ioeventfd(kvm, &data);
3763 break;
3764 }
3765#ifdef CONFIG_HAVE_KVM_MSI
3766 case KVM_SIGNAL_MSI: {
3767 struct kvm_msi msi;
3768
3769 r = -EFAULT;
3770 if (copy_from_user(&msi, argp, sizeof(msi)))
3771 goto out;
3772 r = kvm_send_userspace_msi(kvm, &msi);
3773 break;
3774 }
3775#endif
3776#ifdef __KVM_HAVE_IRQ_LINE
3777 case KVM_IRQ_LINE_STATUS:
3778 case KVM_IRQ_LINE: {
3779 struct kvm_irq_level irq_event;
3780
3781 r = -EFAULT;
3782 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
3783 goto out;
3784
3785 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
3786 ioctl == KVM_IRQ_LINE_STATUS);
3787 if (r)
3788 goto out;
3789
3790 r = -EFAULT;
3791 if (ioctl == KVM_IRQ_LINE_STATUS) {
3792 if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
3793 goto out;
3794 }
3795
3796 r = 0;
3797 break;
3798 }
3799#endif
3800#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3801 case KVM_SET_GSI_ROUTING: {
3802 struct kvm_irq_routing routing;
3803 struct kvm_irq_routing __user *urouting;
3804 struct kvm_irq_routing_entry *entries = NULL;
3805
3806 r = -EFAULT;
3807 if (copy_from_user(&routing, argp, sizeof(routing)))
3808 goto out;
3809 r = -EINVAL;
3810 if (!kvm_arch_can_set_irq_routing(kvm))
3811 goto out;
3812 if (routing.nr > KVM_MAX_IRQ_ROUTES)
3813 goto out;
3814 if (routing.flags)
3815 goto out;
3816 if (routing.nr) {
3817 urouting = argp;
3818 entries = vmemdup_user(urouting->entries,
3819 array_size(sizeof(*entries),
3820 routing.nr));
3821 if (IS_ERR(entries)) {
3822 r = PTR_ERR(entries);
3823 goto out;
3824 }
3825 }
3826 r = kvm_set_irq_routing(kvm, entries, routing.nr,
3827 routing.flags);
3828 kvfree(entries);
3829 break;
3830 }
3831#endif
3832 case KVM_CREATE_DEVICE: {
3833 struct kvm_create_device cd;
3834
3835 r = -EFAULT;
3836 if (copy_from_user(&cd, argp, sizeof(cd)))
3837 goto out;
3838
3839 r = kvm_ioctl_create_device(kvm, &cd);
3840 if (r)
3841 goto out;
3842
3843 r = -EFAULT;
3844 if (copy_to_user(argp, &cd, sizeof(cd)))
3845 goto out;
3846
3847 r = 0;
3848 break;
3849 }
3850 case KVM_CHECK_EXTENSION:
3851 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
3852 break;
3853 default:
3854 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
3855 }
3856out:
3857 return r;
3858}
3859
3860#ifdef CONFIG_KVM_COMPAT
3861struct compat_kvm_dirty_log {
3862 __u32 slot;
3863 __u32 padding1;
3864 union {
3865 compat_uptr_t dirty_bitmap;
3866 __u64 padding2;
3867 };
3868};
3869
3870static long kvm_vm_compat_ioctl(struct file *filp,
3871 unsigned int ioctl, unsigned long arg)
3872{
3873 struct kvm *kvm = filp->private_data;
3874 int r;
3875
3876 if (kvm->mm != current->mm)
3877 return -EIO;
3878 switch (ioctl) {
3879 case KVM_GET_DIRTY_LOG: {
3880 struct compat_kvm_dirty_log compat_log;
3881 struct kvm_dirty_log log;
3882
3883 if (copy_from_user(&compat_log, (void __user *)arg,
3884 sizeof(compat_log)))
3885 return -EFAULT;
3886 log.slot = compat_log.slot;
3887 log.padding1 = compat_log.padding1;
3888 log.padding2 = compat_log.padding2;
3889 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
3890
3891 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3892 break;
3893 }
3894 default:
3895 r = kvm_vm_ioctl(filp, ioctl, arg);
3896 }
3897 return r;
3898}
3899#endif
3900
3901static struct file_operations kvm_vm_fops = {
3902 .release = kvm_vm_release,
3903 .unlocked_ioctl = kvm_vm_ioctl,
3904 .llseek = noop_llseek,
3905 KVM_COMPAT(kvm_vm_compat_ioctl),
3906};
3907
3908static int kvm_dev_ioctl_create_vm(unsigned long type)
3909{
3910 int r;
3911 struct kvm *kvm;
3912 struct file *file;
3913
3914 kvm = kvm_create_vm(type);
3915 if (IS_ERR(kvm))
3916 return PTR_ERR(kvm);
3917#ifdef CONFIG_KVM_MMIO
3918 r = kvm_coalesced_mmio_init(kvm);
3919 if (r < 0)
3920 goto put_kvm;
3921#endif
3922 r = get_unused_fd_flags(O_CLOEXEC);
3923 if (r < 0)
3924 goto put_kvm;
3925
3926 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
3927 if (IS_ERR(file)) {
3928 put_unused_fd(r);
3929 r = PTR_ERR(file);
3930 goto put_kvm;
3931 }
3932
 /*
 * Don't call kvm_put_kvm anymore at this point; file->f_op is
 * already set, with ->release() being kvm_vm_release().  In error
 * cases it will be called by the final fput(file) and will take
 * care of doing kvm_put_kvm(kvm).
 */
3939 if (kvm_create_vm_debugfs(kvm, r) < 0) {
3940 put_unused_fd(r);
3941 fput(file);
3942 return -ENOMEM;
3943 }
3944 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
3945
3946 fd_install(r, file);
3947 return r;
3948
3949put_kvm:
3950 kvm_put_kvm(kvm);
3951 return r;
3952}
3953
3954static long kvm_dev_ioctl(struct file *filp,
3955 unsigned int ioctl, unsigned long arg)
3956{
3957 long r = -EINVAL;
3958
3959 switch (ioctl) {
3960 case KVM_GET_API_VERSION:
3961 if (arg)
3962 goto out;
3963 r = KVM_API_VERSION;
3964 break;
3965 case KVM_CREATE_VM:
3966 r = kvm_dev_ioctl_create_vm(arg);
3967 break;
3968 case KVM_CHECK_EXTENSION:
3969 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
3970 break;
3971 case KVM_GET_VCPU_MMAP_SIZE:
3972 if (arg)
3973 goto out;
3974 r = PAGE_SIZE;
3975#ifdef CONFIG_X86
3976 r += PAGE_SIZE;
3977#endif
3978#ifdef CONFIG_KVM_MMIO
3979 r += PAGE_SIZE;
3980#endif
3981 break;
3982 case KVM_TRACE_ENABLE:
3983 case KVM_TRACE_PAUSE:
3984 case KVM_TRACE_DISABLE:
3985 r = -EOPNOTSUPP;
3986 break;
3987 default:
3988 return kvm_arch_dev_ioctl(filp, ioctl, arg);
3989 }
3990out:
3991 return r;
3992}
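
/*
 * Userspace view (illustrative): the canonical bring-up sequence against
 * this handler validates the API version before creating a VM:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		errx(1, "unexpected KVM API version");
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 */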
3993
3994static struct file_operations kvm_chardev_ops = {
3995 .unlocked_ioctl = kvm_dev_ioctl,
3996 .llseek = noop_llseek,
3997 KVM_COMPAT(kvm_dev_ioctl),
3998};
3999
4000static struct miscdevice kvm_dev = {
4001 KVM_MINOR,
4002 "kvm",
4003 &kvm_chardev_ops,
4004};
4005
4006static void hardware_enable_nolock(void *junk)
4007{
4008 int cpu = raw_smp_processor_id();
4009 int r;
4010
4011 if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4012 return;
4013
4014 cpumask_set_cpu(cpu, cpus_hardware_enabled);
4015
4016 r = kvm_arch_hardware_enable();
4017
4018 if (r) {
4019 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4020 atomic_inc(&hardware_enable_failed);
4021 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4022 }
4023}
4024
4025static int kvm_starting_cpu(unsigned int cpu)
4026{
4027 raw_spin_lock(&kvm_count_lock);
4028 if (kvm_usage_count)
4029 hardware_enable_nolock(NULL);
4030 raw_spin_unlock(&kvm_count_lock);
4031 return 0;
4032}
4033
4034static void hardware_disable_nolock(void *junk)
4035{
4036 int cpu = raw_smp_processor_id();
4037
4038 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4039 return;
4040 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4041 kvm_arch_hardware_disable();
4042}
4043
4044static int kvm_dying_cpu(unsigned int cpu)
4045{
4046 raw_spin_lock(&kvm_count_lock);
4047 if (kvm_usage_count)
4048 hardware_disable_nolock(NULL);
4049 raw_spin_unlock(&kvm_count_lock);
4050 return 0;
4051}
4052
4053static void hardware_disable_all_nolock(void)
4054{
4055 BUG_ON(!kvm_usage_count);
4056
4057 kvm_usage_count--;
4058 if (!kvm_usage_count)
4059 on_each_cpu(hardware_disable_nolock, NULL, 1);
4060}
4061
4062static void hardware_disable_all(void)
4063{
4064 raw_spin_lock(&kvm_count_lock);
4065 hardware_disable_all_nolock();
4066 raw_spin_unlock(&kvm_count_lock);
4067}
4068
4069static int hardware_enable_all(void)
4070{
4071 int r = 0;
4072
4073 raw_spin_lock(&kvm_count_lock);
4074
4075 kvm_usage_count++;
4076 if (kvm_usage_count == 1) {
4077 atomic_set(&hardware_enable_failed, 0);
4078 on_each_cpu(hardware_enable_nolock, NULL, 1);
4079
4080 if (atomic_read(&hardware_enable_failed)) {
4081 hardware_disable_all_nolock();
4082 r = -EBUSY;
4083 }
4084 }
4085
4086 raw_spin_unlock(&kvm_count_lock);
4087
4088 return r;
4089}
4090
4091static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4092 void *v)
4093{
 /*
 * Some (well, at least mine) BIOSes hang on reboot if
 * in vmx root mode.
 *
 * And Intel TXT required VMX off for all cpu when system shutdown.
 */
4100 pr_info("kvm: exiting hardware virtualization\n");
4101 kvm_rebooting = true;
4102 on_each_cpu(hardware_disable_nolock, NULL, 1);
4103 return NOTIFY_OK;
4104}
4105
4106static struct notifier_block kvm_reboot_notifier = {
4107 .notifier_call = kvm_reboot,
4108 .priority = 0,
4109};
4110
4111static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4112{
4113 int i;
4114
4115 for (i = 0; i < bus->dev_count; i++) {
4116 struct kvm_io_device *pos = bus->range[i].dev;
4117
4118 kvm_iodevice_destructor(pos);
4119 }
4120 kfree(bus);
4121}
4122
4123static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4124 const struct kvm_io_range *r2)
4125{
4126 gpa_t addr1 = r1->addr;
4127 gpa_t addr2 = r2->addr;
4128
4129 if (addr1 < addr2)
4130 return -1;
4131
 /* If r2->len == 0, match the exact address.  If r2->len != 0,
 * accept any overlapping write.  Any order is acceptable for
 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
 * we process all of them.
 */
4137 if (r2->len) {
4138 addr1 += r1->len;
4139 addr2 += r2->len;
4140 }
4141
4142 if (addr1 > addr2)
4143 return 1;
4144
4145 return 0;
4146}
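
/*
 * Concrete example (illustrative): for a registered range {addr = 0x100,
 * len = 8}, a search key {addr = 0x104, len = 4} compares equal because the
 * key starts inside the range and ends at or before its end (0x108 <= 0x108).
 * A zero-length key such as {addr = 0x104, len = 0} also matches, while
 * {addr = 0x0fc, len = 8} sorts before the range and does not.
 */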
4147
4148static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4149{
4150 return kvm_io_bus_cmp(p1, p2);
4151}
4152
4153static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4154 gpa_t addr, int len)
4155{
4156 struct kvm_io_range *range, key;
4157 int off;
4158
4159 key = (struct kvm_io_range) {
4160 .addr = addr,
4161 .len = len,
4162 };
4163
4164 range = bsearch(&key, bus->range, bus->dev_count,
4165 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4166 if (range == NULL)
4167 return -ENOENT;
4168
4169 off = range - bus->range;
4170
4171 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4172 off--;
4173
4174 return off;
4175}
4176
4177static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4178 struct kvm_io_range *range, const void *val)
4179{
4180 int idx;
4181
4182 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4183 if (idx < 0)
4184 return -EOPNOTSUPP;
4185
4186 while (idx < bus->dev_count &&
4187 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4188 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4189 range->len, val))
4190 return idx;
4191 idx++;
4192 }
4193
4194 return -EOPNOTSUPP;
4195}
4196
/* kvm_io_bus_write - called under kvm->slots_lock */
4198int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4199 int len, const void *val)
4200{
4201 struct kvm_io_bus *bus;
4202 struct kvm_io_range range;
4203 int r;
4204
4205 range = (struct kvm_io_range) {
4206 .addr = addr,
4207 .len = len,
4208 };
4209
4210 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4211 if (!bus)
4212 return -ENOMEM;
4213 r = __kvm_io_bus_write(vcpu, bus, &range, val);
4214 return r < 0 ? r : 0;
4215}
4216EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4217
/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
4219int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4220 gpa_t addr, int len, const void *val, long cookie)
4221{
4222 struct kvm_io_bus *bus;
4223 struct kvm_io_range range;
4224
4225 range = (struct kvm_io_range) {
4226 .addr = addr,
4227 .len = len,
4228 };
4229
4230 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4231 if (!bus)
4232 return -ENOMEM;
4233
 /* First try the device referenced by cookie. */
4235 if ((cookie >= 0) && (cookie < bus->dev_count) &&
4236 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4237 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4238 val))
4239 return cookie;
4240
 /*
 * cookie contained garbage; fall back to search and return the
 * correct dev.
 */
4245 return __kvm_io_bus_write(vcpu, bus, &range, val);
4246}
4247
4248static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4249 struct kvm_io_range *range, void *val)
4250{
4251 int idx;
4252
4253 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4254 if (idx < 0)
4255 return -EOPNOTSUPP;
4256
4257 while (idx < bus->dev_count &&
4258 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4259 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4260 range->len, val))
4261 return idx;
4262 idx++;
4263 }
4264
4265 return -EOPNOTSUPP;
4266}
4267
/* kvm_io_bus_read - called under kvm->slots_lock */
4269int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4270 int len, void *val)
4271{
4272 struct kvm_io_bus *bus;
4273 struct kvm_io_range range;
4274 int r;
4275
4276 range = (struct kvm_io_range) {
4277 .addr = addr,
4278 .len = len,
4279 };
4280
4281 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4282 if (!bus)
4283 return -ENOMEM;
4284 r = __kvm_io_bus_read(vcpu, bus, &range, val);
4285 return r < 0 ? r : 0;
4286}
4287
/* Caller must hold slots_lock. */
4289int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4290 int len, struct kvm_io_device *dev)
4291{
4292 int i;
4293 struct kvm_io_bus *new_bus, *bus;
4294 struct kvm_io_range range;
4295
4296 bus = kvm_get_bus(kvm, bus_idx);
4297 if (!bus)
4298 return -ENOMEM;
4299
 /* exclude ioeventfd which is limited by maximum fd */
4301 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
4302 return -ENOSPC;
4303
4304 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4305 GFP_KERNEL_ACCOUNT);
4306 if (!new_bus)
4307 return -ENOMEM;
4308
4309 range = (struct kvm_io_range) {
4310 .addr = addr,
4311 .len = len,
4312 .dev = dev,
4313 };
4314
4315 for (i = 0; i < bus->dev_count; i++)
4316 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
4317 break;
4318
4319 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4320 new_bus->dev_count++;
4321 new_bus->range[i] = range;
4322 memcpy(new_bus->range + i + 1, bus->range + i,
4323 (bus->dev_count - i) * sizeof(struct kvm_io_range));
4324 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4325 synchronize_srcu_expedited(&kvm->srcu);
4326 kfree(bus);
4327
4328 return 0;
4329}
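
/*
 * Design note with a minimal sketch (steps abbreviated): a bus is never
 * modified in place.  Under kvm->slots_lock, writers build a new copy of the
 * array, publish it with rcu_assign_pointer() and wait out readers with
 * synchronize_srcu_expedited() before freeing the old copy, which lets
 * kvm_io_bus_write()/kvm_io_bus_read() run locklessly under SRCU:
 *
 *	new_bus = kmalloc(...);
 *	... copy the old entries, inserting or removing one ...
 *	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 *	synchronize_srcu_expedited(&kvm->srcu);
 *	kfree(bus);
 */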
4330
/* Caller must hold slots_lock. */
4332void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4333 struct kvm_io_device *dev)
4334{
4335 int i, j;
4336 struct kvm_io_bus *new_bus, *bus;
4337
4338 bus = kvm_get_bus(kvm, bus_idx);
4339 if (!bus)
4340 return;
4341
4342 for (i = 0; i < bus->dev_count; i++)
4343 if (bus->range[i].dev == dev) {
4344 break;
4345 }
4346
4347 if (i == bus->dev_count)
4348 return;
4349
4350 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
4351 GFP_KERNEL_ACCOUNT);
4352 if (new_bus) {
4353 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4354 new_bus->dev_count--;
4355 memcpy(new_bus->range + i, bus->range + i + 1,
4356 (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
4357 } else {
4358 pr_err("kvm: failed to shrink bus, removing it completely\n");
4359 for (j = 0; j < bus->dev_count; j++) {
4360 if (j == i)
4361 continue;
4362 kvm_iodevice_destructor(bus->range[j].dev);
4363 }
4364 }
4365
4366 rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4367 synchronize_srcu_expedited(&kvm->srcu);
4368 kfree(bus);
4369 return;
4370}
4371
4372struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4373 gpa_t addr)
4374{
4375 struct kvm_io_bus *bus;
4376 int dev_idx, srcu_idx;
4377 struct kvm_io_device *iodev = NULL;
4378
4379 srcu_idx = srcu_read_lock(&kvm->srcu);
4380
4381 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
4382 if (!bus)
4383 goto out_unlock;
4384
4385 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
4386 if (dev_idx < 0)
4387 goto out_unlock;
4388
4389 iodev = bus->range[dev_idx].dev;
4390
4391out_unlock:
4392 srcu_read_unlock(&kvm->srcu, srcu_idx);
4393
4394 return iodev;
4395}
4396EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
4397
4398static int kvm_debugfs_open(struct inode *inode, struct file *file,
4399 int (*get)(void *, u64 *), int (*set)(void *, u64),
4400 const char *fmt)
4401{
4402 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4403 inode->i_private;
4404
 /*
 * The debugfs files are a reference to the kvm struct which
 * is still valid when kvm_destroy_vm is called.
 * To avoid the race between open and the removal of the debugfs
 * directory we test against the users count.
 */
4410 if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
4411 return -ENOENT;
4412
4413 if (simple_attr_open(inode, file, get,
4414 KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
4415 ? set : NULL,
4416 fmt)) {
4417 kvm_put_kvm(stat_data->kvm);
4418 return -ENOMEM;
4419 }
4420
4421 return 0;
4422}
4423
4424static int kvm_debugfs_release(struct inode *inode, struct file *file)
4425{
4426 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
4427 inode->i_private;
4428
4429 simple_attr_release(inode, file);
4430 kvm_put_kvm(stat_data->kvm);
4431
4432 return 0;
4433}
4434
4435static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
4436{
4437 *val = *(ulong *)((void *)kvm + offset);
4438
4439 return 0;
4440}
4441
4442static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
4443{
4444 *(ulong *)((void *)kvm + offset) = 0;
4445
4446 return 0;
4447}
4448
4449static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
4450{
4451 int i;
4452 struct kvm_vcpu *vcpu;
4453
4454 *val = 0;
4455
4456 kvm_for_each_vcpu(i, vcpu, kvm)
4457 *val += *(u64 *)((void *)vcpu + offset);
4458
4459 return 0;
4460}
4461
4462static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
4463{
4464 int i;
4465 struct kvm_vcpu *vcpu;
4466
4467 kvm_for_each_vcpu(i, vcpu, kvm)
4468 *(u64 *)((void *)vcpu + offset) = 0;
4469
4470 return 0;
4471}
4472
4473static int kvm_stat_data_get(void *data, u64 *val)
4474{
4475 int r = -EFAULT;
4476 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4477
4478 switch (stat_data->dbgfs_item->kind) {
4479 case KVM_STAT_VM:
4480 r = kvm_get_stat_per_vm(stat_data->kvm,
4481 stat_data->dbgfs_item->offset, val);
4482 break;
4483 case KVM_STAT_VCPU:
4484 r = kvm_get_stat_per_vcpu(stat_data->kvm,
4485 stat_data->dbgfs_item->offset, val);
4486 break;
4487 }
4488
4489 return r;
4490}
4491
4492static int kvm_stat_data_clear(void *data, u64 val)
4493{
4494 int r = -EFAULT;
4495 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4496
4497 if (val)
4498 return -EINVAL;
4499
4500 switch (stat_data->dbgfs_item->kind) {
4501 case KVM_STAT_VM:
4502 r = kvm_clear_stat_per_vm(stat_data->kvm,
4503 stat_data->dbgfs_item->offset);
4504 break;
4505 case KVM_STAT_VCPU:
4506 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
4507 stat_data->dbgfs_item->offset);
4508 break;
4509 }
4510
4511 return r;
4512}
4513
4514static int kvm_stat_data_open(struct inode *inode, struct file *file)
4515{
4516 __simple_attr_check_format("%llu\n", 0ull);
4517 return kvm_debugfs_open(inode, file, kvm_stat_data_get,
4518 kvm_stat_data_clear, "%llu\n");
4519}
4520
4521static const struct file_operations stat_fops_per_vm = {
4522 .owner = THIS_MODULE,
4523 .open = kvm_stat_data_open,
4524 .release = kvm_debugfs_release,
4525 .read = simple_attr_read,
4526 .write = simple_attr_write,
4527 .llseek = no_llseek,
4528};
4529
4530static int vm_stat_get(void *_offset, u64 *val)
4531{
4532 unsigned offset = (long)_offset;
4533 struct kvm *kvm;
4534 u64 tmp_val;
4535
4536 *val = 0;
4537 mutex_lock(&kvm_lock);
4538 list_for_each_entry(kvm, &vm_list, vm_list) {
4539 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
4540 *val += tmp_val;
4541 }
4542 mutex_unlock(&kvm_lock);
4543 return 0;
4544}
4545
4546static int vm_stat_clear(void *_offset, u64 val)
4547{
4548 unsigned offset = (long)_offset;
4549 struct kvm *kvm;
4550
4551 if (val)
4552 return -EINVAL;
4553
4554 mutex_lock(&kvm_lock);
4555 list_for_each_entry(kvm, &vm_list, vm_list) {
4556 kvm_clear_stat_per_vm(kvm, offset);
4557 }
4558 mutex_unlock(&kvm_lock);
4559
4560 return 0;
4561}
4562
4563DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
4564
4565static int vcpu_stat_get(void *_offset, u64 *val)
4566{
4567 unsigned offset = (long)_offset;
4568 struct kvm *kvm;
4569 u64 tmp_val;
4570
4571 *val = 0;
4572 mutex_lock(&kvm_lock);
4573 list_for_each_entry(kvm, &vm_list, vm_list) {
4574 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
4575 *val += tmp_val;
4576 }
4577 mutex_unlock(&kvm_lock);
4578 return 0;
4579}
4580
4581static int vcpu_stat_clear(void *_offset, u64 val)
4582{
4583 unsigned offset = (long)_offset;
4584 struct kvm *kvm;
4585
4586 if (val)
4587 return -EINVAL;
4588
4589 mutex_lock(&kvm_lock);
4590 list_for_each_entry(kvm, &vm_list, vm_list) {
4591 kvm_clear_stat_per_vcpu(kvm, offset);
4592 }
4593 mutex_unlock(&kvm_lock);
4594
4595 return 0;
4596}
4597
4598DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
4599 "%llu\n");
4600
4601static const struct file_operations *stat_fops[] = {
4602 [KVM_STAT_VCPU] = &vcpu_stat_fops,
4603 [KVM_STAT_VM] = &vm_stat_fops,
4604};
4605
4606static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
4607{
4608 struct kobj_uevent_env *env;
4609 unsigned long long created, active;
4610
4611 if (!kvm_dev.this_device || !kvm)
4612 return;
4613
4614 mutex_lock(&kvm_lock);
4615 if (type == KVM_EVENT_CREATE_VM) {
4616 kvm_createvm_count++;
4617 kvm_active_vms++;
4618 } else if (type == KVM_EVENT_DESTROY_VM) {
4619 kvm_active_vms--;
4620 }
4621 created = kvm_createvm_count;
4622 active = kvm_active_vms;
4623 mutex_unlock(&kvm_lock);
4624
4625 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
4626 if (!env)
4627 return;
4628
4629 add_uevent_var(env, "CREATED=%llu", created);
4630 add_uevent_var(env, "COUNT=%llu", active);
4631
4632 if (type == KVM_EVENT_CREATE_VM) {
4633 add_uevent_var(env, "EVENT=create");
4634 kvm->userspace_pid = task_pid_nr(current);
4635 } else if (type == KVM_EVENT_DESTROY_VM) {
4636 add_uevent_var(env, "EVENT=destroy");
4637 }
4638 add_uevent_var(env, "PID=%d", kvm->userspace_pid);
4639
4640 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4641 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
4642
4643 if (p) {
4644 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
4645 if (!IS_ERR(tmp))
4646 add_uevent_var(env, "STATS_PATH=%s", tmp);
4647 kfree(p);
4648 }
4649 }
4650
4651 env->envp[env->envp_idx++] = NULL;
4652 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
4653 kfree(env);
4654}
4655
4656static void kvm_init_debug(void)
4657{
4658 struct kvm_stats_debugfs_item *p;
4659
4660 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4661
4662 kvm_debugfs_num_entries = 0;
4663 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4664 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
4665 kvm_debugfs_dir, (void *)(long)p->offset,
4666 stat_fops[p->kind]);
4667 }
4668}
4669
4670static int kvm_suspend(void)
4671{
4672 if (kvm_usage_count)
4673 hardware_disable_nolock(NULL);
4674 return 0;
4675}
4676
4677static void kvm_resume(void)
4678{
4679 if (kvm_usage_count) {
4680#ifdef CONFIG_LOCKDEP
4681 WARN_ON(lockdep_is_held(&kvm_count_lock));
4682#endif
4683 hardware_enable_nolock(NULL);
4684 }
4685}
4686
4687static struct syscore_ops kvm_syscore_ops = {
4688 .suspend = kvm_suspend,
4689 .resume = kvm_resume,
4690};
4691
4692static inline
4693struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
4694{
4695 return container_of(pn, struct kvm_vcpu, preempt_notifier);
4696}
4697
4698static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
4699{
4700 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
4701
4702 WRITE_ONCE(vcpu->preempted, false);
4703 WRITE_ONCE(vcpu->ready, false);
4704
4705 __this_cpu_write(kvm_running_vcpu, vcpu);
4706 kvm_arch_sched_in(vcpu, cpu);
4707 kvm_arch_vcpu_load(vcpu, cpu);
4708}
4709
4710static void kvm_sched_out(struct preempt_notifier *pn,
4711 struct task_struct *next)
4712{
4713 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
4714
4715 if (current->state == TASK_RUNNING) {
4716 WRITE_ONCE(vcpu->preempted, true);
4717 WRITE_ONCE(vcpu->ready, true);
4718 }
4719 kvm_arch_vcpu_put(vcpu);
4720 __this_cpu_write(kvm_running_vcpu, NULL);
4721}
4722
/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * We can disable preemption locally around accessing the per-CPU variable,
 * and use the resolved vcpu pointer after enabling preemption again,
 * because even if the current thread is migrated to another CPU, reading
 * the per-CPU value later will give us the same value as we update the
 * per-CPU variable in the preempt notifier handlers.
 */
4732struct kvm_vcpu *kvm_get_running_vcpu(void)
4733{
4734 struct kvm_vcpu *vcpu;
4735
4736 preempt_disable();
4737 vcpu = __this_cpu_read(kvm_running_vcpu);
4738 preempt_enable();
4739
4740 return vcpu;
4741}
4742EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
4743
/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
4747struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
4748{
4749 return &kvm_running_vcpu;
4750}
4751
4752struct kvm_cpu_compat_check {
4753 void *opaque;
4754 int *ret;
4755};
4756
4757static void check_processor_compat(void *data)
4758{
4759 struct kvm_cpu_compat_check *c = data;
4760
4761 *c->ret = kvm_arch_check_processor_compat(c->opaque);
4762}
4763
4764int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
4765 struct module *module)
4766{
4767 struct kvm_cpu_compat_check c;
4768 int r;
4769 int cpu;
4770
4771 r = kvm_arch_init(opaque);
4772 if (r)
4773 goto out_fail;
4774
 /*
 * kvm_arch_init makes sure there's at most one caller
 * for architectures that support multiple implementations,
 * like intel and amd on x86.
 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
 * conflicts in case kvm is already setup for another implementation.
 */
4782 r = kvm_irqfd_init();
4783 if (r)
4784 goto out_irqfd;
4785
4786 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
4787 r = -ENOMEM;
4788 goto out_free_0;
4789 }
4790
4791 r = kvm_arch_hardware_setup(opaque);
4792 if (r < 0)
4793 goto out_free_1;
4794
4795 c.ret = &r;
4796 c.opaque = opaque;
4797 for_each_online_cpu(cpu) {
4798 smp_call_function_single(cpu, check_processor_compat, &c, 1);
4799 if (r < 0)
4800 goto out_free_2;
4801 }
4802
4803 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
4804 kvm_starting_cpu, kvm_dying_cpu);
4805 if (r)
4806 goto out_free_2;
4807 register_reboot_notifier(&kvm_reboot_notifier);
4808
 /* A kmem cache lets us meet the alignment requirements of fx_save. */
4810 if (!vcpu_align)
4811 vcpu_align = __alignof__(struct kvm_vcpu);
4812 kvm_vcpu_cache =
4813 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
4814 SLAB_ACCOUNT,
4815 offsetof(struct kvm_vcpu, arch),
4816 sizeof_field(struct kvm_vcpu, arch),
4817 NULL);
4818 if (!kvm_vcpu_cache) {
4819 r = -ENOMEM;
4820 goto out_free_3;
4821 }
4822
4823 r = kvm_async_pf_init();
4824 if (r)
4825 goto out_free;
4826
4827 kvm_chardev_ops.owner = module;
4828 kvm_vm_fops.owner = module;
4829 kvm_vcpu_fops.owner = module;
4830
4831 r = misc_register(&kvm_dev);
4832 if (r) {
4833 pr_err("kvm: misc device register failed\n");
4834 goto out_unreg;
4835 }
4836
4837 register_syscore_ops(&kvm_syscore_ops);
4838
4839 kvm_preempt_ops.sched_in = kvm_sched_in;
4840 kvm_preempt_ops.sched_out = kvm_sched_out;
4841
4842 kvm_init_debug();
4843
4844 r = kvm_vfio_ops_init();
4845 WARN_ON(r);
4846
4847 return 0;
4848
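/* Error paths: unwind in reverse order of the setup sequence above. */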
out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
	kvm_arch_hardware_unsetup();
out_free_1:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

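/*
 * Called by the arch module's exit routine; tears down everything
 * kvm_init() set up, in reverse order, and disables hardware
 * virtualization on all CPUs.
 */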
void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

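/*
 * Handshake data for kvm_vm_create_worker_thread(): it lives on the
 * creator's stack and is only valid until init_done is completed, which
 * is why the worker copies out everything it needs first.
 */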
struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;
	struct completion init_done;
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;
};

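/*
 * Common kthread body for per-VM worker threads: attach to the parent's
 * cgroups, inherit its nice value, report the setup result, then park
 * until the spawner unparks us to run thread_fn.
 */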
static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond initialization
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is never supposed to return an error */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	init_context = NULL;

	if (err)
		return err;

	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}

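/*
 * Create a parked worker kthread for @kvm and return it via @thread_ptr;
 * the caller unparks it when it is ready for thread_fn to run.
 * Illustrative caller sketch (placeholder names, not defined here):
 *
 *	struct task_struct *thread;
 *	int err;
 *
 *	err = kvm_vm_create_worker_thread(kvm, my_worker_fn, 0,
 *					  "kvm-example-worker", &thread);
 *	if (!err)
 *		kthread_unpark(thread);
 */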
int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run is never supposed to return NULL */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}
