linux/virt/kvm/kvm_main.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   9 *
  10 * Authors:
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *
  14 * This work is licensed under the terms of the GNU GPL, version 2.  See
  15 * the COPYING file in the top-level directory.
  16 *
  17 */
  18
  19#include "iodev.h"
  20
  21#include <linux/kvm_host.h>
  22#include <linux/kvm.h>
  23#include <linux/module.h>
  24#include <linux/errno.h>
  25#include <linux/percpu.h>
  26#include <linux/mm.h>
  27#include <linux/miscdevice.h>
  28#include <linux/vmalloc.h>
  29#include <linux/reboot.h>
  30#include <linux/debugfs.h>
  31#include <linux/highmem.h>
  32#include <linux/file.h>
  33#include <linux/syscore_ops.h>
  34#include <linux/cpu.h>
  35#include <linux/sched.h>
  36#include <linux/cpumask.h>
  37#include <linux/smp.h>
  38#include <linux/anon_inodes.h>
  39#include <linux/profile.h>
  40#include <linux/kvm_para.h>
  41#include <linux/pagemap.h>
  42#include <linux/mman.h>
  43#include <linux/swap.h>
  44#include <linux/bitops.h>
  45#include <linux/spinlock.h>
  46#include <linux/compat.h>
  47#include <linux/srcu.h>
  48#include <linux/hugetlb.h>
  49#include <linux/slab.h>
  50#include <linux/sort.h>
  51#include <linux/bsearch.h>
  52
  53#include <asm/processor.h>
  54#include <asm/io.h>
  55#include <asm/uaccess.h>
  56#include <asm/pgtable.h>
  57
  58#include "coalesced_mmio.h"
  59#include "async_pf.h"
  60
  61#define CREATE_TRACE_POINTS
  62#include <trace/events/kvm.h>
  63
  64MODULE_AUTHOR("Qumranet");
  65MODULE_LICENSE("GPL");
  66
  67/*
  68 * Ordering of locks:
  69 *
  70 *              kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  71 */
  72
  73DEFINE_RAW_SPINLOCK(kvm_lock);
  74LIST_HEAD(vm_list);
  75
  76static cpumask_var_t cpus_hardware_enabled;
  77static int kvm_usage_count = 0;
  78static atomic_t hardware_enable_failed;
  79
  80struct kmem_cache *kvm_vcpu_cache;
  81EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
  82
  83static __read_mostly struct preempt_ops kvm_preempt_ops;
  84
  85struct dentry *kvm_debugfs_dir;
  86
  87static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
  88                           unsigned long arg);
  89#ifdef CONFIG_COMPAT
  90static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
  91                                  unsigned long arg);
  92#endif
  93static int hardware_enable_all(void);
  94static void hardware_disable_all(void);
  95
  96static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
  97
  98bool kvm_rebooting;
  99EXPORT_SYMBOL_GPL(kvm_rebooting);
 100
 101static bool largepages_enabled = true;
 102
 103static struct page *hwpoison_page;
 104static pfn_t hwpoison_pfn;
 105
 106struct page *fault_page;
 107pfn_t fault_pfn;
 108
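/*
 * A pfn is treated as MMIO if it has no struct page backing it, or if
 * its page is marked reserved.  Compound pages are rechecked with
 * PageTail in case the hugepage was split while we looked at the head.
 */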
 109inline int kvm_is_mmio_pfn(pfn_t pfn)
 110{
 111        if (pfn_valid(pfn)) {
 112                int reserved;
 113                struct page *tail = pfn_to_page(pfn);
 114                struct page *head = compound_trans_head(tail);
 115                reserved = PageReserved(head);
 116                if (head != tail) {
 117                        /*
 118                         * "head" is not a dangling pointer
 119                         * (compound_trans_head takes care of that)
  120                         * but the hugepage may have been split
 121                         * from under us (and we may not hold a
 122                         * reference count on the head page so it can
  123                         * be reused before we run PageReserved), so
  124                         * we have to check PageTail before returning
 125                         * what we just read.
 126                         */
 127                        smp_rmb();
 128                        if (PageTail(tail))
 129                                return reserved;
 130                }
 131                return PageReserved(tail);
 132        }
 133
 134        return true;
 135}
 136
 137/*
  138 * Switches to the specified vcpu, until a matching vcpu_put().
 139 */
 140void vcpu_load(struct kvm_vcpu *vcpu)
 141{
 142        int cpu;
 143
 144        mutex_lock(&vcpu->mutex);
 145        if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
 146                /* The thread running this VCPU changed. */
 147                struct pid *oldpid = vcpu->pid;
 148                struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
 149                rcu_assign_pointer(vcpu->pid, newpid);
 150                synchronize_rcu();
 151                put_pid(oldpid);
 152        }
 153        cpu = get_cpu();
 154        preempt_notifier_register(&vcpu->preempt_notifier);
 155        kvm_arch_vcpu_load(vcpu, cpu);
 156        put_cpu();
 157}
 158
 159void vcpu_put(struct kvm_vcpu *vcpu)
 160{
 161        preempt_disable();
 162        kvm_arch_vcpu_put(vcpu);
 163        preempt_notifier_unregister(&vcpu->preempt_notifier);
 164        preempt_enable();
 165        mutex_unlock(&vcpu->mutex);
 166}
 167
 168static void ack_flush(void *_completed)
 169{
 170}
 171
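/*
 * Post @req on every vcpu and IPI (via ack_flush) the remote cpus that
 * are currently running a vcpu in guest mode, so they notice the
 * request promptly.  Returns false only when no cpu needed an IPI.
 */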
 172static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 173{
 174        int i, cpu, me;
 175        cpumask_var_t cpus;
 176        bool called = true;
 177        struct kvm_vcpu *vcpu;
 178
 179        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 180
 181        me = get_cpu();
 182        kvm_for_each_vcpu(i, vcpu, kvm) {
 183                kvm_make_request(req, vcpu);
 184                cpu = vcpu->cpu;
 185
 186                /* Set ->requests bit before we read ->mode */
 187                smp_mb();
 188
 189                if (cpus != NULL && cpu != -1 && cpu != me &&
 190                      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
 191                        cpumask_set_cpu(cpu, cpus);
 192        }
 193        if (unlikely(cpus == NULL))
 194                smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
 195        else if (!cpumask_empty(cpus))
 196                smp_call_function_many(cpus, ack_flush, NULL, 1);
 197        else
 198                called = false;
 199        put_cpu();
 200        free_cpumask_var(cpus);
 201        return called;
 202}
 203
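/*
 * Request a TLB flush on all vcpus.  tlbs_dirty is sampled before the
 * request and only cleared if it did not change in the meantime, so a
 * flush that raced with new dirtying is not lost.
 */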
 204void kvm_flush_remote_tlbs(struct kvm *kvm)
 205{
 206        long dirty_count = kvm->tlbs_dirty;
 207
 208        smp_mb();
 209        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 210                ++kvm->stat.remote_tlb_flush;
 211        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 212}
 213
 214void kvm_reload_remote_mmus(struct kvm *kvm)
 215{
 216        make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 217}
 218
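/*
 * Common vcpu initialization: set up the mutex, wait queue and async_pf
 * state, allocate the kvm_run page that is mmapped by userspace, then
 * hand off to the architecture code.
 */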
 219int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 220{
 221        struct page *page;
 222        int r;
 223
 224        mutex_init(&vcpu->mutex);
 225        vcpu->cpu = -1;
 226        vcpu->kvm = kvm;
 227        vcpu->vcpu_id = id;
 228        vcpu->pid = NULL;
 229        init_waitqueue_head(&vcpu->wq);
 230        kvm_async_pf_vcpu_init(vcpu);
 231
 232        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 233        if (!page) {
 234                r = -ENOMEM;
 235                goto fail;
 236        }
 237        vcpu->run = page_address(page);
 238
 239        r = kvm_arch_vcpu_init(vcpu);
 240        if (r < 0)
 241                goto fail_free_run;
 242        return 0;
 243
 244fail_free_run:
 245        free_page((unsigned long)vcpu->run);
 246fail:
 247        return r;
 248}
 249EXPORT_SYMBOL_GPL(kvm_vcpu_init);
 250
 251void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 252{
 253        put_pid(vcpu->pid);
 254        kvm_arch_vcpu_uninit(vcpu);
 255        free_page((unsigned long)vcpu->run);
 256}
 257EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 258
 259#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 260static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 261{
 262        return container_of(mn, struct kvm, mmu_notifier);
 263}
 264
 265static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 266                                             struct mm_struct *mm,
 267                                             unsigned long address)
 268{
 269        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 270        int need_tlb_flush, idx;
 271
 272        /*
 273         * When ->invalidate_page runs, the linux pte has been zapped
 274         * already but the page is still allocated until
 275         * ->invalidate_page returns. So if we increase the sequence
 276         * here the kvm page fault will notice if the spte can't be
 277         * established because the page is going to be freed. If
 278         * instead the kvm page fault establishes the spte before
 279         * ->invalidate_page runs, kvm_unmap_hva will release it
 280         * before returning.
 281         *
  282         * The sequence increase only needs to be seen at spin_unlock
 283         * time, and not at spin_lock time.
 284         *
 285         * Increasing the sequence after the spin_unlock would be
 286         * unsafe because the kvm page fault could then establish the
 287         * pte after kvm_unmap_hva returned, without noticing the page
 288         * is going to be freed.
 289         */
 290        idx = srcu_read_lock(&kvm->srcu);
 291        spin_lock(&kvm->mmu_lock);
 292
 293        kvm->mmu_notifier_seq++;
 294        need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
  295        /* we have to flush the TLB before the pages can be freed */
 296        if (need_tlb_flush)
 297                kvm_flush_remote_tlbs(kvm);
 298
 299        spin_unlock(&kvm->mmu_lock);
 300        srcu_read_unlock(&kvm->srcu, idx);
 301}
 302
 303static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 304                                        struct mm_struct *mm,
 305                                        unsigned long address,
 306                                        pte_t pte)
 307{
 308        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 309        int idx;
 310
 311        idx = srcu_read_lock(&kvm->srcu);
 312        spin_lock(&kvm->mmu_lock);
 313        kvm->mmu_notifier_seq++;
 314        kvm_set_spte_hva(kvm, address, pte);
 315        spin_unlock(&kvm->mmu_lock);
 316        srcu_read_unlock(&kvm->srcu, idx);
 317}
 318
 319static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 320                                                    struct mm_struct *mm,
 321                                                    unsigned long start,
 322                                                    unsigned long end)
 323{
 324        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 325        int need_tlb_flush = 0, idx;
 326
 327        idx = srcu_read_lock(&kvm->srcu);
 328        spin_lock(&kvm->mmu_lock);
 329        /*
 330         * The count increase must become visible at unlock time as no
 331         * spte can be established without taking the mmu_lock and
 332         * count is also read inside the mmu_lock critical section.
 333         */
 334        kvm->mmu_notifier_count++;
 335        for (; start < end; start += PAGE_SIZE)
 336                need_tlb_flush |= kvm_unmap_hva(kvm, start);
 337        need_tlb_flush |= kvm->tlbs_dirty;
  338        /* we have to flush the TLB before the pages can be freed */
 339        if (need_tlb_flush)
 340                kvm_flush_remote_tlbs(kvm);
 341
 342        spin_unlock(&kvm->mmu_lock);
 343        srcu_read_unlock(&kvm->srcu, idx);
 344}
 345
 346static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 347                                                  struct mm_struct *mm,
 348                                                  unsigned long start,
 349                                                  unsigned long end)
 350{
 351        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 352
 353        spin_lock(&kvm->mmu_lock);
 354        /*
 355         * This sequence increase will notify the kvm page fault that
 356         * the page that is going to be mapped in the spte could have
 357         * been freed.
 358         */
 359        kvm->mmu_notifier_seq++;
 360        smp_wmb();
 361        /*
 362         * The above sequence increase must be visible before the
 363         * below count decrease, which is ensured by the smp_wmb above
 364         * in conjunction with the smp_rmb in mmu_notifier_retry().
 365         */
 366        kvm->mmu_notifier_count--;
 367        spin_unlock(&kvm->mmu_lock);
 368
 369        BUG_ON(kvm->mmu_notifier_count < 0);
 370}
 371
 372static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 373                                              struct mm_struct *mm,
 374                                              unsigned long address)
 375{
 376        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 377        int young, idx;
 378
 379        idx = srcu_read_lock(&kvm->srcu);
 380        spin_lock(&kvm->mmu_lock);
 381
 382        young = kvm_age_hva(kvm, address);
 383        if (young)
 384                kvm_flush_remote_tlbs(kvm);
 385
 386        spin_unlock(&kvm->mmu_lock);
 387        srcu_read_unlock(&kvm->srcu, idx);
 388
 389        return young;
 390}
 391
 392static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 393                                       struct mm_struct *mm,
 394                                       unsigned long address)
 395{
 396        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 397        int young, idx;
 398
 399        idx = srcu_read_lock(&kvm->srcu);
 400        spin_lock(&kvm->mmu_lock);
 401        young = kvm_test_age_hva(kvm, address);
 402        spin_unlock(&kvm->mmu_lock);
 403        srcu_read_unlock(&kvm->srcu, idx);
 404
 405        return young;
 406}
 407
 408static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 409                                     struct mm_struct *mm)
 410{
 411        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 412        int idx;
 413
 414        idx = srcu_read_lock(&kvm->srcu);
 415        kvm_arch_flush_shadow(kvm);
 416        srcu_read_unlock(&kvm->srcu, idx);
 417}
 418
 419static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 420        .invalidate_page        = kvm_mmu_notifier_invalidate_page,
 421        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 422        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 423        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 424        .test_young             = kvm_mmu_notifier_test_young,
 425        .change_pte             = kvm_mmu_notifier_change_pte,
 426        .release                = kvm_mmu_notifier_release,
 427};
 428
 429static int kvm_init_mmu_notifier(struct kvm *kvm)
 430{
 431        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 432        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 433}
 434
 435#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 436
 437static int kvm_init_mmu_notifier(struct kvm *kvm)
 438{
 439        return 0;
 440}
 441
 442#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 443
 444static void kvm_init_memslots_id(struct kvm *kvm)
 445{
 446        int i;
 447        struct kvm_memslots *slots = kvm->memslots;
 448
 449        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
 450                slots->id_to_index[i] = slots->memslots[i].id = i;
 451}
 452
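/*
 * Allocate and initialize a VM: architecture state, hardware
 * virtualization enablement, memslots, I/O buses, SRCU and the MMU
 * notifier, and finally link the new VM into vm_list.
 */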
 453static struct kvm *kvm_create_vm(unsigned long type)
 454{
 455        int r, i;
 456        struct kvm *kvm = kvm_arch_alloc_vm();
 457
 458        if (!kvm)
 459                return ERR_PTR(-ENOMEM);
 460
 461        r = kvm_arch_init_vm(kvm, type);
 462        if (r)
 463                goto out_err_nodisable;
 464
 465        r = hardware_enable_all();
 466        if (r)
 467                goto out_err_nodisable;
 468
 469#ifdef CONFIG_HAVE_KVM_IRQCHIP
 470        INIT_HLIST_HEAD(&kvm->mask_notifier_list);
 471        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 472#endif
 473
 474        r = -ENOMEM;
 475        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 476        if (!kvm->memslots)
 477                goto out_err_nosrcu;
 478        kvm_init_memslots_id(kvm);
 479        if (init_srcu_struct(&kvm->srcu))
 480                goto out_err_nosrcu;
 481        for (i = 0; i < KVM_NR_BUSES; i++) {
 482                kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 483                                        GFP_KERNEL);
 484                if (!kvm->buses[i])
 485                        goto out_err;
 486        }
 487
 488        spin_lock_init(&kvm->mmu_lock);
 489        kvm->mm = current->mm;
 490        atomic_inc(&kvm->mm->mm_count);
 491        kvm_eventfd_init(kvm);
 492        mutex_init(&kvm->lock);
 493        mutex_init(&kvm->irq_lock);
 494        mutex_init(&kvm->slots_lock);
 495        atomic_set(&kvm->users_count, 1);
 496
 497        r = kvm_init_mmu_notifier(kvm);
 498        if (r)
 499                goto out_err;
 500
 501        raw_spin_lock(&kvm_lock);
 502        list_add(&kvm->vm_list, &vm_list);
 503        raw_spin_unlock(&kvm_lock);
 504
 505        return kvm;
 506
 507out_err:
 508        cleanup_srcu_struct(&kvm->srcu);
 509out_err_nosrcu:
 510        hardware_disable_all();
 511out_err_nodisable:
 512        for (i = 0; i < KVM_NR_BUSES; i++)
 513                kfree(kvm->buses[i]);
 514        kfree(kvm->memslots);
 515        kvm_arch_free_vm(kvm);
 516        return ERR_PTR(r);
 517}
 518
 519static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 520{
 521        if (!memslot->dirty_bitmap)
 522                return;
 523
 524        if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
 525                vfree(memslot->dirty_bitmap_head);
 526        else
 527                kfree(memslot->dirty_bitmap_head);
 528
 529        memslot->dirty_bitmap = NULL;
 530        memslot->dirty_bitmap_head = NULL;
 531}
 532
 533/*
 534 * Free any memory in @free but not in @dont.
 535 */
 536static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 537                                  struct kvm_memory_slot *dont)
 538{
 539        if (!dont || free->rmap != dont->rmap)
 540                vfree(free->rmap);
 541
 542        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 543                kvm_destroy_dirty_bitmap(free);
 544
 545        kvm_arch_free_memslot(free, dont);
 546
 547        free->npages = 0;
 548        free->rmap = NULL;
 549}
 550
 551void kvm_free_physmem(struct kvm *kvm)
 552{
 553        struct kvm_memslots *slots = kvm->memslots;
 554        struct kvm_memory_slot *memslot;
 555
 556        kvm_for_each_memslot(memslot, slots)
 557                kvm_free_physmem_slot(memslot, NULL);
 558
 559        kfree(kvm->memslots);
 560}
 561
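/*
 * Tear down a VM once its last reference is dropped: unlink it from
 * vm_list, free irq routing, I/O buses and memslots, flush the shadow
 * MMU, and release the mm reference and hardware enablement taken at
 * creation time.
 */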
 562static void kvm_destroy_vm(struct kvm *kvm)
 563{
 564        int i;
 565        struct mm_struct *mm = kvm->mm;
 566
 567        kvm_arch_sync_events(kvm);
 568        raw_spin_lock(&kvm_lock);
 569        list_del(&kvm->vm_list);
 570        raw_spin_unlock(&kvm_lock);
 571        kvm_free_irq_routing(kvm);
 572        for (i = 0; i < KVM_NR_BUSES; i++)
 573                kvm_io_bus_destroy(kvm->buses[i]);
 574        kvm_coalesced_mmio_free(kvm);
 575#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 576        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 577#else
 578        kvm_arch_flush_shadow(kvm);
 579#endif
 580        kvm_arch_destroy_vm(kvm);
 581        kvm_free_physmem(kvm);
 582        cleanup_srcu_struct(&kvm->srcu);
 583        kvm_arch_free_vm(kvm);
 584        hardware_disable_all();
 585        mmdrop(mm);
 586}
 587
 588void kvm_get_kvm(struct kvm *kvm)
 589{
 590        atomic_inc(&kvm->users_count);
 591}
 592EXPORT_SYMBOL_GPL(kvm_get_kvm);
 593
 594void kvm_put_kvm(struct kvm *kvm)
 595{
 596        if (atomic_dec_and_test(&kvm->users_count))
 597                kvm_destroy_vm(kvm);
 598}
 599EXPORT_SYMBOL_GPL(kvm_put_kvm);
 600
 602static int kvm_vm_release(struct inode *inode, struct file *filp)
 603{
 604        struct kvm *kvm = filp->private_data;
 605
 606        kvm_irqfd_release(kvm);
 607
 608        kvm_put_kvm(kvm);
 609        return 0;
 610}
 611
 612/*
 613 * Allocation size is twice as large as the actual dirty bitmap size.
 614 * This makes it possible to do double buffering: see x86's
 615 * kvm_vm_ioctl_get_dirty_log().
 616 */
 617static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 618{
 619#ifndef CONFIG_S390
 620        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 621
 622        if (dirty_bytes > PAGE_SIZE)
 623                memslot->dirty_bitmap = vzalloc(dirty_bytes);
 624        else
 625                memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);
 626
 627        if (!memslot->dirty_bitmap)
 628                return -ENOMEM;
 629
 630        memslot->dirty_bitmap_head = memslot->dirty_bitmap;
 631        memslot->nr_dirty_pages = 0;
 632#endif /* !CONFIG_S390 */
 633        return 0;
 634}
 635
 636static int cmp_memslot(const void *slot1, const void *slot2)
 637{
 638        struct kvm_memory_slot *s1, *s2;
 639
 640        s1 = (struct kvm_memory_slot *)slot1;
 641        s2 = (struct kvm_memory_slot *)slot2;
 642
 643        if (s1->npages < s2->npages)
 644                return 1;
 645        if (s1->npages > s2->npages)
 646                return -1;
 647
 648        return 0;
 649}
 650
 651/*
  652 * Sort the memslots based on size, so that the larger slots
  653 * will get a better fit.
 654 */
 655static void sort_memslots(struct kvm_memslots *slots)
 656{
 657        int i;
 658
 659        sort(slots->memslots, KVM_MEM_SLOTS_NUM,
 660              sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
 661
 662        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
 663                slots->id_to_index[slots->memslots[i].id] = i;
 664}
 665
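/*
 * Install @new into the memslot array (re-sorting if its size changed)
 * and bump the generation so that stale cached slot lookups can be
 * detected.
 */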
 666void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
 667{
 668        if (new) {
 669                int id = new->id;
 670                struct kvm_memory_slot *old = id_to_memslot(slots, id);
 671                unsigned long npages = old->npages;
 672
 673                *old = *new;
 674                if (new->npages != npages)
 675                        sort_memslots(slots);
 676        }
 677
 678        slots->generation++;
 679}
 680
 681/*
 682 * Allocate some memory and give it an address in the guest physical address
 683 * space.
 684 *
 685 * Discontiguous memory is allowed, mostly for framebuffers.
 686 *
  687 * Must be called holding kvm->slots_lock for write.
 688 */
 689int __kvm_set_memory_region(struct kvm *kvm,
 690                            struct kvm_userspace_memory_region *mem,
 691                            int user_alloc)
 692{
 693        int r;
 694        gfn_t base_gfn;
 695        unsigned long npages;
 696        unsigned long i;
 697        struct kvm_memory_slot *memslot;
 698        struct kvm_memory_slot old, new;
 699        struct kvm_memslots *slots, *old_memslots;
 700
 701        r = -EINVAL;
 702        /* General sanity checks */
 703        if (mem->memory_size & (PAGE_SIZE - 1))
 704                goto out;
 705        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 706                goto out;
 707        /* We can read the guest memory with __xxx_user() later on. */
 708        if (user_alloc &&
 709            ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
 710             !access_ok(VERIFY_WRITE,
 711                        (void __user *)(unsigned long)mem->userspace_addr,
 712                        mem->memory_size)))
 713                goto out;
 714        if (mem->slot >= KVM_MEM_SLOTS_NUM)
 715                goto out;
 716        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
 717                goto out;
 718
 719        memslot = id_to_memslot(kvm->memslots, mem->slot);
 720        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
 721        npages = mem->memory_size >> PAGE_SHIFT;
 722
 723        r = -EINVAL;
 724        if (npages > KVM_MEM_MAX_NR_PAGES)
 725                goto out;
 726
 727        if (!npages)
 728                mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
 729
 730        new = old = *memslot;
 731
 732        new.id = mem->slot;
 733        new.base_gfn = base_gfn;
 734        new.npages = npages;
 735        new.flags = mem->flags;
 736
 737        /* Disallow changing a memory slot's size. */
 738        r = -EINVAL;
 739        if (npages && old.npages && npages != old.npages)
 740                goto out_free;
 741
 742        /* Check for overlaps */
 743        r = -EEXIST;
 744        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 745                struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
 746
 747                if (s == memslot || !s->npages)
 748                        continue;
 749                if (!((base_gfn + npages <= s->base_gfn) ||
 750                      (base_gfn >= s->base_gfn + s->npages)))
 751                        goto out_free;
 752        }
 753
 754        /* Free page dirty bitmap if unneeded */
 755        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
 756                new.dirty_bitmap = NULL;
 757
 758        r = -ENOMEM;
 759
 760        /* Allocate if a slot is being created */
 761        if (npages && !old.npages) {
 762                new.user_alloc = user_alloc;
 763                new.userspace_addr = mem->userspace_addr;
 764#ifndef CONFIG_S390
 765                new.rmap = vzalloc(npages * sizeof(*new.rmap));
 766                if (!new.rmap)
 767                        goto out_free;
 768#endif /* not defined CONFIG_S390 */
 769                if (kvm_arch_create_memslot(&new, npages))
 770                        goto out_free;
 771        }
 772
 773        /* Allocate page dirty bitmap if needed */
 774        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
 775                if (kvm_create_dirty_bitmap(&new) < 0)
 776                        goto out_free;
 777                /* destroy any largepage mappings for dirty tracking */
 778        }
 779
 780        if (!npages) {
 781                struct kvm_memory_slot *slot;
 782
 783                r = -ENOMEM;
 784                slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
 785                                GFP_KERNEL);
 786                if (!slots)
 787                        goto out_free;
 788                slot = id_to_memslot(slots, mem->slot);
 789                slot->flags |= KVM_MEMSLOT_INVALID;
 790
 791                update_memslots(slots, NULL);
 792
 793                old_memslots = kvm->memslots;
 794                rcu_assign_pointer(kvm->memslots, slots);
 795                synchronize_srcu_expedited(&kvm->srcu);
 796                /* From this point no new shadow pages pointing to a deleted
 797                 * memslot will be created.
 798                 *
 799                 * validation of sp->gfn happens in:
 800                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
 801                 *      - kvm_is_visible_gfn (mmu_check_roots)
 802                 */
 803                kvm_arch_flush_shadow(kvm);
 804                kfree(old_memslots);
 805        }
 806
 807        r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
 808        if (r)
 809                goto out_free;
 810
 811        /* map/unmap the pages in iommu page table */
 812        if (npages) {
 813                r = kvm_iommu_map_pages(kvm, &new);
 814                if (r)
 815                        goto out_free;
 816        } else
 817                kvm_iommu_unmap_pages(kvm, &old);
 818
 819        r = -ENOMEM;
 820        slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
 821                        GFP_KERNEL);
 822        if (!slots)
 823                goto out_free;
 824
 825        /* actual memory is freed via old in kvm_free_physmem_slot below */
 826        if (!npages) {
 827                new.rmap = NULL;
 828                new.dirty_bitmap = NULL;
 829                memset(&new.arch, 0, sizeof(new.arch));
 830        }
 831
 832        update_memslots(slots, &new);
 833        old_memslots = kvm->memslots;
 834        rcu_assign_pointer(kvm->memslots, slots);
 835        synchronize_srcu_expedited(&kvm->srcu);
 836
 837        kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 838
 839        /*
  840         * If a new memory slot is created, we need to clear all
  841         * mmio sptes.
 842         */
 843        if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
 844                kvm_arch_flush_shadow(kvm);
 845
 846        kvm_free_physmem_slot(&old, &new);
 847        kfree(old_memslots);
 848
 849        return 0;
 850
 851out_free:
 852        kvm_free_physmem_slot(&new, &old);
 853out:
 854        return r;
 855
 856}
 857EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 858
 859int kvm_set_memory_region(struct kvm *kvm,
 860                          struct kvm_userspace_memory_region *mem,
 861                          int user_alloc)
 862{
 863        int r;
 864
 865        mutex_lock(&kvm->slots_lock);
 866        r = __kvm_set_memory_region(kvm, mem, user_alloc);
 867        mutex_unlock(&kvm->slots_lock);
 868        return r;
 869}
 870EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 871
 872int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 873                                   struct
 874                                   kvm_userspace_memory_region *mem,
 875                                   int user_alloc)
 876{
 877        if (mem->slot >= KVM_MEMORY_SLOTS)
 878                return -EINVAL;
 879        return kvm_set_memory_region(kvm, mem, user_alloc);
 880}
 881
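/*
 * Copy a memslot's dirty bitmap out to userspace and report through
 * @is_dirty whether any page in the slot was dirtied.
 */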
 882int kvm_get_dirty_log(struct kvm *kvm,
 883                        struct kvm_dirty_log *log, int *is_dirty)
 884{
 885        struct kvm_memory_slot *memslot;
 886        int r, i;
 887        unsigned long n;
 888        unsigned long any = 0;
 889
 890        r = -EINVAL;
 891        if (log->slot >= KVM_MEMORY_SLOTS)
 892                goto out;
 893
 894        memslot = id_to_memslot(kvm->memslots, log->slot);
 895        r = -ENOENT;
 896        if (!memslot->dirty_bitmap)
 897                goto out;
 898
 899        n = kvm_dirty_bitmap_bytes(memslot);
 900
 901        for (i = 0; !any && i < n/sizeof(long); ++i)
 902                any = memslot->dirty_bitmap[i];
 903
 904        r = -EFAULT;
 905        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
 906                goto out;
 907
 908        if (any)
 909                *is_dirty = 1;
 910
 911        r = 0;
 912out:
 913        return r;
 914}
 915
 916bool kvm_largepages_enabled(void)
 917{
 918        return largepages_enabled;
 919}
 920
 921void kvm_disable_largepages(void)
 922{
 923        largepages_enabled = false;
 924}
 925EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 926
 927int is_error_page(struct page *page)
 928{
 929        return page == bad_page || page == hwpoison_page || page == fault_page;
 930}
 931EXPORT_SYMBOL_GPL(is_error_page);
 932
 933int is_error_pfn(pfn_t pfn)
 934{
 935        return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
 936}
 937EXPORT_SYMBOL_GPL(is_error_pfn);
 938
 939int is_hwpoison_pfn(pfn_t pfn)
 940{
 941        return pfn == hwpoison_pfn;
 942}
 943EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 944
 945int is_fault_pfn(pfn_t pfn)
 946{
 947        return pfn == fault_pfn;
 948}
 949EXPORT_SYMBOL_GPL(is_fault_pfn);
 950
 951int is_noslot_pfn(pfn_t pfn)
 952{
 953        return pfn == bad_pfn;
 954}
 955EXPORT_SYMBOL_GPL(is_noslot_pfn);
 956
 957int is_invalid_pfn(pfn_t pfn)
 958{
 959        return pfn == hwpoison_pfn || pfn == fault_pfn;
 960}
 961EXPORT_SYMBOL_GPL(is_invalid_pfn);
 962
 963static inline unsigned long bad_hva(void)
 964{
 965        return PAGE_OFFSET;
 966}
 967
 968int kvm_is_error_hva(unsigned long addr)
 969{
 970        return addr == bad_hva();
 971}
 972EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 973
 974struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 975{
 976        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
 977}
 978EXPORT_SYMBOL_GPL(gfn_to_memslot);
 979
 980int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 981{
 982        struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
 983
 984        if (!memslot || memslot->id >= KVM_MEMORY_SLOTS ||
 985              memslot->flags & KVM_MEMSLOT_INVALID)
 986                return 0;
 987
 988        return 1;
 989}
 990EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 991
 992unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
 993{
 994        struct vm_area_struct *vma;
 995        unsigned long addr, size;
 996
 997        size = PAGE_SIZE;
 998
 999        addr = gfn_to_hva(kvm, gfn);
1000        if (kvm_is_error_hva(addr))
1001                return PAGE_SIZE;
1002
1003        down_read(&current->mm->mmap_sem);
1004        vma = find_vma(current->mm, addr);
1005        if (!vma)
1006                goto out;
1007
1008        size = vma_kernel_pagesize(vma);
1009
1010out:
1011        up_read(&current->mm->mmap_sem);
1012
1013        return size;
1014}
1015
1016static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1017                                     gfn_t *nr_pages)
1018{
1019        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1020                return bad_hva();
1021
1022        if (nr_pages)
1023                *nr_pages = slot->npages - (gfn - slot->base_gfn);
1024
1025        return gfn_to_hva_memslot(slot, gfn);
1026}
1027
1028unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1029{
1030        return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1031}
1032EXPORT_SYMBOL_GPL(gfn_to_hva);
1033
1034static pfn_t get_fault_pfn(void)
1035{
1036        get_page(fault_page);
1037        return fault_pfn;
1038}
1039
1040int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1041        unsigned long start, int write, struct page **page)
1042{
1043        int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1044
1045        if (write)
1046                flags |= FOLL_WRITE;
1047
1048        return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1049}
1050
1051static inline int check_user_page_hwpoison(unsigned long addr)
1052{
1053        int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
1054
1055        rc = __get_user_pages(current, current->mm, addr, 1,
1056                              flags, NULL, NULL, NULL);
1057        return rc == -EHWPOISON;
1058}
1059
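/*
 * Translate a host virtual address to a pfn.  The fast gup path is
 * tried first; unless @atomic is set we fall back to the sleeping
 * paths, optionally deferring the fault when @async is given.
 * Failures resolve to the special hwpoison or fault pfns, and
 * VM_PFNMAP areas are translated directly without a struct page.
 */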
1060static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
1061                        bool *async, bool write_fault, bool *writable)
1062{
1063        struct page *page[1];
1064        int npages = 0;
1065        pfn_t pfn;
1066
1067        /* we can do it either atomically or asynchronously, not both */
1068        BUG_ON(atomic && async);
1069
1070        BUG_ON(!write_fault && !writable);
1071
1072        if (writable)
1073                *writable = true;
1074
1075        if (atomic || async)
1076                npages = __get_user_pages_fast(addr, 1, 1, page);
1077
1078        if (unlikely(npages != 1) && !atomic) {
1079                might_sleep();
1080
1081                if (writable)
1082                        *writable = write_fault;
1083
1084                if (async) {
1085                        down_read(&current->mm->mmap_sem);
1086                        npages = get_user_page_nowait(current, current->mm,
1087                                                     addr, write_fault, page);
1088                        up_read(&current->mm->mmap_sem);
1089                } else
1090                        npages = get_user_pages_fast(addr, 1, write_fault,
1091                                                     page);
1092
1093                /* map read fault as writable if possible */
1094                if (unlikely(!write_fault) && npages == 1) {
1095                        struct page *wpage[1];
1096
1097                        npages = __get_user_pages_fast(addr, 1, 1, wpage);
1098                        if (npages == 1) {
1099                                *writable = true;
1100                                put_page(page[0]);
1101                                page[0] = wpage[0];
1102                        }
1103                        npages = 1;
1104                }
1105        }
1106
1107        if (unlikely(npages != 1)) {
1108                struct vm_area_struct *vma;
1109
1110                if (atomic)
1111                        return get_fault_pfn();
1112
1113                down_read(&current->mm->mmap_sem);
1114                if (npages == -EHWPOISON ||
1115                        (!async && check_user_page_hwpoison(addr))) {
1116                        up_read(&current->mm->mmap_sem);
1117                        get_page(hwpoison_page);
1118                        return page_to_pfn(hwpoison_page);
1119                }
1120
1121                vma = find_vma_intersection(current->mm, addr, addr+1);
1122
1123                if (vma == NULL)
1124                        pfn = get_fault_pfn();
1125                else if ((vma->vm_flags & VM_PFNMAP)) {
1126                        pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1127                                vma->vm_pgoff;
1128                        BUG_ON(!kvm_is_mmio_pfn(pfn));
1129                } else {
1130                        if (async && (vma->vm_flags & VM_WRITE))
1131                                *async = true;
1132                        pfn = get_fault_pfn();
1133                }
1134                up_read(&current->mm->mmap_sem);
1135        } else
1136                pfn = page_to_pfn(page[0]);
1137
1138        return pfn;
1139}
1140
1141pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
1142{
1143        return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
1144}
1145EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1146
1147static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1148                          bool write_fault, bool *writable)
1149{
1150        unsigned long addr;
1151
1152        if (async)
1153                *async = false;
1154
1155        addr = gfn_to_hva(kvm, gfn);
1156        if (kvm_is_error_hva(addr)) {
1157                get_page(bad_page);
1158                return page_to_pfn(bad_page);
1159        }
1160
1161        return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1162}
1163
1164pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1165{
1166        return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1167}
1168EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1169
1170pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1171                       bool write_fault, bool *writable)
1172{
1173        return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1174}
1175EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1176
1177pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1178{
1179        return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1180}
1181EXPORT_SYMBOL_GPL(gfn_to_pfn);
1182
1183pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1184                      bool *writable)
1185{
1186        return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1187}
1188EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1189
1190pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1191                         struct kvm_memory_slot *slot, gfn_t gfn)
1192{
1193        unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1194        return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1195}
1196
1197int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1198                                                                  int nr_pages)
1199{
1200        unsigned long addr;
1201        gfn_t entry;
1202
1203        addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1204        if (kvm_is_error_hva(addr))
1205                return -1;
1206
1207        if (entry < nr_pages)
1208                return 0;
1209
1210        return __get_user_pages_fast(addr, nr_pages, 1, pages);
1211}
1212EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1213
1214struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1215{
1216        pfn_t pfn;
1217
1218        pfn = gfn_to_pfn(kvm, gfn);
1219        if (!kvm_is_mmio_pfn(pfn))
1220                return pfn_to_page(pfn);
1221
1222        WARN_ON(kvm_is_mmio_pfn(pfn));
1223
1224        get_page(bad_page);
1225        return bad_page;
1226}
1228EXPORT_SYMBOL_GPL(gfn_to_page);
1229
1230void kvm_release_page_clean(struct page *page)
1231{
1232        kvm_release_pfn_clean(page_to_pfn(page));
1233}
1234EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1235
1236void kvm_release_pfn_clean(pfn_t pfn)
1237{
1238        if (!kvm_is_mmio_pfn(pfn))
1239                put_page(pfn_to_page(pfn));
1240}
1241EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1242
1243void kvm_release_page_dirty(struct page *page)
1244{
1245        kvm_release_pfn_dirty(page_to_pfn(page));
1246}
1247EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1248
1249void kvm_release_pfn_dirty(pfn_t pfn)
1250{
1251        kvm_set_pfn_dirty(pfn);
1252        kvm_release_pfn_clean(pfn);
1253}
1254EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1255
1256void kvm_set_page_dirty(struct page *page)
1257{
1258        kvm_set_pfn_dirty(page_to_pfn(page));
1259}
1260EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1261
1262void kvm_set_pfn_dirty(pfn_t pfn)
1263{
1264        if (!kvm_is_mmio_pfn(pfn)) {
1265                struct page *page = pfn_to_page(pfn);
1266                if (!PageReserved(page))
1267                        SetPageDirty(page);
1268        }
1269}
1270EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1271
1272void kvm_set_pfn_accessed(pfn_t pfn)
1273{
1274        if (!kvm_is_mmio_pfn(pfn))
1275                mark_page_accessed(pfn_to_page(pfn));
1276}
1277EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1278
1279void kvm_get_pfn(pfn_t pfn)
1280{
1281        if (!kvm_is_mmio_pfn(pfn))
1282                get_page(pfn_to_page(pfn));
1283}
1284EXPORT_SYMBOL_GPL(kvm_get_pfn);
1285
1286static int next_segment(unsigned long len, int offset)
1287{
1288        if (len > PAGE_SIZE - offset)
1289                return PAGE_SIZE - offset;
1290        else
1291                return len;
1292}
1293
1294int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1295                        int len)
1296{
1297        int r;
1298        unsigned long addr;
1299
1300        addr = gfn_to_hva(kvm, gfn);
1301        if (kvm_is_error_hva(addr))
1302                return -EFAULT;
1303        r = __copy_from_user(data, (void __user *)addr + offset, len);
1304        if (r)
1305                return -EFAULT;
1306        return 0;
1307}
1308EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1309
1310int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1311{
1312        gfn_t gfn = gpa >> PAGE_SHIFT;
1313        int seg;
1314        int offset = offset_in_page(gpa);
1315        int ret;
1316
1317        while ((seg = next_segment(len, offset)) != 0) {
1318                ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1319                if (ret < 0)
1320                        return ret;
1321                offset = 0;
1322                len -= seg;
1323                data += seg;
1324                ++gfn;
1325        }
1326        return 0;
1327}
1328EXPORT_SYMBOL_GPL(kvm_read_guest);
1329
1330int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1331                          unsigned long len)
1332{
1333        int r;
1334        unsigned long addr;
1335        gfn_t gfn = gpa >> PAGE_SHIFT;
1336        int offset = offset_in_page(gpa);
1337
1338        addr = gfn_to_hva(kvm, gfn);
1339        if (kvm_is_error_hva(addr))
1340                return -EFAULT;
1341        pagefault_disable();
1342        r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1343        pagefault_enable();
1344        if (r)
1345                return -EFAULT;
1346        return 0;
1347}
1348EXPORT_SYMBOL(kvm_read_guest_atomic);
1349
1350int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1351                         int offset, int len)
1352{
1353        int r;
1354        unsigned long addr;
1355
1356        addr = gfn_to_hva(kvm, gfn);
1357        if (kvm_is_error_hva(addr))
1358                return -EFAULT;
1359        r = __copy_to_user((void __user *)addr + offset, data, len);
1360        if (r)
1361                return -EFAULT;
1362        mark_page_dirty(kvm, gfn);
1363        return 0;
1364}
1365EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1366
1367int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1368                    unsigned long len)
1369{
1370        gfn_t gfn = gpa >> PAGE_SHIFT;
1371        int seg;
1372        int offset = offset_in_page(gpa);
1373        int ret;
1374
1375        while ((seg = next_segment(len, offset)) != 0) {
1376                ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1377                if (ret < 0)
1378                        return ret;
1379                offset = 0;
1380                len -= seg;
1381                data += seg;
1382                ++gfn;
1383        }
1384        return 0;
1385}
1386
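/*
 * Cache the gpa -> hva translation for @gpa.  The cache records the
 * memslot generation so that users can detect a stale translation and
 * redo the lookup after the memslots have changed.
 */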
1387int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1388                              gpa_t gpa)
1389{
1390        struct kvm_memslots *slots = kvm_memslots(kvm);
1391        int offset = offset_in_page(gpa);
1392        gfn_t gfn = gpa >> PAGE_SHIFT;
1393
1394        ghc->gpa = gpa;
1395        ghc->generation = slots->generation;
1396        ghc->memslot = gfn_to_memslot(kvm, gfn);
1397        ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1398        if (!kvm_is_error_hva(ghc->hva))
1399                ghc->hva += offset;
1400        else
1401                return -EFAULT;
1402
1403        return 0;
1404}
1405EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1406
1407int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1408                           void *data, unsigned long len)
1409{
1410        struct kvm_memslots *slots = kvm_memslots(kvm);
1411        int r;
1412
1413        if (slots->generation != ghc->generation)
1414                kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1415
1416        if (kvm_is_error_hva(ghc->hva))
1417                return -EFAULT;
1418
1419        r = __copy_to_user((void __user *)ghc->hva, data, len);
1420        if (r)
1421                return -EFAULT;
1422        mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1423
1424        return 0;
1425}
1426EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1427
1428int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1429                           void *data, unsigned long len)
1430{
1431        struct kvm_memslots *slots = kvm_memslots(kvm);
1432        int r;
1433
1434        if (slots->generation != ghc->generation)
1435                kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1436
1437        if (kvm_is_error_hva(ghc->hva))
1438                return -EFAULT;
1439
1440        r = __copy_from_user(data, (void __user *)ghc->hva, len);
1441        if (r)
1442                return -EFAULT;
1443
1444        return 0;
1445}
1446EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
1447
1448int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1449{
1450        return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1451                                    offset, len);
1452}
1453EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1454
1455int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1456{
1457        gfn_t gfn = gpa >> PAGE_SHIFT;
1458        int seg;
1459        int offset = offset_in_page(gpa);
1460        int ret;
1461
1462        while ((seg = next_segment(len, offset)) != 0) {
1463                ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1464                if (ret < 0)
1465                        return ret;
1466                offset = 0;
1467                len -= seg;
1468                ++gfn;
1469        }
1470        return 0;
1471}
1472EXPORT_SYMBOL_GPL(kvm_clear_guest);
1473
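/*
 * Set the dirty bit for @gfn in the slot's dirty bitmap, counting the
 * page only the first time it is dirtied since the bitmap was cleared.
 */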
1474void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1475                             gfn_t gfn)
1476{
1477        if (memslot && memslot->dirty_bitmap) {
1478                unsigned long rel_gfn = gfn - memslot->base_gfn;
1479
1480                if (!test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap))
1481                        memslot->nr_dirty_pages++;
1482        }
1483}
1484
1485void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1486{
1487        struct kvm_memory_slot *memslot;
1488
1489        memslot = gfn_to_memslot(kvm, gfn);
1490        mark_page_dirty_in_slot(kvm, memslot, gfn);
1491}
1492
1493/*
1494 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1495 */
1496void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1497{
1498        DEFINE_WAIT(wait);
1499
1500        for (;;) {
1501                prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1502
1503                if (kvm_arch_vcpu_runnable(vcpu)) {
1504                        kvm_make_request(KVM_REQ_UNHALT, vcpu);
1505                        break;
1506                }
1507                if (kvm_cpu_has_pending_timer(vcpu))
1508                        break;
1509                if (signal_pending(current))
1510                        break;
1511
1512                schedule();
1513        }
1514
1515        finish_wait(&vcpu->wq, &wait);
1516}
1517
1518void kvm_resched(struct kvm_vcpu *vcpu)
1519{
1520        if (!need_resched())
1521                return;
1522        cond_resched();
1523}
1524EXPORT_SYMBOL_GPL(kvm_resched);
1525
1526void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1527{
1528        struct kvm *kvm = me->kvm;
1529        struct kvm_vcpu *vcpu;
1530        int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1531        int yielded = 0;
1532        int pass;
1533        int i;
1534
1535        /*
1536         * We boost the priority of a VCPU that is runnable but not
1537         * currently running, because it got preempted by something
1538         * else and called schedule in __vcpu_run.  Hopefully that
1539         * VCPU is holding the lock that we need and will release it.
1540         * We approximate round-robin by starting at the last boosted VCPU.
1541         */
1542        for (pass = 0; pass < 2 && !yielded; pass++) {
1543                kvm_for_each_vcpu(i, vcpu, kvm) {
1544                        struct task_struct *task = NULL;
1545                        struct pid *pid;
1546                        if (!pass && i < last_boosted_vcpu) {
1547                                i = last_boosted_vcpu;
1548                                continue;
1549                        } else if (pass && i > last_boosted_vcpu)
1550                                break;
1551                        if (vcpu == me)
1552                                continue;
1553                        if (waitqueue_active(&vcpu->wq))
1554                                continue;
1555                        rcu_read_lock();
1556                        pid = rcu_dereference(vcpu->pid);
1557                        if (pid)
 1558                                task = get_pid_task(pid, PIDTYPE_PID);
1559                        rcu_read_unlock();
1560                        if (!task)
1561                                continue;
1562                        if (task->flags & PF_VCPU) {
1563                                put_task_struct(task);
1564                                continue;
1565                        }
1566                        if (yield_to(task, 1)) {
1567                                put_task_struct(task);
1568                                kvm->last_boosted_vcpu = i;
1569                                yielded = 1;
1570                                break;
1571                        }
1572                        put_task_struct(task);
1573                }
1574        }
1575}
1576EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1577
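/*
 * Page fault handler for the vcpu fd mapping: offset 0 maps the kvm_run
 * structure, followed by the x86 PIO data page and the coalesced MMIO
 * ring when they are configured.
 */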
1578static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1579{
1580        struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1581        struct page *page;
1582
1583        if (vmf->pgoff == 0)
1584                page = virt_to_page(vcpu->run);
1585#ifdef CONFIG_X86
1586        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1587                page = virt_to_page(vcpu->arch.pio_data);
1588#endif
1589#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1590        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1591                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1592#endif
1593        else
1594                return kvm_arch_vcpu_fault(vcpu, vmf);
1595        get_page(page);
1596        vmf->page = page;
1597        return 0;
1598}
1599
1600static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1601        .fault = kvm_vcpu_fault,
1602};
1603
1604static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1605{
1606        vma->vm_ops = &kvm_vcpu_vm_ops;
1607        return 0;
1608}
1609
1610static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1611{
1612        struct kvm_vcpu *vcpu = filp->private_data;
1613
1614        kvm_put_kvm(vcpu->kvm);
1615        return 0;
1616}
1617
1618static struct file_operations kvm_vcpu_fops = {
1619        .release        = kvm_vcpu_release,
1620        .unlocked_ioctl = kvm_vcpu_ioctl,
1621#ifdef CONFIG_COMPAT
1622        .compat_ioctl   = kvm_vcpu_compat_ioctl,
1623#endif
1624        .mmap           = kvm_vcpu_mmap,
1625        .llseek         = noop_llseek,
1626};
1627
1628/*
1629 * Allocates an inode for the vcpu.
1630 */
1631static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1632{
1633        return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1634}
1635
1636/*
1637 * Creates some virtual cpus.  Good luck creating more than one.
1638 */
1639static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1640{
1641        int r;
1642        struct kvm_vcpu *vcpu, *v;
1643
1644        vcpu = kvm_arch_vcpu_create(kvm, id);
1645        if (IS_ERR(vcpu))
1646                return PTR_ERR(vcpu);
1647
1648        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1649
1650        r = kvm_arch_vcpu_setup(vcpu);
1651        if (r)
1652                goto vcpu_destroy;
1653
1654        mutex_lock(&kvm->lock);
1655        if (!kvm_vcpu_compatible(vcpu)) {
1656                r = -EINVAL;
1657                goto unlock_vcpu_destroy;
1658        }
1659        if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1660                r = -EINVAL;
1661                goto unlock_vcpu_destroy;
1662        }
1663
1664        kvm_for_each_vcpu(r, v, kvm)
1665                if (v->vcpu_id == id) {
1666                        r = -EEXIST;
1667                        goto unlock_vcpu_destroy;
1668                }
1669
1670        BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1671
1672        /* Now it's all set up, let userspace reach it */
1673        kvm_get_kvm(kvm);
1674        r = create_vcpu_fd(vcpu);
1675        if (r < 0) {
1676                kvm_put_kvm(kvm);
1677                goto unlock_vcpu_destroy;
1678        }
1679
1680        kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1681        smp_wmb();
1682        atomic_inc(&kvm->online_vcpus);
1683
1684        mutex_unlock(&kvm->lock);
1685        return r;
1686
1687unlock_vcpu_destroy:
1688        mutex_unlock(&kvm->lock);
1689vcpu_destroy:
1690        kvm_arch_vcpu_destroy(vcpu);
1691        return r;
1692}
1693
1694static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1695{
1696        if (sigset) {
1697                sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1698                vcpu->sigset_active = 1;
1699                vcpu->sigset = *sigset;
1700        } else
1701                vcpu->sigset_active = 0;
1702        return 0;
1703}
1704
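/*
 * Dispatch vcpu ioctls.  Apart from the few asynchronous ioctls that
 * are special-cased before vcpu_load(), the vcpu is loaded for the
 * duration of the ioctl.
 */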
1705static long kvm_vcpu_ioctl(struct file *filp,
1706                           unsigned int ioctl, unsigned long arg)
1707{
1708        struct kvm_vcpu *vcpu = filp->private_data;
1709        void __user *argp = (void __user *)arg;
1710        int r;
1711        struct kvm_fpu *fpu = NULL;
1712        struct kvm_sregs *kvm_sregs = NULL;
1713
1714        if (vcpu->kvm->mm != current->mm)
1715                return -EIO;
1716
1717#if defined(CONFIG_S390) || defined(CONFIG_PPC)
1718        /*
1719         * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1720         * so vcpu_load() would break them.
1721         */
1722        if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1723                return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1724#endif
1725
1726
1727        vcpu_load(vcpu);
1728        switch (ioctl) {
1729        case KVM_RUN:
1730                r = -EINVAL;
1731                if (arg)
1732                        goto out;
1733                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1734                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1735                break;
1736        case KVM_GET_REGS: {
1737                struct kvm_regs *kvm_regs;
1738
1739                r = -ENOMEM;
1740                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1741                if (!kvm_regs)
1742                        goto out;
1743                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1744                if (r)
1745                        goto out_free1;
1746                r = -EFAULT;
1747                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1748                        goto out_free1;
1749                r = 0;
1750out_free1:
1751                kfree(kvm_regs);
1752                break;
1753        }
1754        case KVM_SET_REGS: {
1755                struct kvm_regs *kvm_regs;
1756
1757                r = -ENOMEM;
1758                kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
1759                if (IS_ERR(kvm_regs)) {
1760                        r = PTR_ERR(kvm_regs);
1761                        goto out;
1762                }
1763                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1764                if (r)
1765                        goto out_free2;
1766                r = 0;
1767out_free2:
1768                kfree(kvm_regs);
1769                break;
1770        }
1771        case KVM_GET_SREGS: {
1772                kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1773                r = -ENOMEM;
1774                if (!kvm_sregs)
1775                        goto out;
1776                r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1777                if (r)
1778                        goto out;
1779                r = -EFAULT;
1780                if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1781                        goto out;
1782                r = 0;
1783                break;
1784        }
1785        case KVM_SET_SREGS: {
1786                kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
1787                if (IS_ERR(kvm_sregs)) {
1788                        r = PTR_ERR(kvm_sregs);
1789                        goto out;
1790                }
1791                r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1792                if (r)
1793                        goto out;
1794                r = 0;
1795                break;
1796        }
1797        case KVM_GET_MP_STATE: {
1798                struct kvm_mp_state mp_state;
1799
1800                r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1801                if (r)
1802                        goto out;
1803                r = -EFAULT;
1804                if (copy_to_user(argp, &mp_state, sizeof mp_state))
1805                        goto out;
1806                r = 0;
1807                break;
1808        }
1809        case KVM_SET_MP_STATE: {
1810                struct kvm_mp_state mp_state;
1811
1812                r = -EFAULT;
1813                if (copy_from_user(&mp_state, argp, sizeof mp_state))
1814                        goto out;
1815                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1816                if (r)
1817                        goto out;
1818                r = 0;
1819                break;
1820        }
1821        case KVM_TRANSLATE: {
1822                struct kvm_translation tr;
1823
1824                r = -EFAULT;
1825                if (copy_from_user(&tr, argp, sizeof tr))
1826                        goto out;
1827                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
1828                if (r)
1829                        goto out;
1830                r = -EFAULT;
1831                if (copy_to_user(argp, &tr, sizeof tr))
1832                        goto out;
1833                r = 0;
1834                break;
1835        }
1836        case KVM_SET_GUEST_DEBUG: {
1837                struct kvm_guest_debug dbg;
1838
1839                r = -EFAULT;
1840                if (copy_from_user(&dbg, argp, sizeof dbg))
1841                        goto out;
1842                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1843                if (r)
1844                        goto out;
1845                r = 0;
1846                break;
1847        }
1848        case KVM_SET_SIGNAL_MASK: {
1849                struct kvm_signal_mask __user *sigmask_arg = argp;
1850                struct kvm_signal_mask kvm_sigmask;
1851                sigset_t sigset, *p;
1852
1853                p = NULL;
1854                if (argp) {
1855                        r = -EFAULT;
1856                        if (copy_from_user(&kvm_sigmask, argp,
1857                                           sizeof kvm_sigmask))
1858                                goto out;
1859                        r = -EINVAL;
1860                        if (kvm_sigmask.len != sizeof sigset)
1861                                goto out;
1862                        r = -EFAULT;
1863                        if (copy_from_user(&sigset, sigmask_arg->sigset,
1864                                           sizeof sigset))
1865                                goto out;
1866                        p = &sigset;
1867                }
1868                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1869                break;
1870        }
1871        case KVM_GET_FPU: {
1872                fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1873                r = -ENOMEM;
1874                if (!fpu)
1875                        goto out;
1876                r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
1877                if (r)
1878                        goto out;
1879                r = -EFAULT;
1880                if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
1881                        goto out;
1882                r = 0;
1883                break;
1884        }
1885        case KVM_SET_FPU: {
1886                fpu = memdup_user(argp, sizeof(*fpu));
1887                if (IS_ERR(fpu)) {
1888                        r = PTR_ERR(fpu);
1889                        goto out;
1890                }
1891                r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1892                if (r)
1893                        goto out;
1894                r = 0;
1895                break;
1896        }
1897        default:
1898                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1899        }
1900out:
1901        vcpu_put(vcpu);
1902        kfree(fpu);
1903        kfree(kvm_sregs);
1904        return r;
1905}
1906
1907#ifdef CONFIG_COMPAT
1908static long kvm_vcpu_compat_ioctl(struct file *filp,
1909                                  unsigned int ioctl, unsigned long arg)
1910{
1911        struct kvm_vcpu *vcpu = filp->private_data;
1912        void __user *argp = compat_ptr(arg);
1913        int r;
1914
1915        if (vcpu->kvm->mm != current->mm)
1916                return -EIO;
1917
1918        switch (ioctl) {
1919        case KVM_SET_SIGNAL_MASK: {
1920                struct kvm_signal_mask __user *sigmask_arg = argp;
1921                struct kvm_signal_mask kvm_sigmask;
1922                compat_sigset_t csigset;
1923                sigset_t sigset;
1924
1925                if (argp) {
1926                        r = -EFAULT;
1927                        if (copy_from_user(&kvm_sigmask, argp,
1928                                           sizeof kvm_sigmask))
1929                                goto out;
1930                        r = -EINVAL;
1931                        if (kvm_sigmask.len != sizeof csigset)
1932                                goto out;
1933                        r = -EFAULT;
1934                        if (copy_from_user(&csigset, sigmask_arg->sigset,
1935                                           sizeof csigset))
1936                                goto out;
1937                        sigset_from_compat(&sigset, &csigset);
1938                }
1939                r = kvm_vcpu_ioctl_set_sigmask(vcpu, argp ? &sigset : NULL);
1940                break;
1941        }
1942        default:
1943                r = kvm_vcpu_ioctl(filp, ioctl, arg);
1944        }
1945
1946out:
1947        return r;
1948}
1949#endif
1950
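/*
 * Dispatcher for ioctls issued on a VM fd: vcpu creation, memory slot setup,
 * dirty logging, coalesced MMIO, irqfd/ioeventfd and, as a fallback,
 * arch-specific and assigned-device ioctls.
 *
 * Illustrative userspace sketch for registering guest memory, where mem and
 * mem_size are placeholders for the caller's backing mapping (not part of
 * this file):
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = mem_size,
 *		.userspace_addr  = (__u64)(unsigned long)mem,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */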
1951static long kvm_vm_ioctl(struct file *filp,
1952                           unsigned int ioctl, unsigned long arg)
1953{
1954        struct kvm *kvm = filp->private_data;
1955        void __user *argp = (void __user *)arg;
1956        int r;
1957
1958        if (kvm->mm != current->mm)
1959                return -EIO;
1960        switch (ioctl) {
1961        case KVM_CREATE_VCPU:
1962                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
1963                if (r < 0)
1964                        goto out;
1965                break;
1966        case KVM_SET_USER_MEMORY_REGION: {
1967                struct kvm_userspace_memory_region kvm_userspace_mem;
1968
1969                r = -EFAULT;
1970                if (copy_from_user(&kvm_userspace_mem, argp,
1971                                                sizeof kvm_userspace_mem))
1972                        goto out;
1973
1974                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
1975                if (r)
1976                        goto out;
1977                break;
1978        }
1979        case KVM_GET_DIRTY_LOG: {
1980                struct kvm_dirty_log log;
1981
1982                r = -EFAULT;
1983                if (copy_from_user(&log, argp, sizeof log))
1984                        goto out;
1985                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
1986                if (r)
1987                        goto out;
1988                break;
1989        }
1990#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1991        case KVM_REGISTER_COALESCED_MMIO: {
1992                struct kvm_coalesced_mmio_zone zone;
1993                r = -EFAULT;
1994                if (copy_from_user(&zone, argp, sizeof zone))
1995                        goto out;
1996                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
1997                if (r)
1998                        goto out;
1999                r = 0;
2000                break;
2001        }
2002        case KVM_UNREGISTER_COALESCED_MMIO: {
2003                struct kvm_coalesced_mmio_zone zone;
2004                r = -EFAULT;
2005                if (copy_from_user(&zone, argp, sizeof zone))
2006                        goto out;
2007                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2008                if (r)
2009                        goto out;
2010                r = 0;
2011                break;
2012        }
2013#endif
2014        case KVM_IRQFD: {
2015                struct kvm_irqfd data;
2016
2017                r = -EFAULT;
2018                if (copy_from_user(&data, argp, sizeof data))
2019                        goto out;
2020                r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
2021                break;
2022        }
2023        case KVM_IOEVENTFD: {
2024                struct kvm_ioeventfd data;
2025
2026                r = -EFAULT;
2027                if (copy_from_user(&data, argp, sizeof data))
2028                        goto out;
2029                r = kvm_ioeventfd(kvm, &data);
2030                break;
2031        }
2032#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2033        case KVM_SET_BOOT_CPU_ID:
2034                r = 0;
2035                mutex_lock(&kvm->lock);
2036                if (atomic_read(&kvm->online_vcpus) != 0)
2037                        r = -EBUSY;
2038                else
2039                        kvm->bsp_vcpu_id = arg;
2040                mutex_unlock(&kvm->lock);
2041                break;
2042#endif
2043        default:
2044                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2045                if (r == -ENOTTY)
2046                        r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
2047        }
2048out:
2049        return r;
2050}
2051
2052#ifdef CONFIG_COMPAT
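/*
 * 32-bit userspace passes a 32-bit pointer inside struct kvm_dirty_log, so
 * the layout differs from the native 64-bit one.  Translate the compat
 * layout here and hand the native struct to kvm_vm_ioctl_get_dirty_log().
 */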
2053struct compat_kvm_dirty_log {
2054        __u32 slot;
2055        __u32 padding1;
2056        union {
2057                compat_uptr_t dirty_bitmap; /* one bit per page */
2058                __u64 padding2;
2059        };
2060};
2061
2062static long kvm_vm_compat_ioctl(struct file *filp,
2063                           unsigned int ioctl, unsigned long arg)
2064{
2065        struct kvm *kvm = filp->private_data;
2066        int r;
2067
2068        if (kvm->mm != current->mm)
2069                return -EIO;
2070        switch (ioctl) {
2071        case KVM_GET_DIRTY_LOG: {
2072                struct compat_kvm_dirty_log compat_log;
2073                struct kvm_dirty_log log;
2074
2075                r = -EFAULT;
2076                if (copy_from_user(&compat_log, (void __user *)arg,
2077                                   sizeof(compat_log)))
2078                        goto out;
2079                log.slot         = compat_log.slot;
2080                log.padding1     = compat_log.padding1;
2081                log.padding2     = compat_log.padding2;
2082                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2083
2084                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2085                if (r)
2086                        goto out;
2087                break;
2088        }
2089        default:
2090                r = kvm_vm_ioctl(filp, ioctl, arg);
2091        }
2092
2093out:
2094        return r;
2095}
2096#endif
2097
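/*
 * Fault handler for mmap() on a VM fd: the page offset is interpreted as a
 * guest frame number, translated to a host virtual address and pinned with
 * get_user_pages() so the faulting mapping references guest memory directly.
 */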
2098static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2099{
2100        struct page *page[1];
2101        unsigned long addr;
2102        int npages;
2103        gfn_t gfn = vmf->pgoff;
2104        struct kvm *kvm = vma->vm_file->private_data;
2105
2106        addr = gfn_to_hva(kvm, gfn);
2107        if (kvm_is_error_hva(addr))
2108                return VM_FAULT_SIGBUS;
2109
2110        npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
2111                                NULL);
2112        if (unlikely(npages != 1))
2113                return VM_FAULT_SIGBUS;
2114
2115        vmf->page = page[0];
2116        return 0;
2117}
2118
2119static const struct vm_operations_struct kvm_vm_vm_ops = {
2120        .fault = kvm_vm_fault,
2121};
2122
2123static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2124{
2125        vma->vm_ops = &kvm_vm_vm_ops;
2126        return 0;
2127}
2128
2129static struct file_operations kvm_vm_fops = {
2130        .release        = kvm_vm_release,
2131        .unlocked_ioctl = kvm_vm_ioctl,
2132#ifdef CONFIG_COMPAT
2133        .compat_ioctl   = kvm_vm_compat_ioctl,
2134#endif
2135        .mmap           = kvm_vm_mmap,
2136        .llseek         = noop_llseek,
2137};
2138
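/*
 * KVM_CREATE_VM: build the struct kvm, set up the coalesced MMIO ring when
 * configured, and hand userspace an anonymous-inode fd whose file operations
 * are kvm_vm_fops.
 */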
2139static int kvm_dev_ioctl_create_vm(unsigned long type)
2140{
2141        int r;
2142        struct kvm *kvm;
2143
2144        kvm = kvm_create_vm(type);
2145        if (IS_ERR(kvm))
2146                return PTR_ERR(kvm);
2147#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2148        r = kvm_coalesced_mmio_init(kvm);
2149        if (r < 0) {
2150                kvm_put_kvm(kvm);
2151                return r;
2152        }
2153#endif
2154        r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
2155        if (r < 0)
2156                kvm_put_kvm(kvm);
2157
2158        return r;
2159}
2160
2161static long kvm_dev_ioctl_check_extension_generic(long arg)
2162{
2163        switch (arg) {
2164        case KVM_CAP_USER_MEMORY:
2165        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2166        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2167#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2168        case KVM_CAP_SET_BOOT_CPU_ID:
2169#endif
2170        case KVM_CAP_INTERNAL_ERROR_DATA:
2171                return 1;
2172#ifdef CONFIG_HAVE_KVM_IRQCHIP
2173        case KVM_CAP_IRQ_ROUTING:
2174                return KVM_MAX_IRQ_ROUTES;
2175#endif
2176        default:
2177                break;
2178        }
2179        return kvm_dev_ioctl_check_extension(arg);
2180}
2181
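/*
 * ioctls on /dev/kvm itself: API version and extension queries, VM creation
 * and the size userspace must mmap() per vcpu.
 *
 * Illustrative userspace sketch of the usual bring-up sequence (not part of
 * this file):
 *
 *	int kvm_fd  = open("/dev/kvm", O_RDWR);
 *	int version = ioctl(kvm_fd, KVM_GET_API_VERSION, 0);
 *	int vm_fd   = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 */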
2182static long kvm_dev_ioctl(struct file *filp,
2183                          unsigned int ioctl, unsigned long arg)
2184{
2185        long r = -EINVAL;
2186
2187        switch (ioctl) {
2188        case KVM_GET_API_VERSION:
2189                r = -EINVAL;
2190                if (arg)
2191                        goto out;
2192                r = KVM_API_VERSION;
2193                break;
2194        case KVM_CREATE_VM:
2195                r = kvm_dev_ioctl_create_vm(arg);
2196                break;
2197        case KVM_CHECK_EXTENSION:
2198                r = kvm_dev_ioctl_check_extension_generic(arg);
2199                break;
2200        case KVM_GET_VCPU_MMAP_SIZE:
2201                r = -EINVAL;
2202                if (arg)
2203                        goto out;
2204                r = PAGE_SIZE;     /* struct kvm_run */
2205#ifdef CONFIG_X86
2206                r += PAGE_SIZE;    /* pio data page */
2207#endif
2208#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2209                r += PAGE_SIZE;    /* coalesced mmio ring page */
2210#endif
2211                break;
2212        case KVM_TRACE_ENABLE:
2213        case KVM_TRACE_PAUSE:
2214        case KVM_TRACE_DISABLE:
2215                r = -EOPNOTSUPP;
2216                break;
2217        default:
2218                return kvm_arch_dev_ioctl(filp, ioctl, arg);
2219        }
2220out:
2221        return r;
2222}
2223
2224static struct file_operations kvm_chardev_ops = {
2225        .unlocked_ioctl = kvm_dev_ioctl,
2226        .compat_ioctl   = kvm_dev_ioctl,
2227        .llseek         = noop_llseek,
2228};
2229
2230static struct miscdevice kvm_dev = {
2231        KVM_MINOR,
2232        "kvm",
2233        &kvm_chardev_ops,
2234};
2235
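/*
 * Enable virtualization extensions on the current CPU, tracking which CPUs
 * already have them enabled in cpus_hardware_enabled.  Failures are counted
 * in hardware_enable_failed so hardware_enable_all() can back out.
 */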
2236static void hardware_enable_nolock(void *junk)
2237{
2238        int cpu = raw_smp_processor_id();
2239        int r;
2240
2241        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2242                return;
2243
2244        cpumask_set_cpu(cpu, cpus_hardware_enabled);
2245
2246        r = kvm_arch_hardware_enable(NULL);
2247
2248        if (r) {
2249                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2250                atomic_inc(&hardware_enable_failed);
2251                printk(KERN_INFO "kvm: enabling virtualization on "
2252                                 "CPU%d failed\n", cpu);
2253        }
2254}
2255
2256static void hardware_enable(void *junk)
2257{
2258        raw_spin_lock(&kvm_lock);
2259        hardware_enable_nolock(junk);
2260        raw_spin_unlock(&kvm_lock);
2261}
2262
2263static void hardware_disable_nolock(void *junk)
2264{
2265        int cpu = raw_smp_processor_id();
2266
2267        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2268                return;
2269        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2270        kvm_arch_hardware_disable(NULL);
2271}
2272
2273static void hardware_disable(void *junk)
2274{
2275        raw_spin_lock(&kvm_lock);
2276        hardware_disable_nolock(junk);
2277        raw_spin_unlock(&kvm_lock);
2278}
2279
2280static void hardware_disable_all_nolock(void)
2281{
2282        BUG_ON(!kvm_usage_count);
2283
2284        kvm_usage_count--;
2285        if (!kvm_usage_count)
2286                on_each_cpu(hardware_disable_nolock, NULL, 1);
2287}
2288
2289static void hardware_disable_all(void)
2290{
2291        raw_spin_lock(&kvm_lock);
2292        hardware_disable_all_nolock();
2293        raw_spin_unlock(&kvm_lock);
2294}
2295
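/*
 * kvm_usage_count tracks how many users currently need virtualization
 * enabled: the first hardware_enable_all() turns it on for every online CPU
 * and the last hardware_disable_all() turns it off again.
 */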
2296static int hardware_enable_all(void)
2297{
2298        int r = 0;
2299
2300        raw_spin_lock(&kvm_lock);
2301
2302        kvm_usage_count++;
2303        if (kvm_usage_count == 1) {
2304                atomic_set(&hardware_enable_failed, 0);
2305                on_each_cpu(hardware_enable_nolock, NULL, 1);
2306
2307                if (atomic_read(&hardware_enable_failed)) {
2308                        hardware_disable_all_nolock();
2309                        r = -EBUSY;
2310                }
2311        }
2312
2313        raw_spin_unlock(&kvm_lock);
2314
2315        return r;
2316}
2317
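/*
 * CPU hotplug callback: disable virtualization on CPUs going down and
 * re-enable it on CPUs coming up, but only while virtualization is in use
 * (kvm_usage_count != 0).
 */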
2318static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2319                           void *v)
2320{
2321        int cpu = (long)v;
2322
2323        if (!kvm_usage_count)
2324                return NOTIFY_OK;
2325
2326        val &= ~CPU_TASKS_FROZEN;
2327        switch (val) {
2328        case CPU_DYING:
2329                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2330                       cpu);
2331                hardware_disable(NULL);
2332                break;
2333        case CPU_STARTING:
2334                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2335                       cpu);
2336                hardware_enable(NULL);
2337                break;
2338        }
2339        return NOTIFY_OK;
2340}
2341
2342
2343asmlinkage void kvm_spurious_fault(void)
2344{
2345        /* Fault while not rebooting.  We want the trace. */
2346        BUG();
2347}
2348EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2349
2350static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2351                      void *v)
2352{
2353        /*
2354         * Some BIOSes (well, at least the author's) hang on reboot if
2355         * the CPU is still in VMX root mode.
2356         *
2357         * Intel TXT also requires VMX to be off on all CPUs at shutdown.
2358         */
2359        printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2360        kvm_rebooting = true;
2361        on_each_cpu(hardware_disable_nolock, NULL, 1);
2362        return NOTIFY_OK;
2363}
2364
2365static struct notifier_block kvm_reboot_notifier = {
2366        .notifier_call = kvm_reboot,
2367        .priority = 0,
2368};
2369
2370static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2371{
2372        int i;
2373
2374        for (i = 0; i < bus->dev_count; i++) {
2375                struct kvm_io_device *pos = bus->range[i].dev;
2376
2377                kvm_iodevice_destructor(pos);
2378        }
2379        kfree(bus);
2380}
2381
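/*
 * Comparison function shared by sort() and bsearch() on a bus' range array.
 * A range compares less than another if it starts lower and greater if it
 * ends higher; otherwise the two compare equal.  A lookup key therefore
 * matches any registered range that covers it, which is what the bsearch()
 * in kvm_io_bus_get_first_dev() relies on.
 */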
2382int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2383{
2384        const struct kvm_io_range *r1 = p1;
2385        const struct kvm_io_range *r2 = p2;
2386
2387        if (r1->addr < r2->addr)
2388                return -1;
2389        if (r1->addr + r1->len > r2->addr + r2->len)
2390                return 1;
2391        return 0;
2392}
2393
2394int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2395                          gpa_t addr, int len)
2396{
2397        if (bus->dev_count == NR_IOBUS_DEVS)
2398                return -ENOSPC;
2399
2400        bus->range[bus->dev_count++] = (struct kvm_io_range) {
2401                .addr = addr,
2402                .len = len,
2403                .dev = dev,
2404        };
2405
2406        sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
2407                kvm_io_bus_sort_cmp, NULL);
2408
2409        return 0;
2410}
2411
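/*
 * Find the index of the first registered range that covers [addr, addr+len).
 * bsearch() may land on any of several equal-comparing entries, so walk
 * backwards to the first one; the callers then iterate forward over all
 * matches.
 */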
2412int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2413                             gpa_t addr, int len)
2414{
2415        struct kvm_io_range *range, key;
2416        int off;
2417
2418        key = (struct kvm_io_range) {
2419                .addr = addr,
2420                .len = len,
2421        };
2422
2423        range = bsearch(&key, bus->range, bus->dev_count,
2424                        sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
2425        if (range == NULL)
2426                return -ENOENT;
2427
2428        off = range - bus->range;
2429
2430        while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
2431                off--;
2432
2433        return off;
2434}
2435
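/*
 * Device dispatch for guest MMIO/PIO: kvm_io_bus_write() and kvm_io_bus_read()
 * try every device whose registered range covers the access, in array order,
 * until one accepts it (returns 0), and report -EOPNOTSUPP otherwise.
 */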
2436/* kvm_io_bus_write - called under kvm->slots_lock */
2437int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2438                     int len, const void *val)
2439{
2440        int idx;
2441        struct kvm_io_bus *bus;
2442        struct kvm_io_range range;
2443
2444        range = (struct kvm_io_range) {
2445                .addr = addr,
2446                .len = len,
2447        };
2448
2449        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2450        idx = kvm_io_bus_get_first_dev(bus, addr, len);
2451        if (idx < 0)
2452                return -EOPNOTSUPP;
2453
2454        while (idx < bus->dev_count &&
2455                kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2456                if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
2457                        return 0;
2458                idx++;
2459        }
2460
2461        return -EOPNOTSUPP;
2462}
2463
2464/* kvm_io_bus_read - called under kvm->slots_lock */
2465int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2466                    int len, void *val)
2467{
2468        int idx;
2469        struct kvm_io_bus *bus;
2470        struct kvm_io_range range;
2471
2472        range = (struct kvm_io_range) {
2473                .addr = addr,
2474                .len = len,
2475        };
2476
2477        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2478        idx = kvm_io_bus_get_first_dev(bus, addr, len);
2479        if (idx < 0)
2480                return -EOPNOTSUPP;
2481
2482        while (idx < bus->dev_count &&
2483                kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2484                if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
2485                        return 0;
2486                idx++;
2487        }
2488
2489        return -EOPNOTSUPP;
2490}
2491
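/*
 * Bus updates follow an RCU-like scheme: build a modified copy of the bus,
 * publish it with rcu_assign_pointer() and wait for an SRCU grace period
 * before freeing the old copy, so readers under kvm->srcu always see a
 * consistent kvm_io_bus.
 */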
2492/* Caller must hold slots_lock. */
2493int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2494                            int len, struct kvm_io_device *dev)
2495{
2496        struct kvm_io_bus *new_bus, *bus;
2497
2498        bus = kvm->buses[bus_idx];
2499        if (bus->dev_count > NR_IOBUS_DEVS-1)
2500                return -ENOSPC;
2501
2502        new_bus = kmemdup(bus, sizeof(struct kvm_io_bus), GFP_KERNEL);
2503        if (!new_bus)
2504                return -ENOMEM;
2505        kvm_io_bus_insert_dev(new_bus, dev, addr, len);
2506        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2507        synchronize_srcu_expedited(&kvm->srcu);
2508        kfree(bus);
2509
2510        return 0;
2511}
2512
2513/* Caller must hold slots_lock. */
2514int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2515                              struct kvm_io_device *dev)
2516{
2517        int i, r;
2518        struct kvm_io_bus *new_bus, *bus;
2519
2520        bus = kvm->buses[bus_idx];
2521
2522        new_bus = kmemdup(bus, sizeof(*bus), GFP_KERNEL);
2523        if (!new_bus)
2524                return -ENOMEM;
2525
2526        r = -ENOENT;
2527        for (i = 0; i < new_bus->dev_count; i++)
2528                if (new_bus->range[i].dev == dev) {
2529                        r = 0;
2530                        new_bus->dev_count--;
2531                        new_bus->range[i] = new_bus->range[new_bus->dev_count];
2532                        sort(new_bus->range, new_bus->dev_count,
2533                             sizeof(struct kvm_io_range),
2534                             kvm_io_bus_sort_cmp, NULL);
2535                        break;
2536                }
2537
2538        if (r) {
2539                kfree(new_bus);
2540                return r;
2541        }
2542
2543        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2544        synchronize_srcu_expedited(&kvm->srcu);
2545        kfree(bus);
2546        return r;
2547}
2548
2549static struct notifier_block kvm_cpu_notifier = {
2550        .notifier_call = kvm_cpu_hotplug,
2551};
2552
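/*
 * debugfs statistics: each file's private data is the byte offset of a u32
 * counter inside struct kvm or struct kvm_vcpu; reading the file sums that
 * counter over all VMs (and, for vcpu stats, all their vcpus).
 */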
2553static int vm_stat_get(void *_offset, u64 *val)
2554{
2555        unsigned offset = (long)_offset;
2556        struct kvm *kvm;
2557
2558        *val = 0;
2559        raw_spin_lock(&kvm_lock);
2560        list_for_each_entry(kvm, &vm_list, vm_list)
2561                *val += *(u32 *)((void *)kvm + offset);
2562        raw_spin_unlock(&kvm_lock);
2563        return 0;
2564}
2565
2566DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2567
2568static int vcpu_stat_get(void *_offset, u64 *val)
2569{
2570        unsigned offset = (long)_offset;
2571        struct kvm *kvm;
2572        struct kvm_vcpu *vcpu;
2573        int i;
2574
2575        *val = 0;
2576        raw_spin_lock(&kvm_lock);
2577        list_for_each_entry(kvm, &vm_list, vm_list)
2578                kvm_for_each_vcpu(i, vcpu, kvm)
2579                        *val += *(u32 *)((void *)vcpu + offset);
2580
2581        raw_spin_unlock(&kvm_lock);
2582        return 0;
2583}
2584
2585DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2586
2587static const struct file_operations *stat_fops[] = {
2588        [KVM_STAT_VCPU] = &vcpu_stat_fops,
2589        [KVM_STAT_VM]   = &vm_stat_fops,
2590};
2591
2592static int kvm_init_debug(void)
2593{
2594        int r = -EFAULT;
2595        struct kvm_stats_debugfs_item *p;
2596
2597        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2598        if (kvm_debugfs_dir == NULL)
2599                goto out;
2600
2601        for (p = debugfs_entries; p->name; ++p) {
2602                p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2603                                                (void *)(long)p->offset,
2604                                                stat_fops[p->kind]);
2605                if (p->dentry == NULL)
2606                        goto out_dir;
2607        }
2608
2609        return 0;
2610
2611out_dir:
2612        debugfs_remove_recursive(kvm_debugfs_dir);
2613out:
2614        return r;
2615}
2616
2617static void kvm_exit_debug(void)
2618{
2619        struct kvm_stats_debugfs_item *p;
2620
2621        for (p = debugfs_entries; p->name; ++p)
2622                debugfs_remove(p->dentry);
2623        debugfs_remove(kvm_debugfs_dir);
2624}
2625
2626static int kvm_suspend(void)
2627{
2628        if (kvm_usage_count)
2629                hardware_disable_nolock(NULL);
2630        return 0;
2631}
2632
2633static void kvm_resume(void)
2634{
2635        if (kvm_usage_count) {
2636                WARN_ON(raw_spin_is_locked(&kvm_lock));
2637                hardware_enable_nolock(NULL);
2638        }
2639}
2640
2641static struct syscore_ops kvm_syscore_ops = {
2642        .suspend = kvm_suspend,
2643        .resume = kvm_resume,
2644};
2645
2646struct page *bad_page;
2647pfn_t bad_pfn;
2648
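/*
 * Preempt notifiers: when a task that has a vcpu loaded is scheduled out or
 * back in, save or restore the vcpu's hardware state via kvm_arch_vcpu_put()
 * and kvm_arch_vcpu_load().
 */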
2649static inline
2650struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2651{
2652        return container_of(pn, struct kvm_vcpu, preempt_notifier);
2653}
2654
2655static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2656{
2657        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2658
2659        kvm_arch_vcpu_load(vcpu, cpu);
2660}
2661
2662static void kvm_sched_out(struct preempt_notifier *pn,
2663                          struct task_struct *next)
2664{
2665        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2666
2667        kvm_arch_vcpu_put(vcpu);
2668}
2669
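/*
 * Module-wide initialization, called by the arch module (e.g. kvm-intel or
 * kvm-amd on x86): arch init, special pages, hardware setup and compatibility
 * checks on every online CPU, cpu/reboot notifiers, the vcpu slab cache,
 * async page fault support, the /dev/kvm misc device, syscore ops, preempt
 * notifier hooks and debugfs.  Errors unwind in reverse order.
 */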
2670int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2671                  struct module *module)
2672{
2673        int r;
2674        int cpu;
2675
2676        r = kvm_arch_init(opaque);
2677        if (r)
2678                goto out_fail;
2679
2680        bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2681
2682        if (bad_page == NULL) {
2683                r = -ENOMEM;
2684                goto out;
2685        }
2686
2687        bad_pfn = page_to_pfn(bad_page);
2688
2689        hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2690
2691        if (hwpoison_page == NULL) {
2692                r = -ENOMEM;
2693                goto out_free_0;
2694        }
2695
2696        hwpoison_pfn = page_to_pfn(hwpoison_page);
2697
2698        fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2699
2700        if (fault_page == NULL) {
2701                r = -ENOMEM;
2702                goto out_free_0;
2703        }
2704
2705        fault_pfn = page_to_pfn(fault_page);
2706
2707        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2708                r = -ENOMEM;
2709                goto out_free_0;
2710        }
2711
2712        r = kvm_arch_hardware_setup();
2713        if (r < 0)
2714                goto out_free_0a;
2715
2716        for_each_online_cpu(cpu) {
2717                smp_call_function_single(cpu,
2718                                kvm_arch_check_processor_compat,
2719                                &r, 1);
2720                if (r < 0)
2721                        goto out_free_1;
2722        }
2723
2724        r = register_cpu_notifier(&kvm_cpu_notifier);
2725        if (r)
2726                goto out_free_2;
2727        register_reboot_notifier(&kvm_reboot_notifier);
2728
2729        /* A kmem cache lets us meet the alignment requirements of fx_save. */
2730        if (!vcpu_align)
2731                vcpu_align = __alignof__(struct kvm_vcpu);
2732        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
2733                                           0, NULL);
2734        if (!kvm_vcpu_cache) {
2735                r = -ENOMEM;
2736                goto out_free_3;
2737        }
2738
2739        r = kvm_async_pf_init();
2740        if (r)
2741                goto out_free;
2742
2743        kvm_chardev_ops.owner = module;
2744        kvm_vm_fops.owner = module;
2745        kvm_vcpu_fops.owner = module;
2746
2747        r = misc_register(&kvm_dev);
2748        if (r) {
2749                printk(KERN_ERR "kvm: misc device register failed\n");
2750                goto out_unreg;
2751        }
2752
2753        register_syscore_ops(&kvm_syscore_ops);
2754
2755        kvm_preempt_ops.sched_in = kvm_sched_in;
2756        kvm_preempt_ops.sched_out = kvm_sched_out;
2757
2758        r = kvm_init_debug();
2759        if (r) {
2760                printk(KERN_ERR "kvm: create debugfs files failed\n");
2761                goto out_undebugfs;
2762        }
2763
2764        return 0;
2765
2766out_undebugfs:
2767        unregister_syscore_ops(&kvm_syscore_ops);
2768out_unreg:
2769        kvm_async_pf_deinit();
2770out_free:
2771        kmem_cache_destroy(kvm_vcpu_cache);
2772out_free_3:
2773        unregister_reboot_notifier(&kvm_reboot_notifier);
2774        unregister_cpu_notifier(&kvm_cpu_notifier);
2775out_free_2:
2776out_free_1:
2777        kvm_arch_hardware_unsetup();
2778out_free_0a:
2779        free_cpumask_var(cpus_hardware_enabled);
2780out_free_0:
2781        if (fault_page)
2782                __free_page(fault_page);
2783        if (hwpoison_page)
2784                __free_page(hwpoison_page);
2785        __free_page(bad_page);
2786out:
2787        kvm_arch_exit();
2788out_fail:
2789        return r;
2790}
2791EXPORT_SYMBOL_GPL(kvm_init);
2792
2793void kvm_exit(void)
2794{
2795        kvm_exit_debug();
2796        misc_deregister(&kvm_dev);
2797        kmem_cache_destroy(kvm_vcpu_cache);
2798        kvm_async_pf_deinit();
2799        unregister_syscore_ops(&kvm_syscore_ops);
2800        unregister_reboot_notifier(&kvm_reboot_notifier);
2801        unregister_cpu_notifier(&kvm_cpu_notifier);
2802        on_each_cpu(hardware_disable_nolock, NULL, 1);
2803        kvm_arch_hardware_unsetup();
2804        kvm_arch_exit();
2805        free_cpumask_var(cpus_hardware_enabled);
2806        __free_page(hwpoison_page);
2807        __free_page(bad_page);
2808}
2809EXPORT_SYMBOL_GPL(kvm_exit);
2810