linux/virt/kvm/kvm_main.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   9 *
  10 * Authors:
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *   Yaniv Kamay  <yaniv@qumranet.com>
  13 *
  14 * This work is licensed under the terms of the GNU GPL, version 2.  See
  15 * the COPYING file in the top-level directory.
  16 *
  17 */
  18
  19#include "iodev.h"
  20
  21#include <linux/kvm_host.h>
  22#include <linux/kvm.h>
  23#include <linux/module.h>
  24#include <linux/errno.h>
  25#include <linux/percpu.h>
  26#include <linux/mm.h>
  27#include <linux/miscdevice.h>
  28#include <linux/vmalloc.h>
  29#include <linux/reboot.h>
  30#include <linux/debugfs.h>
  31#include <linux/highmem.h>
  32#include <linux/file.h>
  33#include <linux/syscore_ops.h>
  34#include <linux/cpu.h>
  35#include <linux/sched.h>
  36#include <linux/cpumask.h>
  37#include <linux/smp.h>
  38#include <linux/anon_inodes.h>
  39#include <linux/profile.h>
  40#include <linux/kvm_para.h>
  41#include <linux/pagemap.h>
  42#include <linux/mman.h>
  43#include <linux/swap.h>
  44#include <linux/bitops.h>
  45#include <linux/spinlock.h>
  46#include <linux/compat.h>
  47#include <linux/srcu.h>
  48#include <linux/hugetlb.h>
  49#include <linux/slab.h>
  50#include <linux/sort.h>
  51#include <linux/bsearch.h>
  52
  53#include <asm/processor.h>
  54#include <asm/io.h>
  55#include <asm/uaccess.h>
  56#include <asm/pgtable.h>
  57
  58#include "coalesced_mmio.h"
  59#include "async_pf.h"
  60
  61#define CREATE_TRACE_POINTS
  62#include <trace/events/kvm.h>
  63
  64MODULE_AUTHOR("Qumranet");
  65MODULE_LICENSE("GPL");
  66
  67/*
  68 * Ordering of locks:
  69 *
  70 *              kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  71 */
  72
  73DEFINE_RAW_SPINLOCK(kvm_lock);
  74LIST_HEAD(vm_list);
  75
  76static cpumask_var_t cpus_hardware_enabled;
  77static int kvm_usage_count = 0;
  78static atomic_t hardware_enable_failed;
  79
  80struct kmem_cache *kvm_vcpu_cache;
  81EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
  82
  83static __read_mostly struct preempt_ops kvm_preempt_ops;
  84
  85struct dentry *kvm_debugfs_dir;
  86
  87static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
  88                           unsigned long arg);
  89#ifdef CONFIG_COMPAT
  90static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
  91                                  unsigned long arg);
  92#endif
  93static int hardware_enable_all(void);
  94static void hardware_disable_all(void);
  95
  96static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
  97
  98bool kvm_rebooting;
  99EXPORT_SYMBOL_GPL(kvm_rebooting);
 100
 101static bool largepages_enabled = true;
 102
 103static struct page *hwpoison_page;
 104static pfn_t hwpoison_pfn;
 105
 106struct page *fault_page;
 107pfn_t fault_pfn;
 108
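/*
 * Descriptive note: returns true when @pfn is not backed by a normal,
 * reference-counted struct page (the pfn is invalid or the page is
 * reserved), so callers must not take or drop page references on it.
 */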
 109inline int kvm_is_mmio_pfn(pfn_t pfn)
 110{
 111        if (pfn_valid(pfn)) {
 112                int reserved;
 113                struct page *tail = pfn_to_page(pfn);
 114                struct page *head = compound_trans_head(tail);
 115                reserved = PageReserved(head);
 116                if (head != tail) {
 117                        /*
 118                         * "head" is not a dangling pointer
 119                         * (compound_trans_head takes care of that)
  120                         * but the hugepage may have been split
  121                         * from under us (and we may not hold a
  122                         * reference count on the head page, so it can
  123                         * be reused before we run PageReserved), so
  124                         * we have to check PageTail before returning
 125                         * what we just read.
 126                         */
 127                        smp_rmb();
 128                        if (PageTail(tail))
 129                                return reserved;
 130                }
 131                return PageReserved(tail);
 132        }
 133
 134        return true;
 135}
 136
 137/*
  138 * Switches to the specified vcpu, until a matching vcpu_put().
 139 */
 140void vcpu_load(struct kvm_vcpu *vcpu)
 141{
 142        int cpu;
 143
 144        mutex_lock(&vcpu->mutex);
 145        if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
 146                /* The thread running this VCPU changed. */
 147                struct pid *oldpid = vcpu->pid;
 148                struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
 149                rcu_assign_pointer(vcpu->pid, newpid);
 150                synchronize_rcu();
 151                put_pid(oldpid);
 152        }
 153        cpu = get_cpu();
 154        preempt_notifier_register(&vcpu->preempt_notifier);
 155        kvm_arch_vcpu_load(vcpu, cpu);
 156        put_cpu();
 157}
 158
 159void vcpu_put(struct kvm_vcpu *vcpu)
 160{
 161        preempt_disable();
 162        kvm_arch_vcpu_put(vcpu);
 163        preempt_notifier_unregister(&vcpu->preempt_notifier);
 164        preempt_enable();
 165        mutex_unlock(&vcpu->mutex);
 166}
 167
 168static void ack_flush(void *_completed)
 169{
 170}
 171
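/*
 * Descriptive note: set @req on every vcpu of @kvm and IPI the physical
 * CPUs currently running a vcpu in guest mode, so the request is noticed
 * before the next guest entry.  Returns true if at least one CPU was
 * signalled.
 */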
 172static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 173{
 174        int i, cpu, me;
 175        cpumask_var_t cpus;
 176        bool called = true;
 177        struct kvm_vcpu *vcpu;
 178
 179        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 180
 181        me = get_cpu();
 182        kvm_for_each_vcpu(i, vcpu, kvm) {
 183                kvm_make_request(req, vcpu);
 184                cpu = vcpu->cpu;
 185
 186                /* Set ->requests bit before we read ->mode */
 187                smp_mb();
 188
 189                if (cpus != NULL && cpu != -1 && cpu != me &&
 190                      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
 191                        cpumask_set_cpu(cpu, cpus);
 192        }
 193        if (unlikely(cpus == NULL))
 194                smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
 195        else if (!cpumask_empty(cpus))
 196                smp_call_function_many(cpus, ack_flush, NULL, 1);
 197        else
 198                called = false;
 199        put_cpu();
 200        free_cpumask_var(cpus);
 201        return called;
 202}
 203
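/*
 * Descriptive note: request a TLB flush from every vcpu, then reset
 * kvm->tlbs_dirty, but only if no further sptes were dirtied while the
 * request was being delivered.
 */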
 204void kvm_flush_remote_tlbs(struct kvm *kvm)
 205{
 206        long dirty_count = kvm->tlbs_dirty;
 207
 208        smp_mb();
 209        if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 210                ++kvm->stat.remote_tlb_flush;
 211        cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 212}
 213
 214void kvm_reload_remote_mmus(struct kvm *kvm)
 215{
 216        make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 217}
 218
 219int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 220{
 221        struct page *page;
 222        int r;
 223
 224        mutex_init(&vcpu->mutex);
 225        vcpu->cpu = -1;
 226        vcpu->kvm = kvm;
 227        vcpu->vcpu_id = id;
 228        vcpu->pid = NULL;
 229        init_waitqueue_head(&vcpu->wq);
 230        kvm_async_pf_vcpu_init(vcpu);
 231
 232        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 233        if (!page) {
 234                r = -ENOMEM;
 235                goto fail;
 236        }
 237        vcpu->run = page_address(page);
 238
 239        r = kvm_arch_vcpu_init(vcpu);
 240        if (r < 0)
 241                goto fail_free_run;
 242        return 0;
 243
 244fail_free_run:
 245        free_page((unsigned long)vcpu->run);
 246fail:
 247        return r;
 248}
 249EXPORT_SYMBOL_GPL(kvm_vcpu_init);
 250
 251void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 252{
 253        put_pid(vcpu->pid);
 254        kvm_arch_vcpu_uninit(vcpu);
 255        free_page((unsigned long)vcpu->run);
 256}
 257EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 258
 259#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 260static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 261{
 262        return container_of(mn, struct kvm, mmu_notifier);
 263}
 264
 265static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 266                                             struct mm_struct *mm,
 267                                             unsigned long address)
 268{
 269        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 270        int need_tlb_flush, idx;
 271
 272        /*
 273         * When ->invalidate_page runs, the linux pte has been zapped
 274         * already but the page is still allocated until
 275         * ->invalidate_page returns. So if we increase the sequence
 276         * here the kvm page fault will notice if the spte can't be
 277         * established because the page is going to be freed. If
 278         * instead the kvm page fault establishes the spte before
 279         * ->invalidate_page runs, kvm_unmap_hva will release it
 280         * before returning.
 281         *
  282         * The sequence increase only needs to be seen at spin_unlock
 283         * time, and not at spin_lock time.
 284         *
 285         * Increasing the sequence after the spin_unlock would be
 286         * unsafe because the kvm page fault could then establish the
 287         * pte after kvm_unmap_hva returned, without noticing the page
 288         * is going to be freed.
 289         */
 290        idx = srcu_read_lock(&kvm->srcu);
 291        spin_lock(&kvm->mmu_lock);
 292
 293        kvm->mmu_notifier_seq++;
 294        need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
  295        /* we have to flush the TLB before the pages can be freed */
 296        if (need_tlb_flush)
 297                kvm_flush_remote_tlbs(kvm);
 298
 299        spin_unlock(&kvm->mmu_lock);
 300        srcu_read_unlock(&kvm->srcu, idx);
 301}
 302
 303static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 304                                        struct mm_struct *mm,
 305                                        unsigned long address,
 306                                        pte_t pte)
 307{
 308        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 309        int idx;
 310
 311        idx = srcu_read_lock(&kvm->srcu);
 312        spin_lock(&kvm->mmu_lock);
 313        kvm->mmu_notifier_seq++;
 314        kvm_set_spte_hva(kvm, address, pte);
 315        spin_unlock(&kvm->mmu_lock);
 316        srcu_read_unlock(&kvm->srcu, idx);
 317}
 318
 319static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 320                                                    struct mm_struct *mm,
 321                                                    unsigned long start,
 322                                                    unsigned long end)
 323{
 324        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 325        int need_tlb_flush = 0, idx;
 326
 327        idx = srcu_read_lock(&kvm->srcu);
 328        spin_lock(&kvm->mmu_lock);
 329        /*
  330         * The count increase must become visible at unlock time, as no
  331         * spte can be established without taking the mmu_lock, and the
  332         * count is also read inside the mmu_lock critical section.
 333         */
 334        kvm->mmu_notifier_count++;
 335        for (; start < end; start += PAGE_SIZE)
 336                need_tlb_flush |= kvm_unmap_hva(kvm, start);
 337        need_tlb_flush |= kvm->tlbs_dirty;
  338        /* we have to flush the TLB before the pages can be freed */
 339        if (need_tlb_flush)
 340                kvm_flush_remote_tlbs(kvm);
 341
 342        spin_unlock(&kvm->mmu_lock);
 343        srcu_read_unlock(&kvm->srcu, idx);
 344}
 345
 346static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 347                                                  struct mm_struct *mm,
 348                                                  unsigned long start,
 349                                                  unsigned long end)
 350{
 351        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 352
 353        spin_lock(&kvm->mmu_lock);
 354        /*
 355         * This sequence increase will notify the kvm page fault that
 356         * the page that is going to be mapped in the spte could have
 357         * been freed.
 358         */
 359        kvm->mmu_notifier_seq++;
 360        smp_wmb();
 361        /*
 362         * The above sequence increase must be visible before the
 363         * below count decrease, which is ensured by the smp_wmb above
 364         * in conjunction with the smp_rmb in mmu_notifier_retry().
 365         */
 366        kvm->mmu_notifier_count--;
 367        spin_unlock(&kvm->mmu_lock);
 368
 369        BUG_ON(kvm->mmu_notifier_count < 0);
 370}
 371
 372static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 373                                              struct mm_struct *mm,
 374                                              unsigned long address)
 375{
 376        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 377        int young, idx;
 378
 379        idx = srcu_read_lock(&kvm->srcu);
 380        spin_lock(&kvm->mmu_lock);
 381
 382        young = kvm_age_hva(kvm, address);
 383        if (young)
 384                kvm_flush_remote_tlbs(kvm);
 385
 386        spin_unlock(&kvm->mmu_lock);
 387        srcu_read_unlock(&kvm->srcu, idx);
 388
 389        return young;
 390}
 391
 392static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 393                                       struct mm_struct *mm,
 394                                       unsigned long address)
 395{
 396        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 397        int young, idx;
 398
 399        idx = srcu_read_lock(&kvm->srcu);
 400        spin_lock(&kvm->mmu_lock);
 401        young = kvm_test_age_hva(kvm, address);
 402        spin_unlock(&kvm->mmu_lock);
 403        srcu_read_unlock(&kvm->srcu, idx);
 404
 405        return young;
 406}
 407
 408static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 409                                     struct mm_struct *mm)
 410{
 411        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 412        int idx;
 413
 414        idx = srcu_read_lock(&kvm->srcu);
 415        kvm_arch_flush_shadow(kvm);
 416        srcu_read_unlock(&kvm->srcu, idx);
 417}
 418
 419static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 420        .invalidate_page        = kvm_mmu_notifier_invalidate_page,
 421        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 422        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 423        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 424        .test_young             = kvm_mmu_notifier_test_young,
 425        .change_pte             = kvm_mmu_notifier_change_pte,
 426        .release                = kvm_mmu_notifier_release,
 427};
 428
 429static int kvm_init_mmu_notifier(struct kvm *kvm)
 430{
 431        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 432        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 433}
 434
 435#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 436
 437static int kvm_init_mmu_notifier(struct kvm *kvm)
 438{
 439        return 0;
 440}
 441
 442#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 443
 444static void kvm_init_memslots_id(struct kvm *kvm)
 445{
 446        int i;
 447        struct kvm_memslots *slots = kvm->memslots;
 448
 449        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
 450                slots->id_to_index[i] = slots->memslots[i].id = i;
 451}
 452
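/*
 * Descriptive note: create and initialize a new VM: allocate the
 * arch-specific kvm structure, enable hardware virtualization if this is
 * the first VM, set up the memslots, the I/O buses and the MMU notifier,
 * and link the VM into the global vm_list.
 */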
 453static struct kvm *kvm_create_vm(unsigned long type)
 454{
 455        int r, i;
 456        struct kvm *kvm = kvm_arch_alloc_vm();
 457
 458        if (!kvm)
 459                return ERR_PTR(-ENOMEM);
 460
 461        r = kvm_arch_init_vm(kvm, type);
 462        if (r)
 463                goto out_err_nodisable;
 464
 465        r = hardware_enable_all();
 466        if (r)
 467                goto out_err_nodisable;
 468
 469#ifdef CONFIG_HAVE_KVM_IRQCHIP
 470        INIT_HLIST_HEAD(&kvm->mask_notifier_list);
 471        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 472#endif
 473
 474        r = -ENOMEM;
 475        kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 476        if (!kvm->memslots)
 477                goto out_err_nosrcu;
 478        kvm_init_memslots_id(kvm);
 479        if (init_srcu_struct(&kvm->srcu))
 480                goto out_err_nosrcu;
 481        for (i = 0; i < KVM_NR_BUSES; i++) {
 482                kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 483                                        GFP_KERNEL);
 484                if (!kvm->buses[i])
 485                        goto out_err;
 486        }
 487
 488        spin_lock_init(&kvm->mmu_lock);
 489        kvm->mm = current->mm;
 490        atomic_inc(&kvm->mm->mm_count);
 491        kvm_eventfd_init(kvm);
 492        mutex_init(&kvm->lock);
 493        mutex_init(&kvm->irq_lock);
 494        mutex_init(&kvm->slots_lock);
 495        atomic_set(&kvm->users_count, 1);
 496
 497        r = kvm_init_mmu_notifier(kvm);
 498        if (r)
 499                goto out_err;
 500
 501        raw_spin_lock(&kvm_lock);
 502        list_add(&kvm->vm_list, &vm_list);
 503        raw_spin_unlock(&kvm_lock);
 504
 505        return kvm;
 506
 507out_err:
 508        cleanup_srcu_struct(&kvm->srcu);
 509out_err_nosrcu:
 510        hardware_disable_all();
 511out_err_nodisable:
 512        for (i = 0; i < KVM_NR_BUSES; i++)
 513                kfree(kvm->buses[i]);
 514        kfree(kvm->memslots);
 515        kvm_arch_free_vm(kvm);
 516        return ERR_PTR(r);
 517}
 518
 519/*
 520 * Avoid using vmalloc for a small buffer.
 521 * Should not be used when the size is statically known.
 522 */
 523void *kvm_kvzalloc(unsigned long size)
 524{
 525        if (size > PAGE_SIZE)
 526                return vzalloc(size);
 527        else
 528                return kzalloc(size, GFP_KERNEL);
 529}
 530
 531void kvm_kvfree(const void *addr)
 532{
 533        if (is_vmalloc_addr(addr))
 534                vfree(addr);
 535        else
 536                kfree(addr);
 537}
 538
 539static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 540{
 541        if (!memslot->dirty_bitmap)
 542                return;
 543
 544        kvm_kvfree(memslot->dirty_bitmap);
 545        memslot->dirty_bitmap = NULL;
 546}
 547
 548/*
 549 * Free any memory in @free but not in @dont.
 550 */
 551static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 552                                  struct kvm_memory_slot *dont)
 553{
 554        if (!dont || free->rmap != dont->rmap)
 555                vfree(free->rmap);
 556
 557        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 558                kvm_destroy_dirty_bitmap(free);
 559
 560        kvm_arch_free_memslot(free, dont);
 561
 562        free->npages = 0;
 563        free->rmap = NULL;
 564}
 565
 566void kvm_free_physmem(struct kvm *kvm)
 567{
 568        struct kvm_memslots *slots = kvm->memslots;
 569        struct kvm_memory_slot *memslot;
 570
 571        kvm_for_each_memslot(memslot, slots)
 572                kvm_free_physmem_slot(memslot, NULL);
 573
 574        kfree(kvm->memslots);
 575}
 576
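/*
 * Descriptive note: tear the VM down once its last reference is gone:
 * unlink it from vm_list, free the irq routing and I/O buses, unregister
 * the MMU notifier (or flush the shadow MMU), release guest memory and
 * finally drop the mm reference taken in kvm_create_vm().
 */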
 577static void kvm_destroy_vm(struct kvm *kvm)
 578{
 579        int i;
 580        struct mm_struct *mm = kvm->mm;
 581
 582        kvm_arch_sync_events(kvm);
 583        raw_spin_lock(&kvm_lock);
 584        list_del(&kvm->vm_list);
 585        raw_spin_unlock(&kvm_lock);
 586        kvm_free_irq_routing(kvm);
 587        for (i = 0; i < KVM_NR_BUSES; i++)
 588                kvm_io_bus_destroy(kvm->buses[i]);
 589        kvm_coalesced_mmio_free(kvm);
 590#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 591        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 592#else
 593        kvm_arch_flush_shadow(kvm);
 594#endif
 595        kvm_arch_destroy_vm(kvm);
 596        kvm_free_physmem(kvm);
 597        cleanup_srcu_struct(&kvm->srcu);
 598        kvm_arch_free_vm(kvm);
 599        hardware_disable_all();
 600        mmdrop(mm);
 601}
 602
 603void kvm_get_kvm(struct kvm *kvm)
 604{
 605        atomic_inc(&kvm->users_count);
 606}
 607EXPORT_SYMBOL_GPL(kvm_get_kvm);
 608
 609void kvm_put_kvm(struct kvm *kvm)
 610{
 611        if (atomic_dec_and_test(&kvm->users_count))
 612                kvm_destroy_vm(kvm);
 613}
 614EXPORT_SYMBOL_GPL(kvm_put_kvm);
 615
 616
 617static int kvm_vm_release(struct inode *inode, struct file *filp)
 618{
 619        struct kvm *kvm = filp->private_data;
 620
 621        kvm_irqfd_release(kvm);
 622
 623        kvm_put_kvm(kvm);
 624        return 0;
 625}
 626
 627/*
 628 * Allocation size is twice as large as the actual dirty bitmap size.
  629 * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
 630 */
 631static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 632{
 633#ifndef CONFIG_S390
 634        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 635
 636        memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
 637        if (!memslot->dirty_bitmap)
 638                return -ENOMEM;
 639
 640#endif /* !CONFIG_S390 */
 641        return 0;
 642}
 643
 644static int cmp_memslot(const void *slot1, const void *slot2)
 645{
 646        struct kvm_memory_slot *s1, *s2;
 647
 648        s1 = (struct kvm_memory_slot *)slot1;
 649        s2 = (struct kvm_memory_slot *)slot2;
 650
 651        if (s1->npages < s2->npages)
 652                return 1;
 653        if (s1->npages > s2->npages)
 654                return -1;
 655
 656        return 0;
 657}
 658
 659/*
  660 * Sort the memslots by size, largest first, so that the larger
  661 * slots are found earlier by the linear slot lookup.
 662 */
 663static void sort_memslots(struct kvm_memslots *slots)
 664{
 665        int i;
 666
 667        sort(slots->memslots, KVM_MEM_SLOTS_NUM,
 668              sizeof(struct kvm_memory_slot), cmp_memslot, NULL);
 669
 670        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
 671                slots->id_to_index[slots->memslots[i].id] = i;
 672}
 673
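/*
 * Descriptive note: install @new into the slot array (re-sorting if its
 * size changed) and bump the generation counter so that cached gfn->hva
 * translations get revalidated.
 */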
 674void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
 675{
 676        if (new) {
 677                int id = new->id;
 678                struct kvm_memory_slot *old = id_to_memslot(slots, id);
 679                unsigned long npages = old->npages;
 680
 681                *old = *new;
 682                if (new->npages != npages)
 683                        sort_memslots(slots);
 684        }
 685
 686        slots->generation++;
 687}
 688
 689/*
 690 * Allocate some memory and give it an address in the guest physical address
 691 * space.
 692 *
 693 * Discontiguous memory is allowed, mostly for framebuffers.
 694 *
  695 * Must be called holding kvm->slots_lock for write.
 696 */
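/*
 * From userspace this path is normally reached through the
 * KVM_SET_USER_MEMORY_REGION ioctl, roughly as follows (illustrative
 * only; vm_fd, hva and size are placeholders):
 *
 *	struct kvm_userspace_memory_region mem = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = size,
 *		.userspace_addr  = (unsigned long)hva,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &mem);
 */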
 697int __kvm_set_memory_region(struct kvm *kvm,
 698                            struct kvm_userspace_memory_region *mem,
 699                            int user_alloc)
 700{
 701        int r;
 702        gfn_t base_gfn;
 703        unsigned long npages;
 704        unsigned long i;
 705        struct kvm_memory_slot *memslot;
 706        struct kvm_memory_slot old, new;
 707        struct kvm_memslots *slots, *old_memslots;
 708
 709        r = -EINVAL;
 710        /* General sanity checks */
 711        if (mem->memory_size & (PAGE_SIZE - 1))
 712                goto out;
 713        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 714                goto out;
 715        /* We can read the guest memory with __xxx_user() later on. */
 716        if (user_alloc &&
 717            ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
 718             !access_ok(VERIFY_WRITE,
 719                        (void __user *)(unsigned long)mem->userspace_addr,
 720                        mem->memory_size)))
 721                goto out;
 722        if (mem->slot >= KVM_MEM_SLOTS_NUM)
 723                goto out;
 724        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
 725                goto out;
 726
 727        memslot = id_to_memslot(kvm->memslots, mem->slot);
 728        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
 729        npages = mem->memory_size >> PAGE_SHIFT;
 730
 731        r = -EINVAL;
 732        if (npages > KVM_MEM_MAX_NR_PAGES)
 733                goto out;
 734
 735        if (!npages)
 736                mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
 737
 738        new = old = *memslot;
 739
 740        new.id = mem->slot;
 741        new.base_gfn = base_gfn;
 742        new.npages = npages;
 743        new.flags = mem->flags;
 744
 745        /* Disallow changing a memory slot's size. */
 746        r = -EINVAL;
 747        if (npages && old.npages && npages != old.npages)
 748                goto out_free;
 749
 750        /* Check for overlaps */
 751        r = -EEXIST;
 752        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 753                struct kvm_memory_slot *s = &kvm->memslots->memslots[i];
 754
 755                if (s == memslot || !s->npages)
 756                        continue;
 757                if (!((base_gfn + npages <= s->base_gfn) ||
 758                      (base_gfn >= s->base_gfn + s->npages)))
 759                        goto out_free;
 760        }
 761
 762        /* Free page dirty bitmap if unneeded */
 763        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
 764                new.dirty_bitmap = NULL;
 765
 766        r = -ENOMEM;
 767
 768        /* Allocate if a slot is being created */
 769        if (npages && !old.npages) {
 770                new.user_alloc = user_alloc;
 771                new.userspace_addr = mem->userspace_addr;
 772#ifndef CONFIG_S390
 773                new.rmap = vzalloc(npages * sizeof(*new.rmap));
 774                if (!new.rmap)
 775                        goto out_free;
 776#endif /* not defined CONFIG_S390 */
 777                if (kvm_arch_create_memslot(&new, npages))
 778                        goto out_free;
 779        }
 780
 781        /* Allocate page dirty bitmap if needed */
 782        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
 783                if (kvm_create_dirty_bitmap(&new) < 0)
 784                        goto out_free;
 785                /* destroy any largepage mappings for dirty tracking */
 786        }
 787
 788        if (!npages) {
 789                struct kvm_memory_slot *slot;
 790
 791                r = -ENOMEM;
 792                slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
 793                                GFP_KERNEL);
 794                if (!slots)
 795                        goto out_free;
 796                slot = id_to_memslot(slots, mem->slot);
 797                slot->flags |= KVM_MEMSLOT_INVALID;
 798
 799                update_memslots(slots, NULL);
 800
 801                old_memslots = kvm->memslots;
 802                rcu_assign_pointer(kvm->memslots, slots);
 803                synchronize_srcu_expedited(&kvm->srcu);
 804                /* From this point no new shadow pages pointing to a deleted
 805                 * memslot will be created.
 806                 *
 807                 * validation of sp->gfn happens in:
 808                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
 809                 *      - kvm_is_visible_gfn (mmu_check_roots)
 810                 */
 811                kvm_arch_flush_shadow(kvm);
 812                kfree(old_memslots);
 813        }
 814
 815        r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
 816        if (r)
 817                goto out_free;
 818
 819        /* map/unmap the pages in iommu page table */
 820        if (npages) {
 821                r = kvm_iommu_map_pages(kvm, &new);
 822                if (r)
 823                        goto out_free;
 824        } else
 825                kvm_iommu_unmap_pages(kvm, &old);
 826
 827        r = -ENOMEM;
 828        slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
 829                        GFP_KERNEL);
 830        if (!slots)
 831                goto out_free;
 832
 833        /* actual memory is freed via old in kvm_free_physmem_slot below */
 834        if (!npages) {
 835                new.rmap = NULL;
 836                new.dirty_bitmap = NULL;
 837                memset(&new.arch, 0, sizeof(new.arch));
 838        }
 839
 840        update_memslots(slots, &new);
 841        old_memslots = kvm->memslots;
 842        rcu_assign_pointer(kvm->memslots, slots);
 843        synchronize_srcu_expedited(&kvm->srcu);
 844
 845        kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 846
 847        /*
 848         * If the new memory slot is created, we need to clear all
 849         * mmio sptes.
 850         */
 851        if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
 852                kvm_arch_flush_shadow(kvm);
 853
 854        kvm_free_physmem_slot(&old, &new);
 855        kfree(old_memslots);
 856
 857        return 0;
 858
 859out_free:
 860        kvm_free_physmem_slot(&new, &old);
 861out:
 862        return r;
 863
 864}
 865EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 866
 867int kvm_set_memory_region(struct kvm *kvm,
 868                          struct kvm_userspace_memory_region *mem,
 869                          int user_alloc)
 870{
 871        int r;
 872
 873        mutex_lock(&kvm->slots_lock);
 874        r = __kvm_set_memory_region(kvm, mem, user_alloc);
 875        mutex_unlock(&kvm->slots_lock);
 876        return r;
 877}
 878EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 879
 880int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 881                                   struct
 882                                   kvm_userspace_memory_region *mem,
 883                                   int user_alloc)
 884{
 885        if (mem->slot >= KVM_MEMORY_SLOTS)
 886                return -EINVAL;
 887        return kvm_set_memory_region(kvm, mem, user_alloc);
 888}
 889
 890int kvm_get_dirty_log(struct kvm *kvm,
 891                        struct kvm_dirty_log *log, int *is_dirty)
 892{
 893        struct kvm_memory_slot *memslot;
 894        int r, i;
 895        unsigned long n;
 896        unsigned long any = 0;
 897
 898        r = -EINVAL;
 899        if (log->slot >= KVM_MEMORY_SLOTS)
 900                goto out;
 901
 902        memslot = id_to_memslot(kvm->memslots, log->slot);
 903        r = -ENOENT;
 904        if (!memslot->dirty_bitmap)
 905                goto out;
 906
 907        n = kvm_dirty_bitmap_bytes(memslot);
 908
 909        for (i = 0; !any && i < n/sizeof(long); ++i)
 910                any = memslot->dirty_bitmap[i];
 911
 912        r = -EFAULT;
 913        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
 914                goto out;
 915
 916        if (any)
 917                *is_dirty = 1;
 918
 919        r = 0;
 920out:
 921        return r;
 922}
 923
 924bool kvm_largepages_enabled(void)
 925{
 926        return largepages_enabled;
 927}
 928
 929void kvm_disable_largepages(void)
 930{
 931        largepages_enabled = false;
 932}
 933EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 934
 935int is_error_page(struct page *page)
 936{
 937        return page == bad_page || page == hwpoison_page || page == fault_page;
 938}
 939EXPORT_SYMBOL_GPL(is_error_page);
 940
 941int is_error_pfn(pfn_t pfn)
 942{
 943        return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
 944}
 945EXPORT_SYMBOL_GPL(is_error_pfn);
 946
 947int is_hwpoison_pfn(pfn_t pfn)
 948{
 949        return pfn == hwpoison_pfn;
 950}
 951EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 952
 953int is_fault_pfn(pfn_t pfn)
 954{
 955        return pfn == fault_pfn;
 956}
 957EXPORT_SYMBOL_GPL(is_fault_pfn);
 958
 959int is_noslot_pfn(pfn_t pfn)
 960{
 961        return pfn == bad_pfn;
 962}
 963EXPORT_SYMBOL_GPL(is_noslot_pfn);
 964
 965int is_invalid_pfn(pfn_t pfn)
 966{
 967        return pfn == hwpoison_pfn || pfn == fault_pfn;
 968}
 969EXPORT_SYMBOL_GPL(is_invalid_pfn);
 970
 971static inline unsigned long bad_hva(void)
 972{
 973        return PAGE_OFFSET;
 974}
 975
 976int kvm_is_error_hva(unsigned long addr)
 977{
 978        return addr == bad_hva();
 979}
 980EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 981
 982struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 983{
 984        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
 985}
 986EXPORT_SYMBOL_GPL(gfn_to_memslot);
 987
 988int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 989{
 990        struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
 991
 992        if (!memslot || memslot->id >= KVM_MEMORY_SLOTS ||
 993              memslot->flags & KVM_MEMSLOT_INVALID)
 994                return 0;
 995
 996        return 1;
 997}
 998EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 999
1000unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
1001{
1002        struct vm_area_struct *vma;
1003        unsigned long addr, size;
1004
1005        size = PAGE_SIZE;
1006
1007        addr = gfn_to_hva(kvm, gfn);
1008        if (kvm_is_error_hva(addr))
1009                return PAGE_SIZE;
1010
1011        down_read(&current->mm->mmap_sem);
1012        vma = find_vma(current->mm, addr);
1013        if (!vma)
1014                goto out;
1015
1016        size = vma_kernel_pagesize(vma);
1017
1018out:
1019        up_read(&current->mm->mmap_sem);
1020
1021        return size;
1022}
1023
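/*
 * Descriptive note: return the host virtual address of @gfn within @slot
 * and, optionally, how many pages remain in the slot starting at @gfn;
 * returns bad_hva() for a missing or invalid slot.
 */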
1024static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1025                                     gfn_t *nr_pages)
1026{
1027        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1028                return bad_hva();
1029
1030        if (nr_pages)
1031                *nr_pages = slot->npages - (gfn - slot->base_gfn);
1032
1033        return gfn_to_hva_memslot(slot, gfn);
1034}
1035
1036unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1037{
1038        return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1039}
1040EXPORT_SYMBOL_GPL(gfn_to_hva);
1041
1042static pfn_t get_fault_pfn(void)
1043{
1044        get_page(fault_page);
1045        return fault_pfn;
1046}
1047
1048int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
1049        unsigned long start, int write, struct page **page)
1050{
1051        int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
1052
1053        if (write)
1054                flags |= FOLL_WRITE;
1055
1056        return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
1057}
1058
1059static inline int check_user_page_hwpoison(unsigned long addr)
1060{
1061        int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
1062
1063        rc = __get_user_pages(current, current->mm, addr, 1,
1064                              flags, NULL, NULL, NULL);
1065        return rc == -EHWPOISON;
1066}
1067
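/*
 * Descriptive note: translate a host virtual address into a pfn.  Pin the
 * page with the gup fast path where possible, fall back to a sleeping
 * get_user_pages() when allowed, and handle hwpoisoned pages and
 * VM_PFNMAP (MMIO) mappings that have no struct page.  A read fault is
 * mapped writable when the page already allows it, and *writable reports
 * whether the resulting translation may be treated as writable.
 */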
1068static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
1069                        bool *async, bool write_fault, bool *writable)
1070{
1071        struct page *page[1];
1072        int npages = 0;
1073        pfn_t pfn;
1074
1075        /* we can do it either atomically or asynchronously, not both */
1076        BUG_ON(atomic && async);
1077
1078        BUG_ON(!write_fault && !writable);
1079
1080        if (writable)
1081                *writable = true;
1082
1083        if (atomic || async)
1084                npages = __get_user_pages_fast(addr, 1, 1, page);
1085
1086        if (unlikely(npages != 1) && !atomic) {
1087                might_sleep();
1088
1089                if (writable)
1090                        *writable = write_fault;
1091
1092                if (async) {
1093                        down_read(&current->mm->mmap_sem);
1094                        npages = get_user_page_nowait(current, current->mm,
1095                                                     addr, write_fault, page);
1096                        up_read(&current->mm->mmap_sem);
1097                } else
1098                        npages = get_user_pages_fast(addr, 1, write_fault,
1099                                                     page);
1100
1101                /* map read fault as writable if possible */
1102                if (unlikely(!write_fault) && npages == 1) {
1103                        struct page *wpage[1];
1104
1105                        npages = __get_user_pages_fast(addr, 1, 1, wpage);
1106                        if (npages == 1) {
1107                                *writable = true;
1108                                put_page(page[0]);
1109                                page[0] = wpage[0];
1110                        }
1111                        npages = 1;
1112                }
1113        }
1114
1115        if (unlikely(npages != 1)) {
1116                struct vm_area_struct *vma;
1117
1118                if (atomic)
1119                        return get_fault_pfn();
1120
1121                down_read(&current->mm->mmap_sem);
1122                if (npages == -EHWPOISON ||
1123                        (!async && check_user_page_hwpoison(addr))) {
1124                        up_read(&current->mm->mmap_sem);
1125                        get_page(hwpoison_page);
1126                        return page_to_pfn(hwpoison_page);
1127                }
1128
1129                vma = find_vma_intersection(current->mm, addr, addr+1);
1130
1131                if (vma == NULL)
1132                        pfn = get_fault_pfn();
1133                else if ((vma->vm_flags & VM_PFNMAP)) {
1134                        pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
1135                                vma->vm_pgoff;
1136                        BUG_ON(!kvm_is_mmio_pfn(pfn));
1137                } else {
1138                        if (async && (vma->vm_flags & VM_WRITE))
1139                                *async = true;
1140                        pfn = get_fault_pfn();
1141                }
1142                up_read(&current->mm->mmap_sem);
1143        } else
1144                pfn = page_to_pfn(page[0]);
1145
1146        return pfn;
1147}
1148
1149pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
1150{
1151        return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
1152}
1153EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
1154
1155static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
1156                          bool write_fault, bool *writable)
1157{
1158        unsigned long addr;
1159
1160        if (async)
1161                *async = false;
1162
1163        addr = gfn_to_hva(kvm, gfn);
1164        if (kvm_is_error_hva(addr)) {
1165                get_page(bad_page);
1166                return page_to_pfn(bad_page);
1167        }
1168
1169        return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
1170}
1171
1172pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1173{
1174        return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
1175}
1176EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1177
1178pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
1179                       bool write_fault, bool *writable)
1180{
1181        return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
1182}
1183EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
1184
1185pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1186{
1187        return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
1188}
1189EXPORT_SYMBOL_GPL(gfn_to_pfn);
1190
1191pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1192                      bool *writable)
1193{
1194        return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
1195}
1196EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1197
1198pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
1199                         struct kvm_memory_slot *slot, gfn_t gfn)
1200{
1201        unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1202        return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
1203}
1204
1205int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1206                                                                  int nr_pages)
1207{
1208        unsigned long addr;
1209        gfn_t entry;
1210
1211        addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
1212        if (kvm_is_error_hva(addr))
1213                return -1;
1214
1215        if (entry < nr_pages)
1216                return 0;
1217
1218        return __get_user_pages_fast(addr, nr_pages, 1, pages);
1219}
1220EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1221
1222struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1223{
1224        pfn_t pfn;
1225
1226        pfn = gfn_to_pfn(kvm, gfn);
1227        if (!kvm_is_mmio_pfn(pfn))
1228                return pfn_to_page(pfn);
1229
1230        WARN_ON(kvm_is_mmio_pfn(pfn));
1231
1232        get_page(bad_page);
1233        return bad_page;
1234}
1235
1236EXPORT_SYMBOL_GPL(gfn_to_page);
1237
1238void kvm_release_page_clean(struct page *page)
1239{
1240        kvm_release_pfn_clean(page_to_pfn(page));
1241}
1242EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1243
1244void kvm_release_pfn_clean(pfn_t pfn)
1245{
1246        if (!kvm_is_mmio_pfn(pfn))
1247                put_page(pfn_to_page(pfn));
1248}
1249EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1250
1251void kvm_release_page_dirty(struct page *page)
1252{
1253        kvm_release_pfn_dirty(page_to_pfn(page));
1254}
1255EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1256
1257void kvm_release_pfn_dirty(pfn_t pfn)
1258{
1259        kvm_set_pfn_dirty(pfn);
1260        kvm_release_pfn_clean(pfn);
1261}
1262EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1263
1264void kvm_set_page_dirty(struct page *page)
1265{
1266        kvm_set_pfn_dirty(page_to_pfn(page));
1267}
1268EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
1269
1270void kvm_set_pfn_dirty(pfn_t pfn)
1271{
1272        if (!kvm_is_mmio_pfn(pfn)) {
1273                struct page *page = pfn_to_page(pfn);
1274                if (!PageReserved(page))
1275                        SetPageDirty(page);
1276        }
1277}
1278EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1279
1280void kvm_set_pfn_accessed(pfn_t pfn)
1281{
1282        if (!kvm_is_mmio_pfn(pfn))
1283                mark_page_accessed(pfn_to_page(pfn));
1284}
1285EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1286
1287void kvm_get_pfn(pfn_t pfn)
1288{
1289        if (!kvm_is_mmio_pfn(pfn))
1290                get_page(pfn_to_page(pfn));
1291}
1292EXPORT_SYMBOL_GPL(kvm_get_pfn);
1293
1294static int next_segment(unsigned long len, int offset)
1295{
1296        if (len > PAGE_SIZE - offset)
1297                return PAGE_SIZE - offset;
1298        else
1299                return len;
1300}
1301
1302int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1303                        int len)
1304{
1305        int r;
1306        unsigned long addr;
1307
1308        addr = gfn_to_hva(kvm, gfn);
1309        if (kvm_is_error_hva(addr))
1310                return -EFAULT;
1311        r = __copy_from_user(data, (void __user *)addr + offset, len);
1312        if (r)
1313                return -EFAULT;
1314        return 0;
1315}
1316EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1317
1318int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1319{
1320        gfn_t gfn = gpa >> PAGE_SHIFT;
1321        int seg;
1322        int offset = offset_in_page(gpa);
1323        int ret;
1324
1325        while ((seg = next_segment(len, offset)) != 0) {
1326                ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1327                if (ret < 0)
1328                        return ret;
1329                offset = 0;
1330                len -= seg;
1331                data += seg;
1332                ++gfn;
1333        }
1334        return 0;
1335}
1336EXPORT_SYMBOL_GPL(kvm_read_guest);
1337
1338int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
1339                          unsigned long len)
1340{
1341        int r;
1342        unsigned long addr;
1343        gfn_t gfn = gpa >> PAGE_SHIFT;
1344        int offset = offset_in_page(gpa);
1345
1346        addr = gfn_to_hva(kvm, gfn);
1347        if (kvm_is_error_hva(addr))
1348                return -EFAULT;
1349        pagefault_disable();
1350        r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
1351        pagefault_enable();
1352        if (r)
1353                return -EFAULT;
1354        return 0;
1355}
1356EXPORT_SYMBOL(kvm_read_guest_atomic);
1357
1358int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1359                         int offset, int len)
1360{
1361        int r;
1362        unsigned long addr;
1363
1364        addr = gfn_to_hva(kvm, gfn);
1365        if (kvm_is_error_hva(addr))
1366                return -EFAULT;
1367        r = __copy_to_user((void __user *)addr + offset, data, len);
1368        if (r)
1369                return -EFAULT;
1370        mark_page_dirty(kvm, gfn);
1371        return 0;
1372}
1373EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1374
1375int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1376                    unsigned long len)
1377{
1378        gfn_t gfn = gpa >> PAGE_SHIFT;
1379        int seg;
1380        int offset = offset_in_page(gpa);
1381        int ret;
1382
1383        while ((seg = next_segment(len, offset)) != 0) {
1384                ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1385                if (ret < 0)
1386                        return ret;
1387                offset = 0;
1388                len -= seg;
1389                data += seg;
1390                ++gfn;
1391        }
1392        return 0;
1393}
1394
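/*
 * Descriptive note: pre-compute the host virtual address for @gpa and
 * remember the memslot generation, so kvm_read_guest_cached() and
 * kvm_write_guest_cached() can skip the gfn->hva lookup until the
 * memslots change.
 *
 * Illustrative usage (gpa and val are placeholders):
 *
 *	struct gfn_to_hva_cache ghc;
 *
 *	kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa);
 *	kvm_write_guest_cached(kvm, &ghc, &val, sizeof(val));
 */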
1395int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1396                              gpa_t gpa)
1397{
1398        struct kvm_memslots *slots = kvm_memslots(kvm);
1399        int offset = offset_in_page(gpa);
1400        gfn_t gfn = gpa >> PAGE_SHIFT;
1401
1402        ghc->gpa = gpa;
1403        ghc->generation = slots->generation;
1404        ghc->memslot = gfn_to_memslot(kvm, gfn);
1405        ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
1406        if (!kvm_is_error_hva(ghc->hva))
1407                ghc->hva += offset;
1408        else
1409                return -EFAULT;
1410
1411        return 0;
1412}
1413EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1414
1415int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1416                           void *data, unsigned long len)
1417{
1418        struct kvm_memslots *slots = kvm_memslots(kvm);
1419        int r;
1420
1421        if (slots->generation != ghc->generation)
1422                kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1423
1424        if (kvm_is_error_hva(ghc->hva))
1425                return -EFAULT;
1426
1427        r = __copy_to_user((void __user *)ghc->hva, data, len);
1428        if (r)
1429                return -EFAULT;
1430        mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
1431
1432        return 0;
1433}
1434EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
1435
1436int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1437                           void *data, unsigned long len)
1438{
1439        struct kvm_memslots *slots = kvm_memslots(kvm);
1440        int r;
1441
1442        if (slots->generation != ghc->generation)
1443                kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);
1444
1445        if (kvm_is_error_hva(ghc->hva))
1446                return -EFAULT;
1447
1448        r = __copy_from_user(data, (void __user *)ghc->hva, len);
1449        if (r)
1450                return -EFAULT;
1451
1452        return 0;
1453}
1454EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
1455
1456int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1457{
1458        return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
1459                                    offset, len);
1460}
1461EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1462
1463int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1464{
1465        gfn_t gfn = gpa >> PAGE_SHIFT;
1466        int seg;
1467        int offset = offset_in_page(gpa);
1468        int ret;
1469
1470        while ((seg = next_segment(len, offset)) != 0) {
1471                ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1472                if (ret < 0)
1473                        return ret;
1474                offset = 0;
1475                len -= seg;
1476                ++gfn;
1477        }
1478        return 0;
1479}
1480EXPORT_SYMBOL_GPL(kvm_clear_guest);
1481
1482void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
1483                             gfn_t gfn)
1484{
1485        if (memslot && memslot->dirty_bitmap) {
1486                unsigned long rel_gfn = gfn - memslot->base_gfn;
1487
1488                /* TODO: introduce set_bit_le() and use it */
1489                test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap);
1490        }
1491}
1492
1493void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1494{
1495        struct kvm_memory_slot *memslot;
1496
1497        memslot = gfn_to_memslot(kvm, gfn);
1498        mark_page_dirty_in_slot(kvm, memslot, gfn);
1499}
1500
1501/*
1502 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1503 */
1504void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1505{
1506        DEFINE_WAIT(wait);
1507
1508        for (;;) {
1509                prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1510
1511                if (kvm_arch_vcpu_runnable(vcpu)) {
1512                        kvm_make_request(KVM_REQ_UNHALT, vcpu);
1513                        break;
1514                }
1515                if (kvm_cpu_has_pending_timer(vcpu))
1516                        break;
1517                if (signal_pending(current))
1518                        break;
1519
1520                schedule();
1521        }
1522
1523        finish_wait(&vcpu->wq, &wait);
1524}
1525
1526#ifndef CONFIG_S390
1527/*
1528 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
1529 */
1530void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
1531{
1532        int me;
1533        int cpu = vcpu->cpu;
1534        wait_queue_head_t *wqp;
1535
1536        wqp = kvm_arch_vcpu_wq(vcpu);
1537        if (waitqueue_active(wqp)) {
1538                wake_up_interruptible(wqp);
1539                ++vcpu->stat.halt_wakeup;
1540        }
1541
1542        me = get_cpu();
1543        if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
1544                if (kvm_arch_vcpu_should_kick(vcpu))
1545                        smp_send_reschedule(cpu);
1546        put_cpu();
1547}
1548#endif /* !CONFIG_S390 */
1549
1550void kvm_resched(struct kvm_vcpu *vcpu)
1551{
1552        if (!need_resched())
1553                return;
1554        cond_resched();
1555}
1556EXPORT_SYMBOL_GPL(kvm_resched);
1557
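/*
 * Descriptive note: do a directed yield to the task backing @target,
 * unless that vcpu is currently running guest code (PF_VCPU).  Returns
 * true if the yield was carried out.
 */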
1558bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
1559{
1560        struct pid *pid;
1561        struct task_struct *task = NULL;
1562
1563        rcu_read_lock();
1564        pid = rcu_dereference(target->pid);
1565        if (pid)
1566                task = get_pid_task(target->pid, PIDTYPE_PID);
1567        rcu_read_unlock();
1568        if (!task)
1569                return false;
1570        if (task->flags & PF_VCPU) {
1571                put_task_struct(task);
1572                return false;
1573        }
1574        if (yield_to(task, 1)) {
1575                put_task_struct(task);
1576                return true;
1577        }
1578        put_task_struct(task);
1579        return false;
1580}
1581EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
1582
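/*
 * Descriptive note: called when a vcpu is busy-spinning in the guest
 * (e.g. on a contended spinlock): try to donate its timeslice to another
 * vcpu of the same VM that is runnable but preempted, since that one may
 * hold the awaited lock.
 */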
1583void kvm_vcpu_on_spin(struct kvm_vcpu *me)
1584{
1585        struct kvm *kvm = me->kvm;
1586        struct kvm_vcpu *vcpu;
1587        int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
1588        int yielded = 0;
1589        int pass;
1590        int i;
1591
1592        /*
1593         * We boost the priority of a VCPU that is runnable but not
1594         * currently running, because it got preempted by something
1595         * else and called schedule in __vcpu_run.  Hopefully that
1596         * VCPU is holding the lock that we need and will release it.
1597         * We approximate round-robin by starting at the last boosted VCPU.
1598         */
1599        for (pass = 0; pass < 2 && !yielded; pass++) {
1600                kvm_for_each_vcpu(i, vcpu, kvm) {
1601                        if (!pass && i <= last_boosted_vcpu) {
1602                                i = last_boosted_vcpu;
1603                                continue;
1604                        } else if (pass && i > last_boosted_vcpu)
1605                                break;
1606                        if (vcpu == me)
1607                                continue;
1608                        if (waitqueue_active(&vcpu->wq))
1609                                continue;
1610                        if (kvm_vcpu_yield_to(vcpu)) {
1611                                kvm->last_boosted_vcpu = i;
1612                                yielded = 1;
1613                                break;
1614                        }
1615                }
1616        }
1617}
1618EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
1619
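/*
 * Descriptive note: back the vcpu mmap() area: page 0 is the kvm_run
 * structure, followed at fixed offsets by the optional pio_data and
 * coalesced MMIO ring pages; anything else is handed to the architecture
 * code.
 */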
1620static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1621{
1622        struct kvm_vcpu *vcpu = vma->vm_file->private_data;
1623        struct page *page;
1624
1625        if (vmf->pgoff == 0)
1626                page = virt_to_page(vcpu->run);
1627#ifdef CONFIG_X86
1628        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
1629                page = virt_to_page(vcpu->arch.pio_data);
1630#endif
1631#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
1632        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
1633                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
1634#endif
1635        else
1636                return kvm_arch_vcpu_fault(vcpu, vmf);
1637        get_page(page);
1638        vmf->page = page;
1639        return 0;
1640}
1641
1642static const struct vm_operations_struct kvm_vcpu_vm_ops = {
1643        .fault = kvm_vcpu_fault,
1644};
1645
1646static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
1647{
1648        vma->vm_ops = &kvm_vcpu_vm_ops;
1649        return 0;
1650}
1651
1652static int kvm_vcpu_release(struct inode *inode, struct file *filp)
1653{
1654        struct kvm_vcpu *vcpu = filp->private_data;
1655
1656        kvm_put_kvm(vcpu->kvm);
1657        return 0;
1658}
1659
1660static struct file_operations kvm_vcpu_fops = {
1661        .release        = kvm_vcpu_release,
1662        .unlocked_ioctl = kvm_vcpu_ioctl,
1663#ifdef CONFIG_COMPAT
1664        .compat_ioctl   = kvm_vcpu_compat_ioctl,
1665#endif
1666        .mmap           = kvm_vcpu_mmap,
1667        .llseek         = noop_llseek,
1668};
1669
1670/*
1671 * Allocates an inode for the vcpu.
1672 */
1673static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1674{
1675        return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
1676}
1677
1678/*
1679 * Creates some virtual cpus.  Good luck creating more than one.
1680 */
1681static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1682{
1683        int r;
1684        struct kvm_vcpu *vcpu, *v;
1685
1686        vcpu = kvm_arch_vcpu_create(kvm, id);
1687        if (IS_ERR(vcpu))
1688                return PTR_ERR(vcpu);
1689
1690        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
1691
1692        r = kvm_arch_vcpu_setup(vcpu);
1693        if (r)
1694                goto vcpu_destroy;
1695
1696        mutex_lock(&kvm->lock);
1697        if (!kvm_vcpu_compatible(vcpu)) {
1698                r = -EINVAL;
1699                goto unlock_vcpu_destroy;
1700        }
1701        if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1702                r = -EINVAL;
1703                goto unlock_vcpu_destroy;
1704        }
1705
1706        kvm_for_each_vcpu(r, v, kvm)
1707                if (v->vcpu_id == id) {
1708                        r = -EEXIST;
1709                        goto unlock_vcpu_destroy;
1710                }
1711
1712        BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1713
1714        /* Now it's all set up, let userspace reach it */
1715        kvm_get_kvm(kvm);
1716        r = create_vcpu_fd(vcpu);
1717        if (r < 0) {
1718                kvm_put_kvm(kvm);
1719                goto unlock_vcpu_destroy;
1720        }
1721
1722        kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1723        smp_wmb();
1724        atomic_inc(&kvm->online_vcpus);
1725
1726        mutex_unlock(&kvm->lock);
1727        return r;
1728
1729unlock_vcpu_destroy:
1730        mutex_unlock(&kvm->lock);
1731vcpu_destroy:
1732        kvm_arch_vcpu_destroy(vcpu);
1733        return r;
1734}
1735
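/*
 * Records the signal mask to apply while the vcpu executes KVM_RUN (or
 * clears it for a NULL sigset); SIGKILL and SIGSTOP are never blocked.
 */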
1736static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
1737{
1738        if (sigset) {
1739                sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
1740                vcpu->sigset_active = 1;
1741                vcpu->sigset = *sigset;
1742        } else
1743                vcpu->sigset_active = 0;
1744        return 0;
1745}
1746
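/*
 * Dispatcher for the per-vcpu ioctls.  Generic state accessors (regs,
 * sregs, fpu, mp_state, ...) are copied to and from userspace here and
 * handed to the architecture code; anything unrecognised falls through to
 * kvm_arch_vcpu_ioctl().
 */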
1747static long kvm_vcpu_ioctl(struct file *filp,
1748                           unsigned int ioctl, unsigned long arg)
1749{
1750        struct kvm_vcpu *vcpu = filp->private_data;
1751        void __user *argp = (void __user *)arg;
1752        int r;
1753        struct kvm_fpu *fpu = NULL;
1754        struct kvm_sregs *kvm_sregs = NULL;
1755
1756        if (vcpu->kvm->mm != current->mm)
1757                return -EIO;
1758
1759#if defined(CONFIG_S390) || defined(CONFIG_PPC)
1760        /*
1761         * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
1762         * so vcpu_load() would break them.
1763         */
1764        if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
1765                return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1766#endif
1767
1768
1769        vcpu_load(vcpu);
1770        switch (ioctl) {
1771        case KVM_RUN:
1772                r = -EINVAL;
1773                if (arg)
1774                        goto out;
1775                r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
1776                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
1777                break;
1778        case KVM_GET_REGS: {
1779                struct kvm_regs *kvm_regs;
1780
1781                r = -ENOMEM;
1782                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
1783                if (!kvm_regs)
1784                        goto out;
1785                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
1786                if (r)
1787                        goto out_free1;
1788                r = -EFAULT;
1789                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
1790                        goto out_free1;
1791                r = 0;
1792out_free1:
1793                kfree(kvm_regs);
1794                break;
1795        }
1796        case KVM_SET_REGS: {
1797                struct kvm_regs *kvm_regs;
1798
1799                r = -ENOMEM;
1800                kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
1801                if (IS_ERR(kvm_regs)) {
1802                        r = PTR_ERR(kvm_regs);
1803                        goto out;
1804                }
1805                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
1806                if (r)
1807                        goto out_free2;
1808                r = 0;
1809out_free2:
1810                kfree(kvm_regs);
1811                break;
1812        }
1813        case KVM_GET_SREGS: {
1814                kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
1815                r = -ENOMEM;
1816                if (!kvm_sregs)
1817                        goto out;
1818                r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
1819                if (r)
1820                        goto out;
1821                r = -EFAULT;
1822                if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
1823                        goto out;
1824                r = 0;
1825                break;
1826        }
1827        case KVM_SET_SREGS: {
1828                kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
1829                if (IS_ERR(kvm_sregs)) {
1830                        r = PTR_ERR(kvm_sregs);
1831                        goto out;
1832                }
1833                r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
1834                if (r)
1835                        goto out;
1836                r = 0;
1837                break;
1838        }
1839        case KVM_GET_MP_STATE: {
1840                struct kvm_mp_state mp_state;
1841
1842                r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
1843                if (r)
1844                        goto out;
1845                r = -EFAULT;
1846                if (copy_to_user(argp, &mp_state, sizeof mp_state))
1847                        goto out;
1848                r = 0;
1849                break;
1850        }
1851        case KVM_SET_MP_STATE: {
1852                struct kvm_mp_state mp_state;
1853
1854                r = -EFAULT;
1855                if (copy_from_user(&mp_state, argp, sizeof mp_state))
1856                        goto out;
1857                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
1858                if (r)
1859                        goto out;
1860                r = 0;
1861                break;
1862        }
1863        case KVM_TRANSLATE: {
1864                struct kvm_translation tr;
1865
1866                r = -EFAULT;
1867                if (copy_from_user(&tr, argp, sizeof tr))
1868                        goto out;
1869                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
1870                if (r)
1871                        goto out;
1872                r = -EFAULT;
1873                if (copy_to_user(argp, &tr, sizeof tr))
1874                        goto out;
1875                r = 0;
1876                break;
1877        }
1878        case KVM_SET_GUEST_DEBUG: {
1879                struct kvm_guest_debug dbg;
1880
1881                r = -EFAULT;
1882                if (copy_from_user(&dbg, argp, sizeof dbg))
1883                        goto out;
1884                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
1885                if (r)
1886                        goto out;
1887                r = 0;
1888                break;
1889        }
1890        case KVM_SET_SIGNAL_MASK: {
1891                struct kvm_signal_mask __user *sigmask_arg = argp;
1892                struct kvm_signal_mask kvm_sigmask;
1893                sigset_t sigset, *p;
1894
1895                p = NULL;
1896                if (argp) {
1897                        r = -EFAULT;
1898                        if (copy_from_user(&kvm_sigmask, argp,
1899                                           sizeof kvm_sigmask))
1900                                goto out;
1901                        r = -EINVAL;
1902                        if (kvm_sigmask.len != sizeof sigset)
1903                                goto out;
1904                        r = -EFAULT;
1905                        if (copy_from_user(&sigset, sigmask_arg->sigset,
1906                                           sizeof sigset))
1907                                goto out;
1908                        p = &sigset;
1909                }
1910                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
1911                break;
1912        }
1913        case KVM_GET_FPU: {
1914                fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
1915                r = -ENOMEM;
1916                if (!fpu)
1917                        goto out;
1918                r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
1919                if (r)
1920                        goto out;
1921                r = -EFAULT;
1922                if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
1923                        goto out;
1924                r = 0;
1925                break;
1926        }
1927        case KVM_SET_FPU: {
1928                fpu = memdup_user(argp, sizeof(*fpu));
1929                if (IS_ERR(fpu)) {
1930                        r = PTR_ERR(fpu);
1931                        goto out;
1932                }
1933                r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
1934                if (r)
1935                        goto out;
1936                r = 0;
1937                break;
1938        }
1939        default:
1940                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
1941        }
1942out:
1943        vcpu_put(vcpu);
1944        kfree(fpu);
1945        kfree(kvm_sregs);
1946        return r;
1947}
1948
1949#ifdef CONFIG_COMPAT
1950static long kvm_vcpu_compat_ioctl(struct file *filp,
1951                                  unsigned int ioctl, unsigned long arg)
1952{
1953        struct kvm_vcpu *vcpu = filp->private_data;
1954        void __user *argp = compat_ptr(arg);
1955        int r;
1956
1957        if (vcpu->kvm->mm != current->mm)
1958                return -EIO;
1959
1960        switch (ioctl) {
1961        case KVM_SET_SIGNAL_MASK: {
1962                struct kvm_signal_mask __user *sigmask_arg = argp;
1963                struct kvm_signal_mask kvm_sigmask;
1964                compat_sigset_t csigset;
1965                sigset_t sigset;
1966
1967                if (argp) {
1968                        r = -EFAULT;
1969                        if (copy_from_user(&kvm_sigmask, argp,
1970                                           sizeof kvm_sigmask))
1971                                goto out;
1972                        r = -EINVAL;
1973                        if (kvm_sigmask.len != sizeof csigset)
1974                                goto out;
1975                        r = -EFAULT;
1976                        if (copy_from_user(&csigset, sigmask_arg->sigset,
1977                                           sizeof csigset))
1978                                goto out;
1979                        sigset_from_compat(&sigset, &csigset);
1980                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
1981                } else
1982                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
1983                break;
1984        }
1985        default:
1986                r = kvm_vcpu_ioctl(filp, ioctl, arg);
1987        }
1988
1989out:
1990        return r;
1991}
1992#endif
1993
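/*
 * Dispatcher for the per-VM ioctls: vcpu creation, memory slot updates,
 * dirty logging, coalesced MMIO zones, irqfd/ioeventfd wiring and MSI
 * injection.  Unknown ioctls fall through to kvm_arch_vm_ioctl().
 */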
1994static long kvm_vm_ioctl(struct file *filp,
1995                           unsigned int ioctl, unsigned long arg)
1996{
1997        struct kvm *kvm = filp->private_data;
1998        void __user *argp = (void __user *)arg;
1999        int r;
2000
2001        if (kvm->mm != current->mm)
2002                return -EIO;
2003        switch (ioctl) {
2004        case KVM_CREATE_VCPU:
2005                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2006                if (r < 0)
2007                        goto out;
2008                break;
2009        case KVM_SET_USER_MEMORY_REGION: {
2010                struct kvm_userspace_memory_region kvm_userspace_mem;
2011
2012                r = -EFAULT;
2013                if (copy_from_user(&kvm_userspace_mem, argp,
2014                                                sizeof kvm_userspace_mem))
2015                        goto out;
2016
2017                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
2018                if (r)
2019                        goto out;
2020                break;
2021        }
2022        case KVM_GET_DIRTY_LOG: {
2023                struct kvm_dirty_log log;
2024
2025                r = -EFAULT;
2026                if (copy_from_user(&log, argp, sizeof log))
2027                        goto out;
2028                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2029                if (r)
2030                        goto out;
2031                break;
2032        }
2033#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2034        case KVM_REGISTER_COALESCED_MMIO: {
2035                struct kvm_coalesced_mmio_zone zone;
2036                r = -EFAULT;
2037                if (copy_from_user(&zone, argp, sizeof zone))
2038                        goto out;
2039                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
2040                if (r)
2041                        goto out;
2042                r = 0;
2043                break;
2044        }
2045        case KVM_UNREGISTER_COALESCED_MMIO: {
2046                struct kvm_coalesced_mmio_zone zone;
2047                r = -EFAULT;
2048                if (copy_from_user(&zone, argp, sizeof zone))
2049                        goto out;
2050                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
2051                if (r)
2052                        goto out;
2053                r = 0;
2054                break;
2055        }
2056#endif
2057        case KVM_IRQFD: {
2058                struct kvm_irqfd data;
2059
2060                r = -EFAULT;
2061                if (copy_from_user(&data, argp, sizeof data))
2062                        goto out;
2063                r = kvm_irqfd(kvm, &data);
2064                break;
2065        }
2066        case KVM_IOEVENTFD: {
2067                struct kvm_ioeventfd data;
2068
2069                r = -EFAULT;
2070                if (copy_from_user(&data, argp, sizeof data))
2071                        goto out;
2072                r = kvm_ioeventfd(kvm, &data);
2073                break;
2074        }
2075#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2076        case KVM_SET_BOOT_CPU_ID:
2077                r = 0;
2078                mutex_lock(&kvm->lock);
2079                if (atomic_read(&kvm->online_vcpus) != 0)
2080                        r = -EBUSY;
2081                else
2082                        kvm->bsp_vcpu_id = arg;
2083                mutex_unlock(&kvm->lock);
2084                break;
2085#endif
2086#ifdef CONFIG_HAVE_KVM_MSI
2087        case KVM_SIGNAL_MSI: {
2088                struct kvm_msi msi;
2089
2090                r = -EFAULT;
2091                if (copy_from_user(&msi, argp, sizeof msi))
2092                        goto out;
2093                r = kvm_send_userspace_msi(kvm, &msi);
2094                break;
2095        }
2096#endif
2097        default:
2098                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2099                if (r == -ENOTTY)
2100                        r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
2101        }
2102out:
2103        return r;
2104}
2105
2106#ifdef CONFIG_COMPAT
2107struct compat_kvm_dirty_log {
2108        __u32 slot;
2109        __u32 padding1;
2110        union {
2111                compat_uptr_t dirty_bitmap; /* one bit per page */
2112                __u64 padding2;
2113        };
2114};
2115
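/*
 * 32-bit compat entry point for VM ioctls.  Only KVM_GET_DIRTY_LOG needs
 * translation here, because the dirty bitmap pointer has a different size
 * in the compat ABI; everything else is forwarded to kvm_vm_ioctl().
 */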
2116static long kvm_vm_compat_ioctl(struct file *filp,
2117                           unsigned int ioctl, unsigned long arg)
2118{
2119        struct kvm *kvm = filp->private_data;
2120        int r;
2121
2122        if (kvm->mm != current->mm)
2123                return -EIO;
2124        switch (ioctl) {
2125        case KVM_GET_DIRTY_LOG: {
2126                struct compat_kvm_dirty_log compat_log;
2127                struct kvm_dirty_log log;
2128
2129                r = -EFAULT;
2130                if (copy_from_user(&compat_log, (void __user *)arg,
2131                                   sizeof(compat_log)))
2132                        goto out;
2133                log.slot         = compat_log.slot;
2134                log.padding1     = compat_log.padding1;
2135                log.padding2     = compat_log.padding2;
2136                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
2137
2138                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2139                if (r)
2140                        goto out;
2141                break;
2142        }
2143        default:
2144                r = kvm_vm_ioctl(filp, ioctl, arg);
2145        }
2146
2147out:
2148        return r;
2149}
2150#endif
2151
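/*
 * Fault handler for mmap()s of the VM fd: the page offset is treated as a
 * guest frame number and the backing page is pinned with get_user_pages().
 */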
2152static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2153{
2154        struct page *page[1];
2155        unsigned long addr;
2156        int npages;
2157        gfn_t gfn = vmf->pgoff;
2158        struct kvm *kvm = vma->vm_file->private_data;
2159
2160        addr = gfn_to_hva(kvm, gfn);
2161        if (kvm_is_error_hva(addr))
2162                return VM_FAULT_SIGBUS;
2163
2164        npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
2165                                NULL);
2166        if (unlikely(npages != 1))
2167                return VM_FAULT_SIGBUS;
2168
2169        vmf->page = page[0];
2170        return 0;
2171}
2172
2173static const struct vm_operations_struct kvm_vm_vm_ops = {
2174        .fault = kvm_vm_fault,
2175};
2176
2177static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
2178{
2179        vma->vm_ops = &kvm_vm_vm_ops;
2180        return 0;
2181}
2182
2183static struct file_operations kvm_vm_fops = {
2184        .release        = kvm_vm_release,
2185        .unlocked_ioctl = kvm_vm_ioctl,
2186#ifdef CONFIG_COMPAT
2187        .compat_ioctl   = kvm_vm_compat_ioctl,
2188#endif
2189        .mmap           = kvm_vm_mmap,
2190        .llseek         = noop_llseek,
2191};
2192
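/*
 * KVM_CREATE_VM: instantiate a VM of the given type and return a new fd
 * whose ioctls are served by kvm_vm_ioctl().
 */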
2193static int kvm_dev_ioctl_create_vm(unsigned long type)
2194{
2195        int r;
2196        struct kvm *kvm;
2197
2198        kvm = kvm_create_vm(type);
2199        if (IS_ERR(kvm))
2200                return PTR_ERR(kvm);
2201#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2202        r = kvm_coalesced_mmio_init(kvm);
2203        if (r < 0) {
2204                kvm_put_kvm(kvm);
2205                return r;
2206        }
2207#endif
2208        r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
2209        if (r < 0)
2210                kvm_put_kvm(kvm);
2211
2212        return r;
2213}
2214
2215static long kvm_dev_ioctl_check_extension_generic(long arg)
2216{
2217        switch (arg) {
2218        case KVM_CAP_USER_MEMORY:
2219        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2220        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2221#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2222        case KVM_CAP_SET_BOOT_CPU_ID:
2223#endif
2224        case KVM_CAP_INTERNAL_ERROR_DATA:
2225#ifdef CONFIG_HAVE_KVM_MSI
2226        case KVM_CAP_SIGNAL_MSI:
2227#endif
2228                return 1;
2229#ifdef KVM_CAP_IRQ_ROUTING
2230        case KVM_CAP_IRQ_ROUTING:
2231                return KVM_MAX_IRQ_ROUTES;
2232#endif
2233        default:
2234                break;
2235        }
2236        return kvm_dev_ioctl_check_extension(arg);
2237}
2238
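/*
 * Ioctls on /dev/kvm itself: API and extension queries, VM creation and
 * the vcpu mmap size.  Anything unrecognised goes to kvm_arch_dev_ioctl().
 */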
2239static long kvm_dev_ioctl(struct file *filp,
2240                          unsigned int ioctl, unsigned long arg)
2241{
2242        long r = -EINVAL;
2243
2244        switch (ioctl) {
2245        case KVM_GET_API_VERSION:
2246                r = -EINVAL;
2247                if (arg)
2248                        goto out;
2249                r = KVM_API_VERSION;
2250                break;
2251        case KVM_CREATE_VM:
2252                r = kvm_dev_ioctl_create_vm(arg);
2253                break;
2254        case KVM_CHECK_EXTENSION:
2255                r = kvm_dev_ioctl_check_extension_generic(arg);
2256                break;
2257        case KVM_GET_VCPU_MMAP_SIZE:
2258                r = -EINVAL;
2259                if (arg)
2260                        goto out;
2261                r = PAGE_SIZE;     /* struct kvm_run */
2262#ifdef CONFIG_X86
2263                r += PAGE_SIZE;    /* pio data page */
2264#endif
2265#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
2266                r += PAGE_SIZE;    /* coalesced mmio ring page */
2267#endif
2268                break;
2269        case KVM_TRACE_ENABLE:
2270        case KVM_TRACE_PAUSE:
2271        case KVM_TRACE_DISABLE:
2272                r = -EOPNOTSUPP;
2273                break;
2274        default:
2275                return kvm_arch_dev_ioctl(filp, ioctl, arg);
2276        }
2277out:
2278        return r;
2279}
2280
2281static struct file_operations kvm_chardev_ops = {
2282        .unlocked_ioctl = kvm_dev_ioctl,
2283        .compat_ioctl   = kvm_dev_ioctl,
2284        .llseek         = noop_llseek,
2285};
2286
2287static struct miscdevice kvm_dev = {
2288        KVM_MINOR,
2289        "kvm",
2290        &kvm_chardev_ops,
2291};
2292
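/*
 * Illustrative sketch (not part of this file): the usual userspace sequence
 * against the /dev/kvm device registered above.  Error handling and guest
 * memory setup (KVM_SET_USER_MEMORY_REGION on the vm fd) are omitted, and
 * an x86 host is assumed.
 *
 *     int kvm  = open("/dev/kvm", O_RDWR);
 *     ioctl(kvm, KVM_GET_API_VERSION, 0);         returns KVM_API_VERSION
 *     int vm   = ioctl(kvm, KVM_CREATE_VM, 0);    fd served by kvm_vm_ioctl()
 *     int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);   fd served by kvm_vcpu_ioctl()
 *     long sz  = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *     struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *                                MAP_SHARED, vcpu, 0);
 *     ioctl(vcpu, KVM_RUN, 0);                    exit reason in run->exit_reason
 */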
2293static void hardware_enable_nolock(void *junk)
2294{
2295        int cpu = raw_smp_processor_id();
2296        int r;
2297
2298        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
2299                return;
2300
2301        cpumask_set_cpu(cpu, cpus_hardware_enabled);
2302
2303        r = kvm_arch_hardware_enable(NULL);
2304
2305        if (r) {
2306                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2307                atomic_inc(&hardware_enable_failed);
2308                printk(KERN_INFO "kvm: enabling virtualization on "
2309                                 "CPU%d failed\n", cpu);
2310        }
2311}
2312
2313static void hardware_enable(void *junk)
2314{
2315        raw_spin_lock(&kvm_lock);
2316        hardware_enable_nolock(junk);
2317        raw_spin_unlock(&kvm_lock);
2318}
2319
2320static void hardware_disable_nolock(void *junk)
2321{
2322        int cpu = raw_smp_processor_id();
2323
2324        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
2325                return;
2326        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
2327        kvm_arch_hardware_disable(NULL);
2328}
2329
2330static void hardware_disable(void *junk)
2331{
2332        raw_spin_lock(&kvm_lock);
2333        hardware_disable_nolock(junk);
2334        raw_spin_unlock(&kvm_lock);
2335}
2336
2337static void hardware_disable_all_nolock(void)
2338{
2339        BUG_ON(!kvm_usage_count);
2340
2341        kvm_usage_count--;
2342        if (!kvm_usage_count)
2343                on_each_cpu(hardware_disable_nolock, NULL, 1);
2344}
2345
2346static void hardware_disable_all(void)
2347{
2348        raw_spin_lock(&kvm_lock);
2349        hardware_disable_all_nolock();
2350        raw_spin_unlock(&kvm_lock);
2351}
2352
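/*
 * kvm_usage_count tracks live VMs under kvm_lock: hardware virtualization
 * is switched on across all CPUs when the first VM appears and switched
 * off again when the last one is destroyed.
 */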
2353static int hardware_enable_all(void)
2354{
2355        int r = 0;
2356
2357        raw_spin_lock(&kvm_lock);
2358
2359        kvm_usage_count++;
2360        if (kvm_usage_count == 1) {
2361                atomic_set(&hardware_enable_failed, 0);
2362                on_each_cpu(hardware_enable_nolock, NULL, 1);
2363
2364                if (atomic_read(&hardware_enable_failed)) {
2365                        hardware_disable_all_nolock();
2366                        r = -EBUSY;
2367                }
2368        }
2369
2370        raw_spin_unlock(&kvm_lock);
2371
2372        return r;
2373}
2374
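/*
 * CPU hotplug notifier: while VMs exist, virtualization is disabled on a
 * CPU that is going down and re-enabled on one that is coming up.
 */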
2375static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2376                           void *v)
2377{
2378        int cpu = (long)v;
2379
2380        if (!kvm_usage_count)
2381                return NOTIFY_OK;
2382
2383        val &= ~CPU_TASKS_FROZEN;
2384        switch (val) {
2385        case CPU_DYING:
2386                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
2387                       cpu);
2388                hardware_disable(NULL);
2389                break;
2390        case CPU_STARTING:
2391                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
2392                       cpu);
2393                hardware_enable(NULL);
2394                break;
2395        }
2396        return NOTIFY_OK;
2397}
2398
2399
2400asmlinkage void kvm_spurious_fault(void)
2401{
2402        /* Fault while not rebooting.  We want the trace. */
2403        BUG();
2404}
2405EXPORT_SYMBOL_GPL(kvm_spurious_fault);
2406
2407static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2408                      void *v)
2409{
2410        /*
2411         * Some BIOSes (well, at least the author's) hang on reboot if
2412         * the CPU is still in VMX root mode.
2413         *
2414         * Intel TXT also requires VMX to be off on all CPUs at shutdown.
2415         */
2416        printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2417        kvm_rebooting = true;
2418        on_each_cpu(hardware_disable_nolock, NULL, 1);
2419        return NOTIFY_OK;
2420}
2421
2422static struct notifier_block kvm_reboot_notifier = {
2423        .notifier_call = kvm_reboot,
2424        .priority = 0,
2425};
2426
2427static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2428{
2429        int i;
2430
2431        for (i = 0; i < bus->dev_count; i++) {
2432                struct kvm_io_device *pos = bus->range[i].dev;
2433
2434                kvm_iodevice_destructor(pos);
2435        }
2436        kfree(bus);
2437}
2438
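/*
 * Ordering for the sorted bus->range[] array: a range that is contained in
 * another compares as equal, which is what lets a lookup key match the
 * device whose registered range covers it.
 */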
2439int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
2440{
2441        const struct kvm_io_range *r1 = p1;
2442        const struct kvm_io_range *r2 = p2;
2443
2444        if (r1->addr < r2->addr)
2445                return -1;
2446        if (r1->addr + r1->len > r2->addr + r2->len)
2447                return 1;
2448        return 0;
2449}
2450
2451int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
2452                          gpa_t addr, int len)
2453{
2454        bus->range[bus->dev_count++] = (struct kvm_io_range) {
2455                .addr = addr,
2456                .len = len,
2457                .dev = dev,
2458        };
2459
2460        sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
2461                kvm_io_bus_sort_cmp, NULL);
2462
2463        return 0;
2464}
2465
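/*
 * Returns the index of the first registered range covering [addr, addr+len),
 * or -ENOENT.  After the bsearch() hit it walks back over neighbouring
 * entries that compare equal, so callers can iterate over all matches.
 */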
2466int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
2467                             gpa_t addr, int len)
2468{
2469        struct kvm_io_range *range, key;
2470        int off;
2471
2472        key = (struct kvm_io_range) {
2473                .addr = addr,
2474                .len = len,
2475        };
2476
2477        range = bsearch(&key, bus->range, bus->dev_count,
2478                        sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
2479        if (range == NULL)
2480                return -ENOENT;
2481
2482        off = range - bus->range;
2483
2484        while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
2485                off--;
2486
2487        return off;
2488}
2489
2490/* kvm_io_bus_write - called under kvm->slots_lock */
2491int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2492                     int len, const void *val)
2493{
2494        int idx;
2495        struct kvm_io_bus *bus;
2496        struct kvm_io_range range;
2497
2498        range = (struct kvm_io_range) {
2499                .addr = addr,
2500                .len = len,
2501        };
2502
2503        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2504        idx = kvm_io_bus_get_first_dev(bus, addr, len);
2505        if (idx < 0)
2506                return -EOPNOTSUPP;
2507
2508        while (idx < bus->dev_count &&
2509                kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2510                if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
2511                        return 0;
2512                idx++;
2513        }
2514
2515        return -EOPNOTSUPP;
2516}
2517
2518/* kvm_io_bus_read - called under kvm->slots_lock */
2519int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2520                    int len, void *val)
2521{
2522        int idx;
2523        struct kvm_io_bus *bus;
2524        struct kvm_io_range range;
2525
2526        range = (struct kvm_io_range) {
2527                .addr = addr,
2528                .len = len,
2529        };
2530
2531        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
2532        idx = kvm_io_bus_get_first_dev(bus, addr, len);
2533        if (idx < 0)
2534                return -EOPNOTSUPP;
2535
2536        while (idx < bus->dev_count &&
2537                kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
2538                if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
2539                        return 0;
2540                idx++;
2541        }
2542
2543        return -EOPNOTSUPP;
2544}
2545
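/*
 * Buses are updated copy-on-write: a new kvm_io_bus with the device added
 * (or removed, below) is published with rcu_assign_pointer() and readers
 * are flushed with synchronize_srcu_expedited() before the old copy is
 * freed.
 */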
2546/* Caller must hold slots_lock. */
2547int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
2548                            int len, struct kvm_io_device *dev)
2549{
2550        struct kvm_io_bus *new_bus, *bus;
2551
2552        bus = kvm->buses[bus_idx];
2553        if (bus->dev_count > NR_IOBUS_DEVS - 1)
2554                return -ENOSPC;
2555
2556        new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
2557                          sizeof(struct kvm_io_range)), GFP_KERNEL);
2558        if (!new_bus)
2559                return -ENOMEM;
2560        memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
2561               sizeof(struct kvm_io_range)));
2562        kvm_io_bus_insert_dev(new_bus, dev, addr, len);
2563        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2564        synchronize_srcu_expedited(&kvm->srcu);
2565        kfree(bus);
2566
2567        return 0;
2568}
2569
2570/* Caller must hold slots_lock. */
2571int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
2572                              struct kvm_io_device *dev)
2573{
2574        int i, r;
2575        struct kvm_io_bus *new_bus, *bus;
2576
2577        bus = kvm->buses[bus_idx];
2578        r = -ENOENT;
2579        for (i = 0; i < bus->dev_count; i++)
2580                if (bus->range[i].dev == dev) {
2581                        r = 0;
2582                        break;
2583                }
2584
2585        if (r)
2586                return r;
2587
2588        new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
2589                          sizeof(struct kvm_io_range)), GFP_KERNEL);
2590        if (!new_bus)
2591                return -ENOMEM;
2592
2593        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
2594        new_bus->dev_count--;
2595        memcpy(new_bus->range + i, bus->range + i + 1,
2596               (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
2597
2598        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
2599        synchronize_srcu_expedited(&kvm->srcu);
2600        kfree(bus);
2601        return r;
2602}
2603
2604static struct notifier_block kvm_cpu_notifier = {
2605        .notifier_call = kvm_cpu_hotplug,
2606};
2607
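/*
 * debugfs statistics: each file sums one counter, identified by its byte
 * offset into struct kvm or struct kvm_vcpu, across all VMs and vcpus.
 */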
2608static int vm_stat_get(void *_offset, u64 *val)
2609{
2610        unsigned offset = (long)_offset;
2611        struct kvm *kvm;
2612
2613        *val = 0;
2614        raw_spin_lock(&kvm_lock);
2615        list_for_each_entry(kvm, &vm_list, vm_list)
2616                *val += *(u32 *)((void *)kvm + offset);
2617        raw_spin_unlock(&kvm_lock);
2618        return 0;
2619}
2620
2621DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
2622
2623static int vcpu_stat_get(void *_offset, u64 *val)
2624{
2625        unsigned offset = (long)_offset;
2626        struct kvm *kvm;
2627        struct kvm_vcpu *vcpu;
2628        int i;
2629
2630        *val = 0;
2631        raw_spin_lock(&kvm_lock);
2632        list_for_each_entry(kvm, &vm_list, vm_list)
2633                kvm_for_each_vcpu(i, vcpu, kvm)
2634                        *val += *(u32 *)((void *)vcpu + offset);
2635
2636        raw_spin_unlock(&kvm_lock);
2637        return 0;
2638}
2639
2640DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
2641
2642static const struct file_operations *stat_fops[] = {
2643        [KVM_STAT_VCPU] = &vcpu_stat_fops,
2644        [KVM_STAT_VM]   = &vm_stat_fops,
2645};
2646
2647static int kvm_init_debug(void)
2648{
2649        int r = -EFAULT;
2650        struct kvm_stats_debugfs_item *p;
2651
2652        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
2653        if (kvm_debugfs_dir == NULL)
2654                goto out;
2655
2656        for (p = debugfs_entries; p->name; ++p) {
2657                p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
2658                                                (void *)(long)p->offset,
2659                                                stat_fops[p->kind]);
2660                if (p->dentry == NULL)
2661                        goto out_dir;
2662        }
2663
2664        return 0;
2665
2666out_dir:
2667        debugfs_remove_recursive(kvm_debugfs_dir);
2668out:
2669        return r;
2670}
2671
2672static void kvm_exit_debug(void)
2673{
2674        struct kvm_stats_debugfs_item *p;
2675
2676        for (p = debugfs_entries; p->name; ++p)
2677                debugfs_remove(p->dentry);
2678        debugfs_remove(kvm_debugfs_dir);
2679}
2680
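/*
 * syscore hooks: if any VM exists, virtualization is switched off on this
 * CPU for suspend and back on at resume.
 */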
2681static int kvm_suspend(void)
2682{
2683        if (kvm_usage_count)
2684                hardware_disable_nolock(NULL);
2685        return 0;
2686}
2687
2688static void kvm_resume(void)
2689{
2690        if (kvm_usage_count) {
2691                WARN_ON(raw_spin_is_locked(&kvm_lock));
2692                hardware_enable_nolock(NULL);
2693        }
2694}
2695
2696static struct syscore_ops kvm_syscore_ops = {
2697        .suspend = kvm_suspend,
2698        .resume = kvm_resume,
2699};
2700
2701struct page *bad_page;
2702pfn_t bad_pfn;
2703
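/*
 * Preempt notifiers let the arch code save and restore per-CPU vcpu state
 * when the vcpu thread is scheduled out and back in.
 */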
2704static inline
2705struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
2706{
2707        return container_of(pn, struct kvm_vcpu, preempt_notifier);
2708}
2709
2710static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
2711{
2712        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2713
2714        kvm_arch_vcpu_load(vcpu, cpu);
2715}
2716
2717static void kvm_sched_out(struct preempt_notifier *pn,
2718                          struct task_struct *next)
2719{
2720        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
2721
2722        kvm_arch_vcpu_put(vcpu);
2723}
2724
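/*
 * Module entry point, called from the architecture-specific module init
 * (kvm-intel, kvm-amd, ...): sets up the special pages, hardware support,
 * the hotplug/reboot/syscore hooks, the vcpu slab cache, /dev/kvm and
 * debugfs.
 */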
2725int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
2726                  struct module *module)
2727{
2728        int r;
2729        int cpu;
2730
2731        r = kvm_arch_init(opaque);
2732        if (r)
2733                goto out_fail;
2734
2735        bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2736
2737        if (bad_page == NULL) {
2738                r = -ENOMEM;
2739                goto out;
2740        }
2741
2742        bad_pfn = page_to_pfn(bad_page);
2743
2744        hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2745
2746        if (hwpoison_page == NULL) {
2747                r = -ENOMEM;
2748                goto out_free_0;
2749        }
2750
2751        hwpoison_pfn = page_to_pfn(hwpoison_page);
2752
2753        fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2754
2755        if (fault_page == NULL) {
2756                r = -ENOMEM;
2757                goto out_free_0;
2758        }
2759
2760        fault_pfn = page_to_pfn(fault_page);
2761
2762        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
2763                r = -ENOMEM;
2764                goto out_free_0;
2765        }
2766
2767        r = kvm_arch_hardware_setup();
2768        if (r < 0)
2769                goto out_free_0a;
2770
2771        for_each_online_cpu(cpu) {
2772                smp_call_function_single(cpu,
2773                                kvm_arch_check_processor_compat,
2774                                &r, 1);
2775                if (r < 0)
2776                        goto out_free_1;
2777        }
2778
2779        r = register_cpu_notifier(&kvm_cpu_notifier);
2780        if (r)
2781                goto out_free_2;
2782        register_reboot_notifier(&kvm_reboot_notifier);
2783
2784        /* A kmem cache lets us meet the alignment requirements of fx_save. */
2785        if (!vcpu_align)
2786                vcpu_align = __alignof__(struct kvm_vcpu);
2787        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
2788                                           0, NULL);
2789        if (!kvm_vcpu_cache) {
2790                r = -ENOMEM;
2791                goto out_free_3;
2792        }
2793
2794        r = kvm_async_pf_init();
2795        if (r)
2796                goto out_free;
2797
2798        kvm_chardev_ops.owner = module;
2799        kvm_vm_fops.owner = module;
2800        kvm_vcpu_fops.owner = module;
2801
2802        r = misc_register(&kvm_dev);
2803        if (r) {
2804                printk(KERN_ERR "kvm: misc device register failed\n");
2805                goto out_unreg;
2806        }
2807
2808        register_syscore_ops(&kvm_syscore_ops);
2809
2810        kvm_preempt_ops.sched_in = kvm_sched_in;
2811        kvm_preempt_ops.sched_out = kvm_sched_out;
2812
2813        r = kvm_init_debug();
2814        if (r) {
2815                printk(KERN_ERR "kvm: create debugfs files failed\n");
2816                goto out_undebugfs;
2817        }
2818
2819        return 0;
2820
2821out_undebugfs:
2822        unregister_syscore_ops(&kvm_syscore_ops);
2823out_unreg:
2824        kvm_async_pf_deinit();
2825out_free:
2826        kmem_cache_destroy(kvm_vcpu_cache);
2827out_free_3:
2828        unregister_reboot_notifier(&kvm_reboot_notifier);
2829        unregister_cpu_notifier(&kvm_cpu_notifier);
2830out_free_2:
2831out_free_1:
2832        kvm_arch_hardware_unsetup();
2833out_free_0a:
2834        free_cpumask_var(cpus_hardware_enabled);
2835out_free_0:
2836        if (fault_page)
2837                __free_page(fault_page);
2838        if (hwpoison_page)
2839                __free_page(hwpoison_page);
2840        __free_page(bad_page);
2841out:
2842        kvm_arch_exit();
2843out_fail:
2844        return r;
2845}
2846EXPORT_SYMBOL_GPL(kvm_init);
2847
2848void kvm_exit(void)
2849{
2850        kvm_exit_debug();
2851        misc_deregister(&kvm_dev);
2852        kmem_cache_destroy(kvm_vcpu_cache);
2853        kvm_async_pf_deinit();
2854        unregister_syscore_ops(&kvm_syscore_ops);
2855        unregister_reboot_notifier(&kvm_reboot_notifier);
2856        unregister_cpu_notifier(&kvm_cpu_notifier);
2857        on_each_cpu(hardware_disable_nolock, NULL, 1);
2858        kvm_arch_hardware_unsetup();
2859        kvm_arch_exit();
2860        free_cpumask_var(cpus_hardware_enabled);
2861        __free_page(fault_page);
2862        __free_page(hwpoison_page);
2863        __free_page(bad_page);
2864}
2865EXPORT_SYMBOL_GPL(kvm_exit);
2866