linux/virt/kvm/kvm_main.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Kernel-based Virtual Machine driver for Linux
   4 *
   5 * This module enables machines with Intel VT-x extensions to run virtual
   6 * machines without emulation or binary translation.
   7 *
   8 * Copyright (C) 2006 Qumranet, Inc.
   9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10 *
  11 * Authors:
  12 *   Avi Kivity   <avi@qumranet.com>
  13 *   Yaniv Kamay  <yaniv@qumranet.com>
  14 */
  15
  16#include <kvm/iodev.h>
  17
  18#include <linux/kvm_host.h>
  19#include <linux/kvm.h>
  20#include <linux/module.h>
  21#include <linux/errno.h>
  22#include <linux/percpu.h>
  23#include <linux/mm.h>
  24#include <linux/miscdevice.h>
  25#include <linux/vmalloc.h>
  26#include <linux/reboot.h>
  27#include <linux/debugfs.h>
  28#include <linux/highmem.h>
  29#include <linux/file.h>
  30#include <linux/syscore_ops.h>
  31#include <linux/cpu.h>
  32#include <linux/sched/signal.h>
  33#include <linux/sched/mm.h>
  34#include <linux/sched/stat.h>
  35#include <linux/cpumask.h>
  36#include <linux/smp.h>
  37#include <linux/anon_inodes.h>
  38#include <linux/profile.h>
  39#include <linux/kvm_para.h>
  40#include <linux/pagemap.h>
  41#include <linux/mman.h>
  42#include <linux/swap.h>
  43#include <linux/bitops.h>
  44#include <linux/spinlock.h>
  45#include <linux/compat.h>
  46#include <linux/srcu.h>
  47#include <linux/hugetlb.h>
  48#include <linux/slab.h>
  49#include <linux/sort.h>
  50#include <linux/bsearch.h>
  51#include <linux/io.h>
  52#include <linux/lockdep.h>
  53#include <linux/kthread.h>
  54#include <linux/suspend.h>
  55
  56#include <asm/processor.h>
  57#include <asm/ioctl.h>
  58#include <linux/uaccess.h>
  59
  60#include "coalesced_mmio.h"
  61#include "async_pf.h"
  62#include "mmu_lock.h"
  63#include "vfio.h"
  64
  65#define CREATE_TRACE_POINTS
  66#include <trace/events/kvm.h>
  67
  68#include <linux/kvm_dirty_ring.h>
  69
  70/* Worst case buffer size needed for holding an integer. */
  71#define ITOA_MAX_LEN 12
  72
  73MODULE_AUTHOR("Qumranet");
  74MODULE_LICENSE("GPL");
  75
  76/* Architectures should define their poll value according to the halt latency */
  77unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
  78module_param(halt_poll_ns, uint, 0644);
  79EXPORT_SYMBOL_GPL(halt_poll_ns);
  80
  81/* Default doubles per-vcpu halt_poll_ns. */
  82unsigned int halt_poll_ns_grow = 2;
  83module_param(halt_poll_ns_grow, uint, 0644);
  84EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
  85
  86/* The start value to grow halt_poll_ns from */
  87unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
  88module_param(halt_poll_ns_grow_start, uint, 0644);
  89EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
  90
   91/* The default of 0 resets per-vcpu halt_poll_ns rather than shrinking it. */
  92unsigned int halt_poll_ns_shrink;
  93module_param(halt_poll_ns_shrink, uint, 0644);
  94EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  95
  96/*
  97 * Ordering of locks:
  98 *
  99 *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 100 */
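/*
 * Editor's note -- illustrative sketch only, not part of the upstream file:
 * a path that needs more than one of these locks must take them in the
 * documented order and release them in reverse.  Assuming a hypothetical
 * helper that touches both the memslots and the IRQ routing state:
 *
 *	mutex_lock(&kvm->lock);
 *	mutex_lock(&kvm->slots_lock);
 *	mutex_lock(&kvm->irq_lock);
 *	... operate on memslots and irq routing ...
 *	mutex_unlock(&kvm->irq_lock);
 *	mutex_unlock(&kvm->slots_lock);
 *	mutex_unlock(&kvm->lock);
 *
 * Taking them in any other order risks an ABBA deadlock against paths that
 * follow the ordering above.
 */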
 101
 102DEFINE_MUTEX(kvm_lock);
 103static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 104LIST_HEAD(vm_list);
 105
 106static cpumask_var_t cpus_hardware_enabled;
 107static int kvm_usage_count;
 108static atomic_t hardware_enable_failed;
 109
 110static struct kmem_cache *kvm_vcpu_cache;
 111
 112static __read_mostly struct preempt_ops kvm_preempt_ops;
 113static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 114
 115struct dentry *kvm_debugfs_dir;
 116EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 117
 118static const struct file_operations stat_fops_per_vm;
 119
 120static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 121                           unsigned long arg);
 122#ifdef CONFIG_KVM_COMPAT
 123static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
 124                                  unsigned long arg);
 125#define KVM_COMPAT(c)   .compat_ioctl   = (c)
 126#else
 127/*
 128 * For architectures that don't implement a compat infrastructure,
 129 * adopt a double line of defense:
 130 * - Prevent a compat task from opening /dev/kvm
  131 * - If the open has been done by a 64-bit task, and the KVM fd is then
  132 *   passed to a compat task, make the ioctls fail.
 133 */
 134static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
 135                                unsigned long arg) { return -EINVAL; }
 136
 137static int kvm_no_compat_open(struct inode *inode, struct file *file)
 138{
 139        return is_compat_task() ? -ENODEV : 0;
 140}
 141#define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
 142                        .open           = kvm_no_compat_open
 143#endif
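/*
 * Editor's note -- illustrative sketch only, not upstream code: KVM_COMPAT()
 * is meant to be dropped into a struct file_operations initializer so the
 * compat_ioctl hook (and, without CONFIG_KVM_COMPAT, the restrictive open
 * hook) is filled in consistently.  A hypothetical consumer would look like:
 *
 *	static const struct file_operations example_fops = {
 *		.unlocked_ioctl = example_ioctl,
 *		.llseek         = noop_llseek,
 *		KVM_COMPAT(example_compat_ioctl),
 *	};
 *
 * With CONFIG_KVM_COMPAT this expands to ".compat_ioctl = example_compat_ioctl";
 * otherwise it wires up kvm_no_compat_ioctl and kvm_no_compat_open instead.
 */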
 144static int hardware_enable_all(void);
 145static void hardware_disable_all(void);
 146
 147static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 148
 149__visible bool kvm_rebooting;
 150EXPORT_SYMBOL_GPL(kvm_rebooting);
 151
 152#define KVM_EVENT_CREATE_VM 0
 153#define KVM_EVENT_DESTROY_VM 1
 154static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 155static unsigned long long kvm_createvm_count;
 156static unsigned long long kvm_active_vms;
 157
 158static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 159
 160__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 161                                                   unsigned long start, unsigned long end)
 162{
 163}
 164
 165bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
 166{
 167        /*
 168         * The metadata used by is_zone_device_page() to determine whether or
 169         * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
 170         * the device has been pinned, e.g. by get_user_pages().  WARN if the
 171         * page_count() is zero to help detect bad usage of this helper.
 172         */
 173        if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
 174                return false;
 175
 176        return is_zone_device_page(pfn_to_page(pfn));
 177}
 178
 179bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 180{
 181        /*
 182         * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 183         * perspective they are "normal" pages, albeit with slightly different
 184         * usage rules.
 185         */
 186        if (pfn_valid(pfn))
 187                return PageReserved(pfn_to_page(pfn)) &&
 188                       !is_zero_pfn(pfn) &&
 189                       !kvm_is_zone_device_pfn(pfn);
 190
 191        return true;
 192}
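/*
 * Editor's note -- illustrative sketch only, not upstream code: the two
 * helpers above let callers decide whether a pfn is backed by a refcounted
 * struct page.  A hypothetical release path might do:
 *
 *	if (!kvm_is_reserved_pfn(pfn))
 *		put_page(pfn_to_page(pfn));	// drop the ref taken earlier
 *
 * Because kvm_is_reserved_pfn() deliberately excludes ZONE_DEVICE pages,
 * those get their reference dropped like any "normal" page, while truly
 * reserved pfns are left alone.
 */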
 193
 194/*
 195 * Switches to specified vcpu, until a matching vcpu_put()
 196 */
 197void vcpu_load(struct kvm_vcpu *vcpu)
 198{
 199        int cpu = get_cpu();
 200
 201        __this_cpu_write(kvm_running_vcpu, vcpu);
 202        preempt_notifier_register(&vcpu->preempt_notifier);
 203        kvm_arch_vcpu_load(vcpu, cpu);
 204        put_cpu();
 205}
 206EXPORT_SYMBOL_GPL(vcpu_load);
 207
 208void vcpu_put(struct kvm_vcpu *vcpu)
 209{
 210        preempt_disable();
 211        kvm_arch_vcpu_put(vcpu);
 212        preempt_notifier_unregister(&vcpu->preempt_notifier);
 213        __this_cpu_write(kvm_running_vcpu, NULL);
 214        preempt_enable();
 215}
 216EXPORT_SYMBOL_GPL(vcpu_put);
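/*
 * Editor's note -- illustrative sketch only, not upstream code: vcpu_load()
 * and vcpu_put() bracket code that needs the vCPU loaded on the current
 * physical CPU, e.g. a hypothetical ioctl-style helper:
 *
 *	static int example_vcpu_op(struct kvm_vcpu *vcpu)
 *	{
 *		int ret;
 *
 *		vcpu_load(vcpu);
 *		ret = do_something_with_loaded_vcpu(vcpu);	// assumed helper
 *		vcpu_put(vcpu);
 *		return ret;
 *	}
 *
 * Every vcpu_load() must be paired with a matching vcpu_put() before the
 * vCPU can be loaded elsewhere.
 */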
 217
 218/* TODO: merge with kvm_arch_vcpu_should_kick */
 219static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 220{
 221        int mode = kvm_vcpu_exiting_guest_mode(vcpu);
 222
 223        /*
 224         * We need to wait for the VCPU to reenable interrupts and get out of
 225         * READING_SHADOW_PAGE_TABLES mode.
 226         */
 227        if (req & KVM_REQUEST_WAIT)
 228                return mode != OUTSIDE_GUEST_MODE;
 229
 230        /*
 231         * Need to kick a running VCPU, but otherwise there is nothing to do.
 232         */
 233        return mode == IN_GUEST_MODE;
 234}
 235
 236static void ack_flush(void *_completed)
 237{
 238}
 239
 240static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 241{
 242        if (cpumask_empty(cpus))
 243                return false;
 244
 245        smp_call_function_many(cpus, ack_flush, NULL, wait);
 246        return true;
 247}
 248
 249static void kvm_make_vcpu_request(struct kvm *kvm, struct kvm_vcpu *vcpu,
 250                                  unsigned int req, struct cpumask *tmp,
 251                                  int current_cpu)
 252{
 253        int cpu;
 254
 255        kvm_make_request(req, vcpu);
 256
 257        if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
 258                return;
 259
 260        /*
 261         * Note, the vCPU could get migrated to a different pCPU at any point
 262         * after kvm_request_needs_ipi(), which could result in sending an IPI
 263         * to the previous pCPU.  But, that's OK because the purpose of the IPI
 264         * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
 265         * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
 266         * after this point is also OK, as the requirement is only that KVM wait
 267         * for vCPUs that were reading SPTEs _before_ any changes were
 268         * finalized. See kvm_vcpu_kick() for more details on handling requests.
 269         */
 270        if (kvm_request_needs_ipi(vcpu, req)) {
 271                cpu = READ_ONCE(vcpu->cpu);
 272                if (cpu != -1 && cpu != current_cpu)
 273                        __cpumask_set_cpu(cpu, tmp);
 274        }
 275}
 276
 277bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 278                                 unsigned long *vcpu_bitmap)
 279{
 280        struct kvm_vcpu *vcpu;
 281        struct cpumask *cpus;
 282        int i, me;
 283        bool called;
 284
 285        me = get_cpu();
 286
 287        cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 288        cpumask_clear(cpus);
 289
 290        for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
 291                vcpu = kvm_get_vcpu(kvm, i);
 292                if (!vcpu)
 293                        continue;
 294                kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
 295        }
 296
 297        called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 298        put_cpu();
 299
 300        return called;
 301}
 302
 303bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 304                                      struct kvm_vcpu *except)
 305{
 306        struct kvm_vcpu *vcpu;
 307        struct cpumask *cpus;
 308        bool called;
 309        int i, me;
 310
 311        me = get_cpu();
 312
 313        cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 314        cpumask_clear(cpus);
 315
 316        kvm_for_each_vcpu(i, vcpu, kvm) {
 317                if (vcpu == except)
 318                        continue;
 319                kvm_make_vcpu_request(kvm, vcpu, req, cpus, me);
 320        }
 321
 322        called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 323        put_cpu();
 324
 325        return called;
 326}
 327
 328bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 329{
 330        return kvm_make_all_cpus_request_except(kvm, req, NULL);
 331}
 332EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 333
 334#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 335void kvm_flush_remote_tlbs(struct kvm *kvm)
 336{
 337        ++kvm->stat.generic.remote_tlb_flush_requests;
 338
 339        /*
 340         * We want to publish modifications to the page tables before reading
 341         * mode. Pairs with a memory barrier in arch-specific code.
 342         * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
 343         * and smp_mb in walk_shadow_page_lockless_begin/end.
 344         * - powerpc: smp_mb in kvmppc_prepare_to_enter.
 345         *
 346         * There is already an smp_mb__after_atomic() before
 347         * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 348         * barrier here.
 349         */
 350        if (!kvm_arch_flush_remote_tlb(kvm)
 351            || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 352                ++kvm->stat.generic.remote_tlb_flush;
 353}
 354EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 355#endif
 356
 357void kvm_reload_remote_mmus(struct kvm *kvm)
 358{
 359        kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 360}
 361
 362#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 363static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 364                                               gfp_t gfp_flags)
 365{
 366        gfp_flags |= mc->gfp_zero;
 367
 368        if (mc->kmem_cache)
 369                return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 370        else
 371                return (void *)__get_free_page(gfp_flags);
 372}
 373
 374int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 375{
 376        void *obj;
 377
 378        if (mc->nobjs >= min)
 379                return 0;
 380        while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
 381                obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
 382                if (!obj)
 383                        return mc->nobjs >= min ? 0 : -ENOMEM;
 384                mc->objects[mc->nobjs++] = obj;
 385        }
 386        return 0;
 387}
 388
 389int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 390{
 391        return mc->nobjs;
 392}
 393
 394void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 395{
 396        while (mc->nobjs) {
 397                if (mc->kmem_cache)
 398                        kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
 399                else
 400                        free_page((unsigned long)mc->objects[--mc->nobjs]);
 401        }
 402}
 403
 404void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 405{
 406        void *p;
 407
 408        if (WARN_ON(!mc->nobjs))
 409                p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
 410        else
 411                p = mc->objects[--mc->nobjs];
 412        BUG_ON(!p);
 413        return p;
 414}
 415#endif
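/*
 * Editor's note -- illustrative sketch only, not upstream code: the memory
 * cache API above is built around a "top up outside the lock, allocate under
 * the lock" pattern.  A hypothetical arch page-fault path might do:
 *
 *	// May sleep: fill the cache before taking mmu_lock.
 *	r = kvm_mmu_topup_memory_cache(&vcpu->arch.example_cache, 4);	// assumed cache field
 *	if (r)
 *		return r;
 *
 *	KVM_MMU_LOCK(kvm);
 *	// Cannot sleep here; this only pops a pre-allocated object.
 *	pt = kvm_mmu_memory_cache_alloc(&vcpu->arch.example_cache);
 *	KVM_MMU_UNLOCK(kvm);
 *
 * kvm_mmu_memory_cache_alloc() only falls back to GFP_ATOMIC if the cache is
 * unexpectedly empty, hence the WARN_ON() above.
 */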
 416
 417static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 418{
 419        mutex_init(&vcpu->mutex);
 420        vcpu->cpu = -1;
 421        vcpu->kvm = kvm;
 422        vcpu->vcpu_id = id;
 423        vcpu->pid = NULL;
 424        rcuwait_init(&vcpu->wait);
 425        kvm_async_pf_vcpu_init(vcpu);
 426
 427        vcpu->pre_pcpu = -1;
 428        INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
 429
 430        kvm_vcpu_set_in_spin_loop(vcpu, false);
 431        kvm_vcpu_set_dy_eligible(vcpu, false);
 432        vcpu->preempted = false;
 433        vcpu->ready = false;
 434        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 435        vcpu->last_used_slot = 0;
 436}
 437
 438void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 439{
 440        kvm_dirty_ring_free(&vcpu->dirty_ring);
 441        kvm_arch_vcpu_destroy(vcpu);
 442
 443        /*
 444         * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 445         * the vcpu->pid pointer, and at destruction time all file descriptors
 446         * are already gone.
 447         */
 448        put_pid(rcu_dereference_protected(vcpu->pid, 1));
 449
 450        free_page((unsigned long)vcpu->run);
 451        kmem_cache_free(kvm_vcpu_cache, vcpu);
 452}
 453EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
 454
 455#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 456static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 457{
 458        return container_of(mn, struct kvm, mmu_notifier);
 459}
 460
 461static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 462                                              struct mm_struct *mm,
 463                                              unsigned long start, unsigned long end)
 464{
 465        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 466        int idx;
 467
 468        idx = srcu_read_lock(&kvm->srcu);
 469        kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
 470        srcu_read_unlock(&kvm->srcu, idx);
 471}
 472
 473typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 474
 475typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
 476                             unsigned long end);
 477
 478struct kvm_hva_range {
 479        unsigned long start;
 480        unsigned long end;
 481        pte_t pte;
 482        hva_handler_t handler;
 483        on_lock_fn_t on_lock;
 484        bool flush_on_ret;
 485        bool may_block;
 486};
 487
 488/*
 489 * Use a dedicated stub instead of NULL to indicate that there is no callback
 490 * function/handler.  The compiler technically can't guarantee that a real
 491 * function will have a non-zero address, and so it will generate code to
 492 * check for !NULL, whereas comparing against a stub will be elided at compile
 493 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 494 */
 495static void kvm_null_fn(void)
 496{
 497
 498}
 499#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 500
 501static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 502                                                  const struct kvm_hva_range *range)
 503{
 504        bool ret = false, locked = false;
 505        struct kvm_gfn_range gfn_range;
 506        struct kvm_memory_slot *slot;
 507        struct kvm_memslots *slots;
 508        int i, idx;
 509
 510        /* A null handler is allowed if and only if on_lock() is provided. */
 511        if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 512                         IS_KVM_NULL_FN(range->handler)))
 513                return 0;
 514
 515        idx = srcu_read_lock(&kvm->srcu);
 516
 517        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 518                slots = __kvm_memslots(kvm, i);
 519                kvm_for_each_memslot(slot, slots) {
 520                        unsigned long hva_start, hva_end;
 521
 522                        hva_start = max(range->start, slot->userspace_addr);
 523                        hva_end = min(range->end, slot->userspace_addr +
 524                                                  (slot->npages << PAGE_SHIFT));
 525                        if (hva_start >= hva_end)
 526                                continue;
 527
 528                        /*
 529                         * To optimize for the likely case where the address
 530                         * range is covered by zero or one memslots, don't
 531                         * bother making these conditional (to avoid writes on
 532                         * the second or later invocation of the handler).
 533                         */
 534                        gfn_range.pte = range->pte;
 535                        gfn_range.may_block = range->may_block;
 536
 537                        /*
 538                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
 539                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 540                         */
 541                        gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
 542                        gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 543                        gfn_range.slot = slot;
 544
 545                        if (!locked) {
 546                                locked = true;
 547                                KVM_MMU_LOCK(kvm);
 548                                if (!IS_KVM_NULL_FN(range->on_lock))
 549                                        range->on_lock(kvm, range->start, range->end);
 550                                if (IS_KVM_NULL_FN(range->handler))
 551                                        break;
 552                        }
 553                        ret |= range->handler(kvm, &gfn_range);
 554                }
 555        }
 556
 557        if (range->flush_on_ret && ret)
 558                kvm_flush_remote_tlbs(kvm);
 559
 560        if (locked)
 561                KVM_MMU_UNLOCK(kvm);
 562
 563        srcu_read_unlock(&kvm->srcu, idx);
 564
 565        /* The notifiers are averse to booleans. :-( */
 566        return (int)ret;
 567}
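/*
 * Editor's note -- worked example (illustrative only, hypothetical numbers):
 * with PAGE_SIZE == 4K and a sufficiently large memslot with
 * userspace_addr == 0x100000 and base_gfn == 0x800, an invalidation of
 * hvas [0x101800, 0x103000) is converted as follows:
 *
 *	hva_start = max(0x101800, 0x100000)                      = 0x101800
 *	hva_end   = min(0x103000, slot end)                      = 0x103000
 *	gfn_range.start = hva_to_gfn_memslot(0x101800)           = 0x801
 *	gfn_range.end   = hva_to_gfn_memslot(0x103000 + 4K - 1)  = 0x803
 *
 * i.e. the handler sees the half-open gfn range [0x801, 0x803), exactly the
 * set of guest pages whose backing hvas intersect the invalidated range.
 */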
 568
 569static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 570                                                unsigned long start,
 571                                                unsigned long end,
 572                                                pte_t pte,
 573                                                hva_handler_t handler)
 574{
 575        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 576        const struct kvm_hva_range range = {
 577                .start          = start,
 578                .end            = end,
 579                .pte            = pte,
 580                .handler        = handler,
 581                .on_lock        = (void *)kvm_null_fn,
 582                .flush_on_ret   = true,
 583                .may_block      = false,
 584        };
 585
 586        return __kvm_handle_hva_range(kvm, &range);
 587}
 588
 589static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
 590                                                         unsigned long start,
 591                                                         unsigned long end,
 592                                                         hva_handler_t handler)
 593{
 594        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 595        const struct kvm_hva_range range = {
 596                .start          = start,
 597                .end            = end,
 598                .pte            = __pte(0),
 599                .handler        = handler,
 600                .on_lock        = (void *)kvm_null_fn,
 601                .flush_on_ret   = false,
 602                .may_block      = false,
 603        };
 604
 605        return __kvm_handle_hva_range(kvm, &range);
 606}
 607static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 608                                        struct mm_struct *mm,
 609                                        unsigned long address,
 610                                        pte_t pte)
 611{
 612        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 613
 614        trace_kvm_set_spte_hva(address);
 615
 616        /*
 617         * .change_pte() must be surrounded by .invalidate_range_{start,end}().
 618         * If mmu_notifier_count is zero, then no in-progress invalidations,
 619         * including this one, found a relevant memslot at start(); rechecking
 620         * memslots here is unnecessary.  Note, a false positive (count elevated
 621         * by a different invalidation) is sub-optimal but functionally ok.
 622         */
 623        WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 624        if (!READ_ONCE(kvm->mmu_notifier_count))
 625                return;
 626
 627        kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 628}
 629
 630void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 631                                   unsigned long end)
 632{
 633        /*
 634         * The count increase must become visible at unlock time as no
 635         * spte can be established without taking the mmu_lock and
 636         * count is also read inside the mmu_lock critical section.
 637         */
 638        kvm->mmu_notifier_count++;
 639        if (likely(kvm->mmu_notifier_count == 1)) {
 640                kvm->mmu_notifier_range_start = start;
 641                kvm->mmu_notifier_range_end = end;
 642        } else {
 643                /*
  644                 * Fully tracking multiple concurrent ranges has diminishing
 645                 * returns. Keep things simple and just find the minimal range
 646                 * which includes the current and new ranges. As there won't be
 647                 * enough information to subtract a range after its invalidate
 648                 * completes, any ranges invalidated concurrently will
 649                 * accumulate and persist until all outstanding invalidates
 650                 * complete.
 651                 */
 652                kvm->mmu_notifier_range_start =
 653                        min(kvm->mmu_notifier_range_start, start);
 654                kvm->mmu_notifier_range_end =
 655                        max(kvm->mmu_notifier_range_end, end);
 656        }
 657}
 658
 659static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 660                                        const struct mmu_notifier_range *range)
 661{
 662        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 663        const struct kvm_hva_range hva_range = {
 664                .start          = range->start,
 665                .end            = range->end,
 666                .pte            = __pte(0),
 667                .handler        = kvm_unmap_gfn_range,
 668                .on_lock        = kvm_inc_notifier_count,
 669                .flush_on_ret   = true,
 670                .may_block      = mmu_notifier_range_blockable(range),
 671        };
 672
 673        trace_kvm_unmap_hva_range(range->start, range->end);
 674
 675        /*
 676         * Prevent memslot modification between range_start() and range_end()
 677         * so that conditionally locking provides the same result in both
 678         * functions.  Without that guarantee, the mmu_notifier_count
 679         * adjustments will be imbalanced.
 680         *
 681         * Pairs with the decrement in range_end().
 682         */
 683        spin_lock(&kvm->mn_invalidate_lock);
 684        kvm->mn_active_invalidate_count++;
 685        spin_unlock(&kvm->mn_invalidate_lock);
 686
 687        __kvm_handle_hva_range(kvm, &hva_range);
 688
 689        return 0;
 690}
 691
 692void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
 693                                   unsigned long end)
 694{
 695        /*
  696         * This sequence increase will notify the kvm page fault handler
  697         * that the page that is about to be mapped in the spte could have
  698         * been freed.
 699         */
 700        kvm->mmu_notifier_seq++;
 701        smp_wmb();
 702        /*
 703         * The above sequence increase must be visible before the
 704         * below count decrease, which is ensured by the smp_wmb above
 705         * in conjunction with the smp_rmb in mmu_notifier_retry().
 706         */
 707        kvm->mmu_notifier_count--;
 708}
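/*
 * Editor's note -- illustrative sketch only, not upstream code: the
 * mmu_notifier_seq / mmu_notifier_count pair above is consumed by arch page
 * fault handlers roughly as follows (simplified):
 *
 *	mmu_seq = kvm->mmu_notifier_seq;
 *	smp_rmb();
 *	pfn = look_up_and_maybe_fault_in_pfn(...);	// may sleep, no mmu_lock held
 *
 *	KVM_MMU_LOCK(kvm);
 *	if (mmu_notifier_retry(kvm, mmu_seq)) {
 *		// An invalidation ran (or is running) since the snapshot,
 *		// so the pfn may be stale; bail out and retry the fault.
 *		goto out_unlock;
 *	}
 *	// Safe to install the spte for pfn here.
 *
 * The smp_wmb()/smp_rmb() pairing is what makes the sequence bump visible
 * before the count decrement, as described in the comment above.
 */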
 709
 710static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 711                                        const struct mmu_notifier_range *range)
 712{
 713        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 714        const struct kvm_hva_range hva_range = {
 715                .start          = range->start,
 716                .end            = range->end,
 717                .pte            = __pte(0),
 718                .handler        = (void *)kvm_null_fn,
 719                .on_lock        = kvm_dec_notifier_count,
 720                .flush_on_ret   = false,
 721                .may_block      = mmu_notifier_range_blockable(range),
 722        };
 723        bool wake;
 724
 725        __kvm_handle_hva_range(kvm, &hva_range);
 726
 727        /* Pairs with the increment in range_start(). */
 728        spin_lock(&kvm->mn_invalidate_lock);
 729        wake = (--kvm->mn_active_invalidate_count == 0);
 730        spin_unlock(&kvm->mn_invalidate_lock);
 731
 732        /*
 733         * There can only be one waiter, since the wait happens under
 734         * slots_lock.
 735         */
 736        if (wake)
 737                rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 738
 739        BUG_ON(kvm->mmu_notifier_count < 0);
 740}
 741
 742static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 743                                              struct mm_struct *mm,
 744                                              unsigned long start,
 745                                              unsigned long end)
 746{
 747        trace_kvm_age_hva(start, end);
 748
 749        return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 750}
 751
 752static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 753                                        struct mm_struct *mm,
 754                                        unsigned long start,
 755                                        unsigned long end)
 756{
 757        trace_kvm_age_hva(start, end);
 758
 759        /*
 760         * Even though we do not flush TLB, this will still adversely
 761         * affect performance on pre-Haswell Intel EPT, where there is
  762         * no EPT Access Bit to clear, so we have to tear down EPT
 763         * tables instead. If we find this unacceptable, we can always
 764         * add a parameter to kvm_age_hva so that it effectively doesn't
 765         * do anything on clear_young.
 766         *
 767         * Also note that currently we never issue secondary TLB flushes
 768         * from clear_young, leaving this job up to the regular system
 769         * cadence. If we find this inaccurate, we might come up with a
 770         * more sophisticated heuristic later.
 771         */
 772        return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 773}
 774
 775static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 776                                       struct mm_struct *mm,
 777                                       unsigned long address)
 778{
 779        trace_kvm_test_age_hva(address);
 780
 781        return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 782                                             kvm_test_age_gfn);
 783}
 784
 785static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 786                                     struct mm_struct *mm)
 787{
 788        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 789        int idx;
 790
 791        idx = srcu_read_lock(&kvm->srcu);
 792        kvm_arch_flush_shadow_all(kvm);
 793        srcu_read_unlock(&kvm->srcu, idx);
 794}
 795
 796static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 797        .invalidate_range       = kvm_mmu_notifier_invalidate_range,
 798        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 799        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 800        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 801        .clear_young            = kvm_mmu_notifier_clear_young,
 802        .test_young             = kvm_mmu_notifier_test_young,
 803        .change_pte             = kvm_mmu_notifier_change_pte,
 804        .release                = kvm_mmu_notifier_release,
 805};
 806
 807static int kvm_init_mmu_notifier(struct kvm *kvm)
 808{
 809        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 810        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 811}
 812
 813#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 814
 815static int kvm_init_mmu_notifier(struct kvm *kvm)
 816{
 817        return 0;
 818}
 819
 820#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 821
 822#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 823static int kvm_pm_notifier_call(struct notifier_block *bl,
 824                                unsigned long state,
 825                                void *unused)
 826{
 827        struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 828
 829        return kvm_arch_pm_notifier(kvm, state);
 830}
 831
 832static void kvm_init_pm_notifier(struct kvm *kvm)
 833{
 834        kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
 835        /* Suspend KVM before we suspend ftrace, RCU, etc. */
 836        kvm->pm_notifier.priority = INT_MAX;
 837        register_pm_notifier(&kvm->pm_notifier);
 838}
 839
 840static void kvm_destroy_pm_notifier(struct kvm *kvm)
 841{
 842        unregister_pm_notifier(&kvm->pm_notifier);
 843}
 844#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
 845static void kvm_init_pm_notifier(struct kvm *kvm)
 846{
 847}
 848
 849static void kvm_destroy_pm_notifier(struct kvm *kvm)
 850{
 851}
 852#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 853
 854static struct kvm_memslots *kvm_alloc_memslots(void)
 855{
 856        int i;
 857        struct kvm_memslots *slots;
 858
 859        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
 860        if (!slots)
 861                return NULL;
 862
 863        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
 864                slots->id_to_index[i] = -1;
 865
 866        return slots;
 867}
 868
 869static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 870{
 871        if (!memslot->dirty_bitmap)
 872                return;
 873
 874        kvfree(memslot->dirty_bitmap);
 875        memslot->dirty_bitmap = NULL;
 876}
 877
 878static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 879{
 880        kvm_destroy_dirty_bitmap(slot);
 881
 882        kvm_arch_free_memslot(kvm, slot);
 883
 884        slot->flags = 0;
 885        slot->npages = 0;
 886}
 887
 888static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 889{
 890        struct kvm_memory_slot *memslot;
 891
 892        if (!slots)
 893                return;
 894
 895        kvm_for_each_memslot(memslot, slots)
 896                kvm_free_memslot(kvm, memslot);
 897
 898        kvfree(slots);
 899}
 900
 901static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
 902{
 903        switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
 904        case KVM_STATS_TYPE_INSTANT:
 905                return 0444;
 906        case KVM_STATS_TYPE_CUMULATIVE:
 907        case KVM_STATS_TYPE_PEAK:
 908        default:
 909                return 0644;
 910        }
 911}
 912
 913
 914static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 915{
 916        int i;
 917        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
 918                                      kvm_vcpu_stats_header.num_desc;
 919
 920        if (!kvm->debugfs_dentry)
 921                return;
 922
 923        debugfs_remove_recursive(kvm->debugfs_dentry);
 924
 925        if (kvm->debugfs_stat_data) {
 926                for (i = 0; i < kvm_debugfs_num_entries; i++)
 927                        kfree(kvm->debugfs_stat_data[i]);
 928                kfree(kvm->debugfs_stat_data);
 929        }
 930}
 931
 932static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 933{
 934        static DEFINE_MUTEX(kvm_debugfs_lock);
 935        struct dentry *dent;
 936        char dir_name[ITOA_MAX_LEN * 2];
 937        struct kvm_stat_data *stat_data;
 938        const struct _kvm_stats_desc *pdesc;
 939        int i, ret;
 940        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
 941                                      kvm_vcpu_stats_header.num_desc;
 942
 943        if (!debugfs_initialized())
 944                return 0;
 945
 946        snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
 947        mutex_lock(&kvm_debugfs_lock);
 948        dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
 949        if (dent) {
 950                pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
 951                dput(dent);
 952                mutex_unlock(&kvm_debugfs_lock);
 953                return 0;
 954        }
 955        dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
 956        mutex_unlock(&kvm_debugfs_lock);
 957        if (IS_ERR(dent))
 958                return 0;
 959
 960        kvm->debugfs_dentry = dent;
 961        kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
 962                                         sizeof(*kvm->debugfs_stat_data),
 963                                         GFP_KERNEL_ACCOUNT);
 964        if (!kvm->debugfs_stat_data)
 965                return -ENOMEM;
 966
 967        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
 968                pdesc = &kvm_vm_stats_desc[i];
 969                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 970                if (!stat_data)
 971                        return -ENOMEM;
 972
 973                stat_data->kvm = kvm;
 974                stat_data->desc = pdesc;
 975                stat_data->kind = KVM_STAT_VM;
 976                kvm->debugfs_stat_data[i] = stat_data;
 977                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
 978                                    kvm->debugfs_dentry, stat_data,
 979                                    &stat_fops_per_vm);
 980        }
 981
 982        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
 983                pdesc = &kvm_vcpu_stats_desc[i];
 984                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 985                if (!stat_data)
 986                        return -ENOMEM;
 987
 988                stat_data->kvm = kvm;
 989                stat_data->desc = pdesc;
 990                stat_data->kind = KVM_STAT_VCPU;
 991                kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
 992                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
 993                                    kvm->debugfs_dentry, stat_data,
 994                                    &stat_fops_per_vm);
 995        }
 996
 997        ret = kvm_arch_create_vm_debugfs(kvm);
 998        if (ret) {
 999                kvm_destroy_vm_debugfs(kvm);
 1000                return ret;
1001        }
1002
1003        return 0;
1004}
1005
1006/*
1007 * Called after the VM is otherwise initialized, but just before adding it to
1008 * the vm_list.
1009 */
1010int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1011{
1012        return 0;
1013}
1014
1015/*
1016 * Called just after removing the VM from the vm_list, but before doing any
1017 * other destruction.
1018 */
1019void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1020{
1021}
1022
1023/*
 1024 * Called after the per-VM debugfs directory is created.  kvm->debugfs_dentry
 1025 * should already be set up when this is called, so arch-specific debugfs
 1026 * entries can be created under it.  Cleanup is done automatically and
 1027 * recursively in kvm_destroy_vm_debugfs(), so no per-arch destroy interface is needed.
1028 */
1029int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1030{
1031        return 0;
1032}
1033
1034static struct kvm *kvm_create_vm(unsigned long type)
1035{
1036        struct kvm *kvm = kvm_arch_alloc_vm();
1037        int r = -ENOMEM;
1038        int i;
1039
1040        if (!kvm)
1041                return ERR_PTR(-ENOMEM);
1042
1043        KVM_MMU_LOCK_INIT(kvm);
1044        mmgrab(current->mm);
1045        kvm->mm = current->mm;
1046        kvm_eventfd_init(kvm);
1047        mutex_init(&kvm->lock);
1048        mutex_init(&kvm->irq_lock);
1049        mutex_init(&kvm->slots_lock);
1050        mutex_init(&kvm->slots_arch_lock);
1051        spin_lock_init(&kvm->mn_invalidate_lock);
1052        rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1053
1054        INIT_LIST_HEAD(&kvm->devices);
1055
1056        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1057
1058        if (init_srcu_struct(&kvm->srcu))
1059                goto out_err_no_srcu;
1060        if (init_srcu_struct(&kvm->irq_srcu))
1061                goto out_err_no_irq_srcu;
1062
1063        refcount_set(&kvm->users_count, 1);
1064        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1065                struct kvm_memslots *slots = kvm_alloc_memslots();
1066
1067                if (!slots)
1068                        goto out_err_no_arch_destroy_vm;
1069                /* Generations must be different for each address space. */
1070                slots->generation = i;
1071                rcu_assign_pointer(kvm->memslots[i], slots);
1072        }
1073
1074        for (i = 0; i < KVM_NR_BUSES; i++) {
1075                rcu_assign_pointer(kvm->buses[i],
1076                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1077                if (!kvm->buses[i])
1078                        goto out_err_no_arch_destroy_vm;
1079        }
1080
1081        kvm->max_halt_poll_ns = halt_poll_ns;
1082
1083        r = kvm_arch_init_vm(kvm, type);
1084        if (r)
1085                goto out_err_no_arch_destroy_vm;
1086
1087        r = hardware_enable_all();
1088        if (r)
1089                goto out_err_no_disable;
1090
1091#ifdef CONFIG_HAVE_KVM_IRQFD
1092        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1093#endif
1094
1095        r = kvm_init_mmu_notifier(kvm);
1096        if (r)
1097                goto out_err_no_mmu_notifier;
1098
1099        r = kvm_arch_post_init_vm(kvm);
1100        if (r)
1101                goto out_err;
1102
1103        mutex_lock(&kvm_lock);
1104        list_add(&kvm->vm_list, &vm_list);
1105        mutex_unlock(&kvm_lock);
1106
1107        preempt_notifier_inc();
1108        kvm_init_pm_notifier(kvm);
1109
1110        return kvm;
1111
1112out_err:
1113#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1114        if (kvm->mmu_notifier.ops)
1115                mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1116#endif
1117out_err_no_mmu_notifier:
1118        hardware_disable_all();
1119out_err_no_disable:
1120        kvm_arch_destroy_vm(kvm);
1121out_err_no_arch_destroy_vm:
1122        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1123        for (i = 0; i < KVM_NR_BUSES; i++)
1124                kfree(kvm_get_bus(kvm, i));
1125        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1126                kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1127        cleanup_srcu_struct(&kvm->irq_srcu);
1128out_err_no_irq_srcu:
1129        cleanup_srcu_struct(&kvm->srcu);
1130out_err_no_srcu:
1131        kvm_arch_free_vm(kvm);
1132        mmdrop(current->mm);
1133        return ERR_PTR(r);
1134}
1135
1136static void kvm_destroy_devices(struct kvm *kvm)
1137{
1138        struct kvm_device *dev, *tmp;
1139
1140        /*
1141         * We do not need to take the kvm->lock here, because nobody else
1142         * has a reference to the struct kvm at this point and therefore
1143         * cannot access the devices list anyhow.
1144         */
1145        list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1146                list_del(&dev->vm_node);
1147                dev->ops->destroy(dev);
1148        }
1149}
1150
1151static void kvm_destroy_vm(struct kvm *kvm)
1152{
1153        int i;
1154        struct mm_struct *mm = kvm->mm;
1155
1156        kvm_destroy_pm_notifier(kvm);
1157        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1158        kvm_destroy_vm_debugfs(kvm);
1159        kvm_arch_sync_events(kvm);
1160        mutex_lock(&kvm_lock);
1161        list_del(&kvm->vm_list);
1162        mutex_unlock(&kvm_lock);
1163        kvm_arch_pre_destroy_vm(kvm);
1164
1165        kvm_free_irq_routing(kvm);
1166        for (i = 0; i < KVM_NR_BUSES; i++) {
1167                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1168
1169                if (bus)
1170                        kvm_io_bus_destroy(bus);
1171                kvm->buses[i] = NULL;
1172        }
1173        kvm_coalesced_mmio_free(kvm);
1174#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1175        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1176        /*
1177         * At this point, pending calls to invalidate_range_start()
1178         * have completed but no more MMU notifiers will run, so
1179         * mn_active_invalidate_count may remain unbalanced.
1180         * No threads can be waiting in install_new_memslots as the
1181         * last reference on KVM has been dropped, but freeing
1182         * memslots would deadlock without this manual intervention.
1183         */
1184        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1185        kvm->mn_active_invalidate_count = 0;
1186#else
1187        kvm_arch_flush_shadow_all(kvm);
1188#endif
1189        kvm_arch_destroy_vm(kvm);
1190        kvm_destroy_devices(kvm);
1191        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1192                kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1193        cleanup_srcu_struct(&kvm->irq_srcu);
1194        cleanup_srcu_struct(&kvm->srcu);
1195        kvm_arch_free_vm(kvm);
1196        preempt_notifier_dec();
1197        hardware_disable_all();
1198        mmdrop(mm);
1199}
1200
1201void kvm_get_kvm(struct kvm *kvm)
1202{
1203        refcount_inc(&kvm->users_count);
1204}
1205EXPORT_SYMBOL_GPL(kvm_get_kvm);
1206
1207/*
 1208 * Make sure the vm is not in the middle of destruction; this is a safe version
 1209 * of kvm_get_kvm().  Returns true if kvm was referenced successfully, false otherwise.
1210 */
1211bool kvm_get_kvm_safe(struct kvm *kvm)
1212{
1213        return refcount_inc_not_zero(&kvm->users_count);
1214}
1215EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1216
1217void kvm_put_kvm(struct kvm *kvm)
1218{
1219        if (refcount_dec_and_test(&kvm->users_count))
1220                kvm_destroy_vm(kvm);
1221}
1222EXPORT_SYMBOL_GPL(kvm_put_kvm);
1223
1224/*
1225 * Used to put a reference that was taken on behalf of an object associated
1226 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1227 * of the new file descriptor fails and the reference cannot be transferred to
1228 * its final owner.  In such cases, the caller is still actively using @kvm and
1229 * will fail miserably if the refcount unexpectedly hits zero.
1230 */
1231void kvm_put_kvm_no_destroy(struct kvm *kvm)
1232{
1233        WARN_ON(refcount_dec_and_test(&kvm->users_count));
1234}
1235EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
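/*
 * Editor's note -- illustrative sketch only, not upstream code: the typical
 * consumer of kvm_put_kvm_no_destroy() takes a reference on behalf of a new
 * file descriptor and only "un-takes" it if installing the fd fails, e.g.:
 *
 *	kvm_get_kvm(kvm);			// ref owned by the new fd
 *	fd = anon_inode_getfd("example", &example_fops, obj, O_RDWR | O_CLOEXEC);
 *	if (fd < 0) {
 *		kvm_put_kvm_no_destroy(kvm);	// caller still holds its own ref,
 *						// so the count cannot hit zero
 *		return fd;
 *	}
 *
 * "example", example_fops and obj are placeholders; the real device and vcpu
 * fd creation paths follow this shape.
 */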
1236
1237static int kvm_vm_release(struct inode *inode, struct file *filp)
1238{
1239        struct kvm *kvm = filp->private_data;
1240
1241        kvm_irqfd_release(kvm);
1242
1243        kvm_put_kvm(kvm);
1244        return 0;
1245}
1246
1247/*
1248 * Allocation size is twice as large as the actual dirty bitmap size.
 1249 * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
1250 */
1251static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1252{
1253        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1254
1255        memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1256        if (!memslot->dirty_bitmap)
1257                return -ENOMEM;
1258
1259        return 0;
1260}
1261
1262/*
1263 * Delete a memslot by decrementing the number of used slots and shifting all
1264 * other entries in the array forward one spot.
1265 */
1266static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1267                                      struct kvm_memory_slot *memslot)
1268{
1269        struct kvm_memory_slot *mslots = slots->memslots;
1270        int i;
1271
1272        if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1273                return;
1274
1275        slots->used_slots--;
1276
1277        if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
1278                atomic_set(&slots->last_used_slot, 0);
1279
1280        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1281                mslots[i] = mslots[i + 1];
1282                slots->id_to_index[mslots[i].id] = i;
1283        }
1284        mslots[i] = *memslot;
1285        slots->id_to_index[memslot->id] = -1;
1286}
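/*
 * Editor's note -- worked example (illustrative only, hypothetical slots):
 * deleting slot B from a 3-entry array sorted by GFN:
 *
 *	before: memslots[] = { A, B, C }, used_slots = 3
 *	after:  memslots[] = { A, C, B'}, used_slots = 2
 *
 * C is shifted forward into B's position (and id_to_index[C] updated), the
 * incoming to-be-deleted descriptor B' is copied into the now-unused tail
 * entry, and id_to_index[B] is set to -1 so lookups by id fail.
 */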
1287
1288/*
1289 * "Insert" a new memslot by incrementing the number of used slots.  Returns
1290 * the new slot's initial index into the memslots array.
1291 */
1292static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1293{
1294        return slots->used_slots++;
1295}
1296
1297/*
1298 * Move a changed memslot backwards in the array by shifting existing slots
1299 * with a higher GFN toward the front of the array.  Note, the changed memslot
1300 * itself is not preserved in the array, i.e. not swapped at this time, only
1301 * its new index into the array is tracked.  Returns the changed memslot's
1302 * current index into the memslots array.
1303 */
1304static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1305                                            struct kvm_memory_slot *memslot)
1306{
1307        struct kvm_memory_slot *mslots = slots->memslots;
1308        int i;
1309
1310        if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1311            WARN_ON_ONCE(!slots->used_slots))
1312                return -1;
1313
1314        /*
1315         * Move the target memslot backward in the array by shifting existing
1316         * memslots with a higher GFN (than the target memslot) towards the
1317         * front of the array.
1318         */
1319        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1320                if (memslot->base_gfn > mslots[i + 1].base_gfn)
1321                        break;
1322
1323                WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1324
1325                /* Shift the next memslot forward one and update its index. */
1326                mslots[i] = mslots[i + 1];
1327                slots->id_to_index[mslots[i].id] = i;
1328        }
1329        return i;
1330}
1331
1332/*
1333 * Move a changed memslot forwards in the array by shifting existing slots with
1334 * a lower GFN toward the back of the array.  Note, the changed memslot itself
1335 * is not preserved in the array, i.e. not swapped at this time, only its new
1336 * index into the array is tracked.  Returns the changed memslot's final index
1337 * into the memslots array.
1338 */
1339static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1340                                           struct kvm_memory_slot *memslot,
1341                                           int start)
1342{
1343        struct kvm_memory_slot *mslots = slots->memslots;
1344        int i;
1345
1346        for (i = start; i > 0; i--) {
1347                if (memslot->base_gfn < mslots[i - 1].base_gfn)
1348                        break;
1349
1350                WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1351
1352                /* Shift the next memslot back one and update its index. */
1353                mslots[i] = mslots[i - 1];
1354                slots->id_to_index[mslots[i].id] = i;
1355        }
1356        return i;
1357}
1358
1359/*
1360 * Re-sort memslots based on their GFN to account for an added, deleted, or
1361 * moved memslot.  Sorting memslots by GFN allows using a binary search during
1362 * memslot lookup.
1363 *
1364 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
1365 * at memslots[0] has the highest GFN.
1366 *
1367 * The sorting algorithm takes advantage of having initially sorted memslots
1368 * and knowing the position of the changed memslot.  Sorting is also optimized
1369 * by not swapping the updated memslot and instead only shifting other memslots
 1370 * and tracking the new index for the updated memslot.  Only once its final
1371 * index is known is the updated memslot copied into its position in the array.
1372 *
1373 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
1374 *    the end of the array.
1375 *
1376 *  - When creating a memslot, the algorithm "inserts" the new memslot at the
 1377 *    end of the array and then moves it forward to its correct location.
1378 *
1379 *  - When moving a memslot, the algorithm first moves the updated memslot
1380 *    backward to handle the scenario where the memslot's GFN was changed to a
1381 *    lower value.  update_memslots() then falls through and runs the same flow
1382 *    as creating a memslot to move the memslot forward to handle the scenario
1383 *    where its GFN was changed to a higher value.
1384 *
1385 * Note, slots are sorted from highest->lowest instead of lowest->highest for
 1386 * historical reasons.  Originally, invalid memslots were denoted by having
1387 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
1388 * to the end of the array.  The current algorithm uses dedicated logic to
1389 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1390 *
 1391 * The other historical motivation for highest->lowest was to improve the
1392 * performance of memslot lookup.  KVM originally used a linear search starting
1393 * at memslots[0].  On x86, the largest memslot usually has one of the highest,
1394 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
1395 * single memslot above the 4gb boundary.  As the largest memslot is also the
1396 * most likely to be referenced, sorting it to the front of the array was
1397 * advantageous.  The current binary search starts from the middle of the array
1398 * and uses an LRU pointer to improve performance for all memslots and GFNs.
1399 */
1400static void update_memslots(struct kvm_memslots *slots,
1401                            struct kvm_memory_slot *memslot,
1402                            enum kvm_mr_change change)
1403{
1404        int i;
1405
1406        if (change == KVM_MR_DELETE) {
1407                kvm_memslot_delete(slots, memslot);
1408        } else {
1409                if (change == KVM_MR_CREATE)
1410                        i = kvm_memslot_insert_back(slots);
1411                else
1412                        i = kvm_memslot_move_backward(slots, memslot);
1413                i = kvm_memslot_move_forward(slots, memslot, i);
1414
1415                /*
1416                 * Copy the memslot to its new position in memslots and update
1417                 * its index accordingly.
1418                 */
1419                slots->memslots[i] = *memslot;
1420                slots->id_to_index[memslot->id] = i;
1421        }
1422}
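/*
 * Editor's note -- worked example (illustrative only, hypothetical GFNs):
 * with slots kept sorted from highest to lowest base_gfn, e.g.
 *
 *	memslots[] = { 0x100000, 0x1000, 0x0 }
 *
 * creating a new slot with base_gfn 0x2000 first "inserts" it at the end
 * (index 3), then kvm_memslot_move_forward() shifts the 0x1000 and 0x0
 * entries back one position and settles the new slot at index 1, preserving
 * the descending order:
 *
 *	memslots[] = { 0x100000, 0x2000, 0x1000, 0x0 }
 */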
1423
1424static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1425{
1426        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1427
1428#ifdef __KVM_HAVE_READONLY_MEM
1429        valid_flags |= KVM_MEM_READONLY;
1430#endif
1431
1432        if (mem->flags & ~valid_flags)
1433                return -EINVAL;
1434
1435        return 0;
1436}
1437
1438static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1439                int as_id, struct kvm_memslots *slots)
1440{
1441        struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1442        u64 gen = old_memslots->generation;
1443
1444        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1445        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1446
1447        /*
1448         * Do not store the new memslots while there are invalidations in
1449         * progress, otherwise the locking in invalidate_range_start and
1450         * invalidate_range_end will be unbalanced.
1451         */
1452        spin_lock(&kvm->mn_invalidate_lock);
1453        prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1454        while (kvm->mn_active_invalidate_count) {
1455                set_current_state(TASK_UNINTERRUPTIBLE);
1456                spin_unlock(&kvm->mn_invalidate_lock);
1457                schedule();
1458                spin_lock(&kvm->mn_invalidate_lock);
1459        }
1460        finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1461        rcu_assign_pointer(kvm->memslots[as_id], slots);
1462        spin_unlock(&kvm->mn_invalidate_lock);
1463
1464        /*
 1465         * Acquired in kvm_set_memslot. Must be released before the SRCU
 1466         * synchronization below in order to avoid deadlock with another thread
1467         * acquiring the slots_arch_lock in an srcu critical section.
1468         */
1469        mutex_unlock(&kvm->slots_arch_lock);
1470
1471        synchronize_srcu_expedited(&kvm->srcu);
1472
1473        /*
1474         * Increment the new memslot generation a second time, dropping the
1475         * update in-progress flag and incrementing the generation based on
1476         * the number of address spaces.  This provides a unique and easily
1477         * identifiable generation number while the memslots are in flux.
1478         */
1479        gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1480
1481        /*
1482         * Generations must be unique even across address spaces.  We do not need
1483         * a global counter for that, instead the generation space is evenly split
1484         * across address spaces.  For example, with two address spaces, address
1485         * space 0 will use generations 0, 2, 4, ... while address space 1 will
1486         * use generations 1, 3, 5, ...
1487         */
1488        gen += KVM_ADDRESS_SPACE_NUM;
1489
1490        kvm_arch_memslots_updated(kvm, gen);
1491
1492        slots->generation = gen;
1493
1494        return old_memslots;
1495}
1496
1497static size_t kvm_memslots_size(int slots)
1498{
1499        return sizeof(struct kvm_memslots) +
1500               (sizeof(struct kvm_memory_slot) * slots);
1501}
1502
1503static void kvm_copy_memslots(struct kvm_memslots *to,
1504                              struct kvm_memslots *from)
1505{
1506        memcpy(to, from, kvm_memslots_size(from->used_slots));
1507}
1508
1509/*
1510 * Note, at a minimum, the current number of used slots must be allocated, even
1511 * when deleting a memslot, as we need a complete duplicate of the memslots for
1512 * use when invalidating a memslot prior to deleting/moving the memslot.
1513 */
1514static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1515                                             enum kvm_mr_change change)
1516{
1517        struct kvm_memslots *slots;
1518        size_t new_size;
1519
1520        if (change == KVM_MR_CREATE)
1521                new_size = kvm_memslots_size(old->used_slots + 1);
1522        else
1523                new_size = kvm_memslots_size(old->used_slots);
1524
1525        slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1526        if (likely(slots))
1527                kvm_copy_memslots(slots, old);
1528
1529        return slots;
1530}
1531
1532static int kvm_set_memslot(struct kvm *kvm,
1533                           const struct kvm_userspace_memory_region *mem,
1534                           struct kvm_memory_slot *new, int as_id,
1535                           enum kvm_mr_change change)
1536{
1537        struct kvm_memory_slot *slot, old;
1538        struct kvm_memslots *slots;
1539        int r;
1540
1541        /*
1542         * Released in install_new_memslots.
1543         *
1544         * Must be held from before the current memslots are copied until
1545         * after the new memslots are installed with rcu_assign_pointer,
1546         * then released before the synchronize srcu in install_new_memslots.
1547         *
1548         * When modifying memslots outside of the slots_lock, must be held
1549         * before reading the pointer to the current memslots until after all
1550         * changes to those memslots are complete.
1551         *
1552         * These rules ensure that installing new memslots does not lose
1553         * changes made to the previous memslots.
1554         */
1555        mutex_lock(&kvm->slots_arch_lock);
1556
1557        slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1558        if (!slots) {
1559                mutex_unlock(&kvm->slots_arch_lock);
1560                return -ENOMEM;
1561        }
1562
1563        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1564                /*
1565                 * Note, the INVALID flag needs to be in the appropriate entry
1566                 * in the freshly allocated memslots, not in @old or @new.
1567                 */
1568                slot = id_to_memslot(slots, new->id);
1569                slot->flags |= KVM_MEMSLOT_INVALID;
1570
1571                /*
1572                 * We can re-use the memory from the old memslots.
1573                 * It will be overwritten with a copy of the new memslots
1574                 * after reacquiring the slots_arch_lock below.
1575                 */
1576                slots = install_new_memslots(kvm, as_id, slots);
1577
1578                /* From this point no new shadow pages pointing to a deleted,
1579                 * or moved, memslot will be created.
1580                 *
1581                 * validation of sp->gfn happens in:
1582                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1583                 *      - kvm_is_visible_gfn (mmu_check_root)
1584                 */
1585                kvm_arch_flush_shadow_memslot(kvm, slot);
1586
1587                /* Released in install_new_memslots. */
1588                mutex_lock(&kvm->slots_arch_lock);
1589
1590                /*
1591                 * The arch-specific fields of the memslots could have changed
1592                 * between releasing the slots_arch_lock in
1593                 * install_new_memslots and here, so get a fresh copy of the
1594                 * slots.
1595                 */
1596                kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
1597        }
1598
1599        /*
1600         * Make a full copy of the old memslot, the pointer will become stale
1601         * when the memslots are re-sorted by update_memslots(), and the old
1602         * memslot needs to be referenced after calling update_memslots(), e.g.
1603         * to free its resources and for arch specific behavior.  This needs to
1604         * happen *after* (re)acquiring slots_arch_lock.
1605         */
1606        slot = id_to_memslot(slots, new->id);
1607        if (slot) {
1608                old = *slot;
1609        } else {
1610                WARN_ON_ONCE(change != KVM_MR_CREATE);
1611                memset(&old, 0, sizeof(old));
1612                old.id = new->id;
1613                old.as_id = as_id;
1614        }
1615
1616        /* Copy the arch-specific data, again after (re)acquiring slots_arch_lock. */
1617        memcpy(&new->arch, &old.arch, sizeof(old.arch));
1618
1619        r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1620        if (r)
1621                goto out_slots;
1622
1623        update_memslots(slots, new, change);
1624        slots = install_new_memslots(kvm, as_id, slots);
1625
1626        kvm_arch_commit_memory_region(kvm, mem, &old, new, change);
1627
1628        /* Free the old memslot's metadata.  Note, this is the full copy!!! */
1629        if (change == KVM_MR_DELETE)
1630                kvm_free_memslot(kvm, &old);
1631
1632        kvfree(slots);
1633        return 0;
1634
1635out_slots:
1636        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1637                slot = id_to_memslot(slots, new->id);
1638                slot->flags &= ~KVM_MEMSLOT_INVALID;
1639                slots = install_new_memslots(kvm, as_id, slots);
1640        } else {
1641                mutex_unlock(&kvm->slots_arch_lock);
1642        }
1643        kvfree(slots);
1644        return r;
1645}
1646
1647static int kvm_delete_memslot(struct kvm *kvm,
1648                              const struct kvm_userspace_memory_region *mem,
1649                              struct kvm_memory_slot *old, int as_id)
1650{
1651        struct kvm_memory_slot new;
1652
1653        if (!old->npages)
1654                return -EINVAL;
1655
1656        memset(&new, 0, sizeof(new));
1657        new.id = old->id;
1658        /*
1659         * This is only for debugging purposes; it should never be referenced
1660         * for a removed memslot.
1661         */
1662        new.as_id = as_id;
1663
1664        return kvm_set_memslot(kvm, mem, &new, as_id, KVM_MR_DELETE);
1665}
1666
1667/*
1668 * Allocate some memory and give it an address in the guest physical address
1669 * space.
1670 *
1671 * Discontiguous memory is allowed, mostly for framebuffers.
1672 *
1673 * Must be called holding kvm->slots_lock for write.
1674 */
1675int __kvm_set_memory_region(struct kvm *kvm,
1676                            const struct kvm_userspace_memory_region *mem)
1677{
1678        struct kvm_memory_slot old, new;
1679        struct kvm_memory_slot *tmp;
1680        enum kvm_mr_change change;
1681        int as_id, id;
1682        int r;
1683
1684        r = check_memory_region_flags(mem);
1685        if (r)
1686                return r;
1687
1688        as_id = mem->slot >> 16;
1689        id = (u16)mem->slot;
1690
1691        /* General sanity checks */
1692        if ((mem->memory_size & (PAGE_SIZE - 1)) ||
1693            (mem->memory_size != (unsigned long)mem->memory_size))
1694                return -EINVAL;
1695        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1696                return -EINVAL;
1697        /* We can read the guest memory with __xxx_user() later on. */
1698        if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1699            (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1700             !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1701                        mem->memory_size))
1702                return -EINVAL;
1703        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1704                return -EINVAL;
1705        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1706                return -EINVAL;
1707
1708        /*
1709         * Make a full copy of the old memslot, the pointer will become stale
1710         * when the memslots are re-sorted by update_memslots(), and the old
1711         * memslot needs to be referenced after calling update_memslots(), e.g.
1712         * to free its resources and for arch specific behavior.
1713         */
1714        tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1715        if (tmp) {
1716                old = *tmp;
1717                tmp = NULL;
1718        } else {
1719                memset(&old, 0, sizeof(old));
1720                old.id = id;
1721        }
1722
1723        if (!mem->memory_size)
1724                return kvm_delete_memslot(kvm, mem, &old, as_id);
1725
1726        new.as_id = as_id;
1727        new.id = id;
1728        new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1729        new.npages = mem->memory_size >> PAGE_SHIFT;
1730        new.flags = mem->flags;
1731        new.userspace_addr = mem->userspace_addr;
1732
1733        if (new.npages > KVM_MEM_MAX_NR_PAGES)
1734                return -EINVAL;
1735
1736        if (!old.npages) {
1737                change = KVM_MR_CREATE;
1738                new.dirty_bitmap = NULL;
1739        } else { /* Modify an existing slot. */
1740                if ((new.userspace_addr != old.userspace_addr) ||
1741                    (new.npages != old.npages) ||
1742                    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1743                        return -EINVAL;
1744
1745                if (new.base_gfn != old.base_gfn)
1746                        change = KVM_MR_MOVE;
1747                else if (new.flags != old.flags)
1748                        change = KVM_MR_FLAGS_ONLY;
1749                else /* Nothing to change. */
1750                        return 0;
1751
1752                /* Copy dirty_bitmap from the current memslot. */
1753                new.dirty_bitmap = old.dirty_bitmap;
1754        }
1755
1756        if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1757                /* Check for overlaps */
1758                kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1759                        if (tmp->id == id)
1760                                continue;
1761                        if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1762                              (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1763                                return -EEXIST;
1764                }
1765        }
1766
1767        /* Allocate/free page dirty bitmap as needed */
1768        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1769                new.dirty_bitmap = NULL;
1770        else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
1771                r = kvm_alloc_dirty_bitmap(&new);
1772                if (r)
1773                        return r;
1774
1775                if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1776                        bitmap_set(new.dirty_bitmap, 0, new.npages);
1777        }
1778
1779        r = kvm_set_memslot(kvm, mem, &new, as_id, change);
1780        if (r)
1781                goto out_bitmap;
1782
1783        if (old.dirty_bitmap && !new.dirty_bitmap)
1784                kvm_destroy_dirty_bitmap(&old);
1785        return 0;
1786
1787out_bitmap:
1788        if (new.dirty_bitmap && !old.dirty_bitmap)
1789                kvm_destroy_dirty_bitmap(&new);
1790        return r;
1791}
1792EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1793
1794int kvm_set_memory_region(struct kvm *kvm,
1795                          const struct kvm_userspace_memory_region *mem)
1796{
1797        int r;
1798
1799        mutex_lock(&kvm->slots_lock);
1800        r = __kvm_set_memory_region(kvm, mem);
1801        mutex_unlock(&kvm->slots_lock);
1802        return r;
1803}
1804EXPORT_SYMBOL_GPL(kvm_set_memory_region);
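
/*
 * Illustrative sketch only -- not part of upstream kvm_main.c.  It shows how
 * an in-kernel caller could register a memslot through the API above.  The
 * helper name, its parameters and the __maybe_unused annotation are
 * assumptions made for this example; only the kvm_* call and the struct
 * fields are real.
 */
static int __maybe_unused example_register_memslot(struct kvm *kvm, u16 as_id,
                                                   u16 id, gpa_t gpa,
                                                   unsigned long hva,
                                                   unsigned long size)
{
        struct kvm_userspace_memory_region region = {
                .slot = ((u32)as_id << 16) | id, /* address space in the upper 16 bits */
                .flags = 0,                      /* e.g. KVM_MEM_LOG_DIRTY_PAGES */
                .guest_phys_addr = gpa,          /* must be page aligned */
                .memory_size = size,             /* passing 0 deletes the slot */
                .userspace_addr = hva,           /* page-aligned host VA owned by userspace */
        };

        /* Takes kvm->slots_lock and calls __kvm_set_memory_region() internally. */
        return kvm_set_memory_region(kvm, &region);
}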
1805
1806static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1807                                          struct kvm_userspace_memory_region *mem)
1808{
1809        if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1810                return -EINVAL;
1811
1812        return kvm_set_memory_region(kvm, mem);
1813}
1814
1815#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1816/**
1817 * kvm_get_dirty_log - get a snapshot of dirty pages
1818 * @kvm:        pointer to kvm instance
1819 * @log:        slot id and address to which we copy the log
1820 * @is_dirty:   set to '1' if any dirty pages were found
1821 * @memslot:    set to the associated memslot, always valid on success
1822 */
1823int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1824                      int *is_dirty, struct kvm_memory_slot **memslot)
1825{
1826        struct kvm_memslots *slots;
1827        int i, as_id, id;
1828        unsigned long n;
1829        unsigned long any = 0;
1830
1831        /* Dirty ring tracking is exclusive to dirty log tracking */
1832        if (kvm->dirty_ring_size)
1833                return -ENXIO;
1834
1835        *memslot = NULL;
1836        *is_dirty = 0;
1837
1838        as_id = log->slot >> 16;
1839        id = (u16)log->slot;
1840        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1841                return -EINVAL;
1842
1843        slots = __kvm_memslots(kvm, as_id);
1844        *memslot = id_to_memslot(slots, id);
1845        if (!(*memslot) || !(*memslot)->dirty_bitmap)
1846                return -ENOENT;
1847
1848        kvm_arch_sync_dirty_log(kvm, *memslot);
1849
1850        n = kvm_dirty_bitmap_bytes(*memslot);
1851
1852        for (i = 0; !any && i < n/sizeof(long); ++i)
1853                any = (*memslot)->dirty_bitmap[i];
1854
1855        if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1856                return -EFAULT;
1857
1858        if (any)
1859                *is_dirty = 1;
1860        return 0;
1861}
1862EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
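
/*
 * Illustrative sketch only -- not part of upstream kvm_main.c.  Architectures
 * that do not select CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT typically build
 * their KVM_GET_DIRTY_LOG handler on top of kvm_get_dirty_log() roughly like
 * this.  The helper name and the use of kvm_flush_remote_tlbs() as the flush
 * primitive are assumptions for this example; real architectures use their
 * own flush and write-protection hooks.
 */
static int __maybe_unused example_vm_ioctl_get_dirty_log(struct kvm *kvm,
                                                         struct kvm_dirty_log *log)
{
        struct kvm_memory_slot *memslot;
        int is_dirty = 0;
        int r;

        mutex_lock(&kvm->slots_lock);

        /* Copies the bitmap to userspace and reports whether any bit was set. */
        r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot);
        if (!r && is_dirty)
                kvm_flush_remote_tlbs(kvm);

        mutex_unlock(&kvm->slots_lock);
        return r;
}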
1863
1864#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
1865/**
1866 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1867 *      and reenable dirty page tracking for the corresponding pages.
1868 * @kvm:        pointer to kvm instance
1869 * @log:        slot id and address to which we copy the log
1870 *
1871 * We need to keep in mind that VCPU threads can write to the bitmap
1872 * concurrently. So, to avoid losing track of dirty pages, we keep the
1873 * following order:
1874 *
1875 *    1. Take a snapshot of the bit and clear it if needed.
1876 *    2. Write protect the corresponding page.
1877 *    3. Copy the snapshot to the userspace.
1878 *    4. Upon return caller flushes TLB's if needed.
1879 *
1880 * Between 2 and 4, the guest may write to the page using the remaining TLB
1881 * entry.  This is not a problem because the page is reported dirty using
1882 * the snapshot taken before and step 4 ensures that writes done after
1883 * exiting to userspace will be logged for the next call.
1884 *
1885 */
1886static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1887{
1888        struct kvm_memslots *slots;
1889        struct kvm_memory_slot *memslot;
1890        int i, as_id, id;
1891        unsigned long n;
1892        unsigned long *dirty_bitmap;
1893        unsigned long *dirty_bitmap_buffer;
1894        bool flush;
1895
1896        /* Dirty ring tracking is exclusive to dirty log tracking */
1897        if (kvm->dirty_ring_size)
1898                return -ENXIO;
1899
1900        as_id = log->slot >> 16;
1901        id = (u16)log->slot;
1902        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1903                return -EINVAL;
1904
1905        slots = __kvm_memslots(kvm, as_id);
1906        memslot = id_to_memslot(slots, id);
1907        if (!memslot || !memslot->dirty_bitmap)
1908                return -ENOENT;
1909
1910        dirty_bitmap = memslot->dirty_bitmap;
1911
1912        kvm_arch_sync_dirty_log(kvm, memslot);
1913
1914        n = kvm_dirty_bitmap_bytes(memslot);
1915        flush = false;
1916        if (kvm->manual_dirty_log_protect) {
1917                /*
1918                 * Unlike kvm_get_dirty_log, we always return false in *flush,
1919                 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
1920                 * is some code duplication between this function and
1921                 * kvm_get_dirty_log, but hopefully all architectures will
1922                 * transition to kvm_get_dirty_log_protect so that
1923                 * kvm_get_dirty_log can be eliminated.
1924                 */
1925                dirty_bitmap_buffer = dirty_bitmap;
1926        } else {
1927                dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1928                memset(dirty_bitmap_buffer, 0, n);
1929
1930                KVM_MMU_LOCK(kvm);
1931                for (i = 0; i < n / sizeof(long); i++) {
1932                        unsigned long mask;
1933                        gfn_t offset;
1934
1935                        if (!dirty_bitmap[i])
1936                                continue;
1937
1938                        flush = true;
1939                        mask = xchg(&dirty_bitmap[i], 0);
1940                        dirty_bitmap_buffer[i] = mask;
1941
1942                        offset = i * BITS_PER_LONG;
1943                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1944                                                                offset, mask);
1945                }
1946                KVM_MMU_UNLOCK(kvm);
1947        }
1948
1949        if (flush)
1950                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1951
1952        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1953                return -EFAULT;
1954        return 0;
1955}
1956
1957
1958/**
1959 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
1960 * @kvm: kvm instance
1961 * @log: slot id and address to which we copy the log
1962 *
1963 * Steps 1-4 below provide a general overview of dirty page logging. See
1964 * kvm_get_dirty_log_protect() function description for additional details.
1965 *
1966 * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
1967 * always flush the TLB (step 4) even if a previous step failed and the dirty
1968 * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
1969 * API does not preclude a subsequent dirty log read by userspace. Flushing the
1970 * TLB ensures that writes will be marked dirty for the next log read.
1971 *
1972 *   1. Take a snapshot of the bit and clear it if needed.
1973 *   2. Write protect the corresponding page.
1974 *   3. Copy the snapshot to the userspace.
1975 *   4. Flush TLB's if needed.
1976 */
1977static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1978                                      struct kvm_dirty_log *log)
1979{
1980        int r;
1981
1982        mutex_lock(&kvm->slots_lock);
1983
1984        r = kvm_get_dirty_log_protect(kvm, log);
1985
1986        mutex_unlock(&kvm->slots_lock);
1987        return r;
1988}
1989
1990/**
1991 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
1992 *      and reenable dirty page tracking for the corresponding pages.
1993 * @kvm:        pointer to kvm instance
1994 * @log:        slot id and address from which to fetch the bitmap of dirty pages
1995 */
1996static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1997                                       struct kvm_clear_dirty_log *log)
1998{
1999        struct kvm_memslots *slots;
2000        struct kvm_memory_slot *memslot;
2001        int as_id, id;
2002        gfn_t offset;
2003        unsigned long i, n;
2004        unsigned long *dirty_bitmap;
2005        unsigned long *dirty_bitmap_buffer;
2006        bool flush;
2007
2008        /* Dirty ring tracking is exclusive to dirty log tracking */
2009        if (kvm->dirty_ring_size)
2010                return -ENXIO;
2011
2012        as_id = log->slot >> 16;
2013        id = (u16)log->slot;
2014        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
2015                return -EINVAL;
2016
2017        if (log->first_page & 63)
2018                return -EINVAL;
2019
2020        slots = __kvm_memslots(kvm, as_id);
2021        memslot = id_to_memslot(slots, id);
2022        if (!memslot || !memslot->dirty_bitmap)
2023                return -ENOENT;
2024
2025        dirty_bitmap = memslot->dirty_bitmap;
2026
2027        n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2028
2029        if (log->first_page > memslot->npages ||
2030            log->num_pages > memslot->npages - log->first_page ||
2031            (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2032                return -EINVAL;
2033
2034        kvm_arch_sync_dirty_log(kvm, memslot);
2035
2036        flush = false;
2037        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2038        if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2039                return -EFAULT;
2040
2041        KVM_MMU_LOCK(kvm);
2042        for (offset = log->first_page, i = offset / BITS_PER_LONG,
2043                 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2044             i++, offset += BITS_PER_LONG) {
2045                unsigned long mask = *dirty_bitmap_buffer++;
2046                atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2047                if (!mask)
2048                        continue;
2049
2050                mask &= atomic_long_fetch_andnot(mask, p);
2051
2052                /*
2053                 * mask contains the bits that really have been cleared.  This
2054                 * never includes any bits beyond the length of the memslot (if
2055                 * the length is not aligned to 64 pages), therefore it is not
2056                 * a problem if userspace sets them in log->dirty_bitmap.
2057                 */
2058                if (mask) {
2059                        flush = true;
2060                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2061                                                                offset, mask);
2062                }
2063        }
2064        KVM_MMU_UNLOCK(kvm);
2065
2066        if (flush)
2067                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2068
2069        return 0;
2070}
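
/*
 * Worked example (illustrative only) of the clearing above: suppose
 * dirty_bitmap[i] == 0b1110 and userspace asks to clear mask == 0b0111.
 * atomic_long_fetch_andnot() stores 0b1110 & ~0b0111 == 0b1000 and returns
 * the old value 0b1110; "mask &= old" then yields 0b0110, i.e. only the bits
 * that were both requested and actually dirty.  Those pages get
 * write-protected again; bit 0 was requested but clean, so it is ignored.
 */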
2071
2072static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2073                                        struct kvm_clear_dirty_log *log)
2074{
2075        int r;
2076
2077        mutex_lock(&kvm->slots_lock);
2078
2079        r = kvm_clear_dirty_log_protect(kvm, log);
2080
2081        mutex_unlock(&kvm->slots_lock);
2082        return r;
2083}
2084#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2085
2086struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2087{
2088        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2089}
2090EXPORT_SYMBOL_GPL(gfn_to_memslot);
2091
2092struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2093{
2094        struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2095        struct kvm_memory_slot *slot;
2096        int slot_index;
2097
2098        slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
2099        if (slot)
2100                return slot;
2101
2102        /*
2103         * Fall back to searching all memslots. We purposely use
2104         * search_memslots() instead of __gfn_to_memslot() to avoid
2105         * thrashing the VM-wide last_used_index in kvm_memslots.
2106         */
2107        slot = search_memslots(slots, gfn, &slot_index);
2108        if (slot) {
2109                vcpu->last_used_slot = slot_index;
2110                return slot;
2111        }
2112
2113        return NULL;
2114}
2115EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
2116
2117bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2118{
2119        struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2120
2121        return kvm_is_visible_memslot(memslot);
2122}
2123EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2124
2125bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2126{
2127        struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2128
2129        return kvm_is_visible_memslot(memslot);
2130}
2131EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2132
2133unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2134{
2135        struct vm_area_struct *vma;
2136        unsigned long addr, size;
2137
2138        size = PAGE_SIZE;
2139
2140        addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2141        if (kvm_is_error_hva(addr))
2142                return PAGE_SIZE;
2143
2144        mmap_read_lock(current->mm);
2145        vma = find_vma(current->mm, addr);
2146        if (!vma)
2147                goto out;
2148
2149        size = vma_kernel_pagesize(vma);
2150
2151out:
2152        mmap_read_unlock(current->mm);
2153
2154        return size;
2155}
2156
2157static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2158{
2159        return slot->flags & KVM_MEM_READONLY;
2160}
2161
2162static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2163                                       gfn_t *nr_pages, bool write)
2164{
2165        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2166                return KVM_HVA_ERR_BAD;
2167
2168        if (memslot_is_readonly(slot) && write)
2169                return KVM_HVA_ERR_RO_BAD;
2170
2171        if (nr_pages)
2172                *nr_pages = slot->npages - (gfn - slot->base_gfn);
2173
2174        return __gfn_to_hva_memslot(slot, gfn);
2175}
2176
2177static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2178                                     gfn_t *nr_pages)
2179{
2180        return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2181}
2182
2183unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2184                                        gfn_t gfn)
2185{
2186        return gfn_to_hva_many(slot, gfn, NULL);
2187}
2188EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2189
2190unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2191{
2192        return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2193}
2194EXPORT_SYMBOL_GPL(gfn_to_hva);
2195
2196unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2197{
2198        return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2199}
2200EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2201
2202/*
2203 * Return the hva of a @gfn and the R/W attribute if possible.
2204 *
2205 * @slot: the kvm_memory_slot which contains @gfn
2206 * @gfn: the gfn to be translated
2207 * @writable: used to return the read/write attribute of the @slot if the hva
2208 * is valid and @writable is not NULL
2209 */
2210unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2211                                      gfn_t gfn, bool *writable)
2212{
2213        unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2214
2215        if (!kvm_is_error_hva(hva) && writable)
2216                *writable = !memslot_is_readonly(slot);
2217
2218        return hva;
2219}
2220
2221unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2222{
2223        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2224
2225        return gfn_to_hva_memslot_prot(slot, gfn, writable);
2226}
2227
2228unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2229{
2230        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2231
2232        return gfn_to_hva_memslot_prot(slot, gfn, writable);
2233}
2234
2235static inline int check_user_page_hwpoison(unsigned long addr)
2236{
2237        int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2238
2239        rc = get_user_pages(addr, 1, flags, NULL, NULL);
2240        return rc == -EHWPOISON;
2241}
2242
2243/*
2244 * The fast path to get the writable pfn which will be stored in @pfn;
2245 * true indicates success, otherwise false is returned.  It's also the
2246 * only part that runs when we must stay in atomic context.
2247 */
2248static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2249                            bool *writable, kvm_pfn_t *pfn)
2250{
2251        struct page *page[1];
2252
2253        /*
2254         * Fast pin a writable pfn only if it is a write fault request
2255         * or the caller allows mapping a writable pfn for a read fault
2256         * request.
2257         */
2258        if (!(write_fault || writable))
2259                return false;
2260
2261        if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2262                *pfn = page_to_pfn(page[0]);
2263
2264                if (writable)
2265                        *writable = true;
2266                return true;
2267        }
2268
2269        return false;
2270}
2271
2272/*
2273 * The slow path to get the pfn of the specified host virtual address;
2274 * 1 indicates success, -errno is returned if an error is detected.
2275 */
2276static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2277                           bool *writable, kvm_pfn_t *pfn)
2278{
2279        unsigned int flags = FOLL_HWPOISON;
2280        struct page *page;
2281        int npages = 0;
2282
2283        might_sleep();
2284
2285        if (writable)
2286                *writable = write_fault;
2287
2288        if (write_fault)
2289                flags |= FOLL_WRITE;
2290        if (async)
2291                flags |= FOLL_NOWAIT;
2292
2293        npages = get_user_pages_unlocked(addr, 1, &page, flags);
2294        if (npages != 1)
2295                return npages;
2296
2297        /* map read fault as writable if possible */
2298        if (unlikely(!write_fault) && writable) {
2299                struct page *wpage;
2300
2301                if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2302                        *writable = true;
2303                        put_page(page);
2304                        page = wpage;
2305                }
2306        }
2307        *pfn = page_to_pfn(page);
2308        return npages;
2309}
2310
2311static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2312{
2313        if (unlikely(!(vma->vm_flags & VM_READ)))
2314                return false;
2315
2316        if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2317                return false;
2318
2319        return true;
2320}
2321
2322static int kvm_try_get_pfn(kvm_pfn_t pfn)
2323{
2324        if (kvm_is_reserved_pfn(pfn))
2325                return 1;
2326        return get_page_unless_zero(pfn_to_page(pfn));
2327}
2328
2329static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2330                               unsigned long addr, bool *async,
2331                               bool write_fault, bool *writable,
2332                               kvm_pfn_t *p_pfn)
2333{
2334        kvm_pfn_t pfn;
2335        pte_t *ptep;
2336        spinlock_t *ptl;
2337        int r;
2338
2339        r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2340        if (r) {
2341                /*
2342                 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2343                 * not call the fault handler, so do it here.
2344                 */
2345                bool unlocked = false;
2346                r = fixup_user_fault(current->mm, addr,
2347                                     (write_fault ? FAULT_FLAG_WRITE : 0),
2348                                     &unlocked);
2349                if (unlocked)
2350                        return -EAGAIN;
2351                if (r)
2352                        return r;
2353
2354                r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2355                if (r)
2356                        return r;
2357        }
2358
2359        if (write_fault && !pte_write(*ptep)) {
2360                pfn = KVM_PFN_ERR_RO_FAULT;
2361                goto out;
2362        }
2363
2364        if (writable)
2365                *writable = pte_write(*ptep);
2366        pfn = pte_pfn(*ptep);
2367
2368        /*
2369         * Get a reference here because callers of *hva_to_pfn* and
2370         * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2371         * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
2372         * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2373         * simply do nothing for reserved pfns.
2374         *
2375         * Whoever called remap_pfn_range is also going to call e.g.
2376         * unmap_mapping_range before the underlying pages are freed,
2377         * causing a call to our MMU notifier.
2378         *
2379         * Certain IO or PFNMAP mappings can be backed with valid
2380         * struct pages, but be allocated without refcounting e.g.,
2381         * tail pages of non-compound higher order allocations, which
2382         * would then underflow the refcount when the caller does the
2383         * required put_page. Don't allow those pages here.
2384         */ 
2385        if (!kvm_try_get_pfn(pfn))
2386                r = -EFAULT;
2387
2388out:
2389        pte_unmap_unlock(ptep, ptl);
2390        *p_pfn = pfn;
2391
2392        return r;
2393}
2394
2395/*
2396 * Pin guest page in memory and return its pfn.
2397 * @addr: host virtual address which maps memory to the guest
2398 * @atomic: if true, the call is made from atomic context and must not sleep
2399 * @async: whether this function needs to wait for IO to complete if the
2400 *         host page is not in memory
2401 * @write_fault: whether we should get a writable host page
2402 * @writable: whether mapping a writable host page is allowed for !@write_fault
2403 *
2404 * The function will map a writable host page for these two cases:
2405 * 1): @write_fault = true
2406 * 2): @write_fault = false && @writable, @writable will tell the caller
2407 *     whether the mapping is writable.
2408 */
2409static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2410                        bool write_fault, bool *writable)
2411{
2412        struct vm_area_struct *vma;
2413        kvm_pfn_t pfn = 0;
2414        int npages, r;
2415
2416        /* we can do it either atomically or asynchronously, not both */
2417        BUG_ON(atomic && async);
2418
2419        if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2420                return pfn;
2421
2422        if (atomic)
2423                return KVM_PFN_ERR_FAULT;
2424
2425        npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2426        if (npages == 1)
2427                return pfn;
2428
2429        mmap_read_lock(current->mm);
2430        if (npages == -EHWPOISON ||
2431              (!async && check_user_page_hwpoison(addr))) {
2432                pfn = KVM_PFN_ERR_HWPOISON;
2433                goto exit;
2434        }
2435
2436retry:
2437        vma = vma_lookup(current->mm, addr);
2438
2439        if (vma == NULL)
2440                pfn = KVM_PFN_ERR_FAULT;
2441        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2442                r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2443                if (r == -EAGAIN)
2444                        goto retry;
2445                if (r < 0)
2446                        pfn = KVM_PFN_ERR_FAULT;
2447        } else {
2448                if (async && vma_is_valid(vma, write_fault))
2449                        *async = true;
2450                pfn = KVM_PFN_ERR_FAULT;
2451        }
2452exit:
2453        mmap_read_unlock(current->mm);
2454        return pfn;
2455}
2456
2457kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2458                               bool atomic, bool *async, bool write_fault,
2459                               bool *writable, hva_t *hva)
2460{
2461        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2462
2463        if (hva)
2464                *hva = addr;
2465
2466        if (addr == KVM_HVA_ERR_RO_BAD) {
2467                if (writable)
2468                        *writable = false;
2469                return KVM_PFN_ERR_RO_FAULT;
2470        }
2471
2472        if (kvm_is_error_hva(addr)) {
2473                if (writable)
2474                        *writable = false;
2475                return KVM_PFN_NOSLOT;
2476        }
2477
2478        /* Do not map a writable pfn in a read-only memslot. */
2479        if (writable && memslot_is_readonly(slot)) {
2480                *writable = false;
2481                writable = NULL;
2482        }
2483
2484        return hva_to_pfn(addr, atomic, async, write_fault,
2485                          writable);
2486}
2487EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
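
/*
 * Illustrative sketch only -- not part of upstream kvm_main.c.  A typical
 * fault-path style lookup: resolve a gfn to a host pfn, use it, then drop the
 * reference taken by hva_to_pfn().  The helper name and its return convention
 * are assumptions for this example.
 */
static int __maybe_unused example_touch_gfn(struct kvm_memory_slot *slot,
                                            gfn_t gfn, bool write)
{
        bool writable = false;
        kvm_pfn_t pfn;

        /* atomic=false (may sleep), async=NULL (no async page faults). */
        pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, &writable, NULL);
        if (is_error_noslot_pfn(pfn))
                return -EFAULT;

        /*
         * A real caller would install the translation here and mark the pfn
         * dirty only if the guest actually wrote to the page.
         */
        kvm_set_pfn_accessed(pfn);
        kvm_release_pfn_clean(pfn);

        /* Report whether the mapping ended up writable. */
        return writable ? 1 : 0;
}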
2488
2489kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2490                      bool *writable)
2491{
2492        return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2493                                    write_fault, writable, NULL);
2494}
2495EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2496
2497kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2498{
2499        return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2500}
2501EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2502
2503kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2504{
2505        return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2506}
2507EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2508
2509kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2510{
2511        return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2512}
2513EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2514
2515kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2516{
2517        return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2518}
2519EXPORT_SYMBOL_GPL(gfn_to_pfn);
2520
2521kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2522{
2523        return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2524}
2525EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2526
2527int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2528                            struct page **pages, int nr_pages)
2529{
2530        unsigned long addr;
2531        gfn_t entry = 0;
2532
2533        addr = gfn_to_hva_many(slot, gfn, &entry);
2534        if (kvm_is_error_hva(addr))
2535                return -1;
2536
2537        if (entry < nr_pages)
2538                return 0;
2539
2540        return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2541}
2542EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2543
2544static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2545{
2546        if (is_error_noslot_pfn(pfn))
2547                return KVM_ERR_PTR_BAD_PAGE;
2548
2549        if (kvm_is_reserved_pfn(pfn)) {
2550                WARN_ON(1);
2551                return KVM_ERR_PTR_BAD_PAGE;
2552        }
2553
2554        return pfn_to_page(pfn);
2555}
2556
2557struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2558{
2559        kvm_pfn_t pfn;
2560
2561        pfn = gfn_to_pfn(kvm, gfn);
2562
2563        return kvm_pfn_to_page(pfn);
2564}
2565EXPORT_SYMBOL_GPL(gfn_to_page);
2566
2567void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2568{
2569        if (pfn == 0)
2570                return;
2571
2572        if (dirty)
2573                kvm_release_pfn_dirty(pfn);
2574        else
2575                kvm_release_pfn_clean(pfn);
2576}
2577
2578int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2579{
2580        kvm_pfn_t pfn;
2581        void *hva = NULL;
2582        struct page *page = KVM_UNMAPPED_PAGE;
2583
2584        if (!map)
2585                return -EINVAL;
2586
2587        pfn = gfn_to_pfn(vcpu->kvm, gfn);
2588        if (is_error_noslot_pfn(pfn))
2589                return -EINVAL;
2590
2591        if (pfn_valid(pfn)) {
2592                page = pfn_to_page(pfn);
2593                hva = kmap(page);
2594#ifdef CONFIG_HAS_IOMEM
2595        } else {
2596                hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2597#endif
2598        }
2599
2600        if (!hva)
2601                return -EFAULT;
2602
2603        map->page = page;
2604        map->hva = hva;
2605        map->pfn = pfn;
2606        map->gfn = gfn;
2607
2608        return 0;
2609}
2610EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2611
2612void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2613{
2614        if (!map)
2615                return;
2616
2617        if (!map->hva)
2618                return;
2619
2620        if (map->page != KVM_UNMAPPED_PAGE)
2621                kunmap(map->page);
2622#ifdef CONFIG_HAS_IOMEM
2623        else
2624                memunmap(map->hva);
2625#endif
2626
2627        if (dirty)
2628                kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
2629
2630        kvm_release_pfn(map->pfn, dirty);
2631
2632        map->hva = NULL;
2633        map->page = NULL;
2634}
2635EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
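
/*
 * Illustrative sketch only -- not part of upstream kvm_main.c.  The usual
 * map/modify/unmap pattern for touching guest memory through a kernel
 * mapping.  The helper name and the single-byte write are assumptions for
 * this example.
 */
static int __maybe_unused example_poke_guest_byte(struct kvm_vcpu *vcpu,
                                                  gfn_t gfn,
                                                  unsigned int offset, u8 val)
{
        struct kvm_host_map map;
        int r;

        if (offset >= PAGE_SIZE)
                return -EINVAL;

        r = kvm_vcpu_map(vcpu, gfn, &map);
        if (r)
                return r;

        *((u8 *)map.hva + offset) = val;

        /* dirty=true marks the gfn dirty and releases the pfn accordingly. */
        kvm_vcpu_unmap(vcpu, &map, true);
        return 0;
}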
2636
2637struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2638{
2639        kvm_pfn_t pfn;
2640
2641        pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2642
2643        return kvm_pfn_to_page(pfn);
2644}
2645EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2646
2647void kvm_release_page_clean(struct page *page)
2648{
2649        WARN_ON(is_error_page(page));
2650
2651        kvm_release_pfn_clean(page_to_pfn(page));
2652}
2653EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2654
2655void kvm_release_pfn_clean(kvm_pfn_t pfn)
2656{
2657        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2658                put_page(pfn_to_page(pfn));
2659}
2660EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2661
2662void kvm_release_page_dirty(struct page *page)
2663{
2664        WARN_ON(is_error_page(page));
2665
2666        kvm_release_pfn_dirty(page_to_pfn(page));
2667}
2668EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2669
2670void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2671{
2672        kvm_set_pfn_dirty(pfn);
2673        kvm_release_pfn_clean(pfn);
2674}
2675EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2676
2677void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2678{
2679        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2680                SetPageDirty(pfn_to_page(pfn));
2681}
2682EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2683
2684void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2685{
2686        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2687                mark_page_accessed(pfn_to_page(pfn));
2688}
2689EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2690
2691static int next_segment(unsigned long len, int offset)
2692{
2693        if (len > PAGE_SIZE - offset)
2694                return PAGE_SIZE - offset;
2695        else
2696                return len;
2697}
2698
2699static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2700                                 void *data, int offset, int len)
2701{
2702        int r;
2703        unsigned long addr;
2704
2705        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2706        if (kvm_is_error_hva(addr))
2707                return -EFAULT;
2708        r = __copy_from_user(data, (void __user *)addr + offset, len);
2709        if (r)
2710                return -EFAULT;
2711        return 0;
2712}
2713
2714int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2715                        int len)
2716{
2717        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2718
2719        return __kvm_read_guest_page(slot, gfn, data, offset, len);
2720}
2721EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2722
2723int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2724                             int offset, int len)
2725{
2726        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2727
2728        return __kvm_read_guest_page(slot, gfn, data, offset, len);
2729}
2730EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2731
2732int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2733{
2734        gfn_t gfn = gpa >> PAGE_SHIFT;
2735        int seg;
2736        int offset = offset_in_page(gpa);
2737        int ret;
2738
2739        while ((seg = next_segment(len, offset)) != 0) {
2740                ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2741                if (ret < 0)
2742                        return ret;
2743                offset = 0;
2744                len -= seg;
2745                data += seg;
2746                ++gfn;
2747        }
2748        return 0;
2749}
2750EXPORT_SYMBOL_GPL(kvm_read_guest);
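
/*
 * Illustrative sketch only -- not part of upstream kvm_main.c.  Reading a
 * guest-physical value that may straddle a page boundary; kvm_read_guest()
 * splits the access per page via next_segment() internally.  The helper name
 * is an assumption for this example.
 */
static int __maybe_unused example_read_guest_u64(struct kvm *kvm, gpa_t gpa,
                                                 u64 *val)
{
        /* Returns 0 on success, -EFAULT if any page in the range is unmapped. */
        return kvm_read_guest(kvm, gpa, val, sizeof(*val));
}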
2751
2752int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2753{
2754        gfn_t gfn = gpa >> PAGE_SHIFT;
2755        int seg;
2756        int offset = offset_in_page(gpa);
2757        int ret;
2758
2759        while ((seg = next_segment(len, offset)) != 0) {
2760                ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2761                if (ret < 0)
2762                        return ret;
2763                offset = 0;
2764                len -= seg;
2765                data += seg;
2766                ++gfn;
2767        }
2768        return 0;
2769}
2770EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2771
2772static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2773                                   void *data, int offset, unsigned long len)
2774{
2775        int r;
2776        unsigned long addr;
2777
2778        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2779        if (kvm_is_error_hva(addr))
2780                return -EFAULT;
2781        pagefault_disable();
2782        r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2783        pagefault_enable();
2784        if (r)
2785                return -EFAULT;
2786        return 0;
2787}
2788
2789int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2790                               void *data, unsigned long len)
2791{
2792        gfn_t gfn = gpa >> PAGE_SHIFT;
2793        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2794        int offset = offset_in_page(gpa);
2795
2796        return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2797}
2798EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2799
2800static int __kvm_write_guest_page(struct kvm *kvm,
2801                                  struct kvm_memory_slot *memslot, gfn_t gfn,
2802                                  const void *data, int offset, int len)
2803{
2804        int r;
2805        unsigned long addr;
2806
2807        addr = gfn_to_hva_memslot(memslot, gfn);
2808        if (kvm_is_error_hva(addr))
2809                return -EFAULT;
2810        r = __copy_to_user((void __user *)addr + offset, data, len);
2811        if (r)
2812                return -EFAULT;
2813        mark_page_dirty_in_slot(kvm, memslot, gfn);
2814        return 0;
2815}
2816
2817int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2818                         const void *data, int offset, int len)
2819{
2820        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2821
2822        return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2823}
2824EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2825
2826int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2827                              const void *data, int offset, int len)
2828{
2829        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2830
2831        return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2832}
2833EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2834
2835int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2836                    unsigned long len)
2837{
2838        gfn_t gfn = gpa >> PAGE_SHIFT;
2839        int seg;
2840        int offset = offset_in_page(gpa);
2841        int ret;
2842
2843        while ((seg = next_segment(len, offset)) != 0) {
2844                ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2845                if (ret < 0)
2846                        return ret;
2847                offset = 0;
2848                len -= seg;
2849                data += seg;
2850                ++gfn;
2851        }
2852        return 0;
2853}
2854EXPORT_SYMBOL_GPL(kvm_write_guest);
2855
2856int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2857                         unsigned long len)
2858{
2859        gfn_t gfn = gpa >> PAGE_SHIFT;
2860        int seg;
2861        int offset = offset_in_page(gpa);
2862        int ret;
2863
2864        while ((seg = next_segment(len, offset)) != 0) {
2865                ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2866                if (ret < 0)
2867                        return ret;
2868                offset = 0;
2869                len -= seg;
2870                data += seg;
2871                ++gfn;
2872        }
2873        return 0;
2874}
2875EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2876
2877static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2878                                       struct gfn_to_hva_cache *ghc,
2879                                       gpa_t gpa, unsigned long len)
2880{
2881        int offset = offset_in_page(gpa);
2882        gfn_t start_gfn = gpa >> PAGE_SHIFT;
2883        gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2884        gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2885        gfn_t nr_pages_avail;
2886
2887        /* Update ghc->generation before performing any error checks. */
2888        ghc->generation = slots->generation;
2889
2890        if (start_gfn > end_gfn) {
2891                ghc->hva = KVM_HVA_ERR_BAD;
2892                return -EINVAL;
2893        }
2894
2895        /*
2896         * If the requested region crosses two memslots, we still
2897         * verify that the entire region is valid here.
2898         */
2899        for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2900                ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2901                ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2902                                           &nr_pages_avail);
2903                if (kvm_is_error_hva(ghc->hva))
2904                        return -EFAULT;
2905        }
2906
2907        /* Use the slow path for cross page reads and writes. */
2908        if (nr_pages_needed == 1)
2909                ghc->hva += offset;
2910        else
2911                ghc->memslot = NULL;
2912
2913        ghc->gpa = gpa;
2914        ghc->len = len;
2915        return 0;
2916}
2917
2918int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2919                              gpa_t gpa, unsigned long len)
2920{
2921        struct kvm_memslots *slots = kvm_memslots(kvm);
2922        return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2923}
2924EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2925
2926int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2927                                  void *data, unsigned int offset,
2928                                  unsigned long len)
2929{
2930        struct kvm_memslots *slots = kvm_memslots(kvm);
2931        int r;
2932        gpa_t gpa = ghc->gpa + offset;
2933
2934        if (WARN_ON_ONCE(len + offset > ghc->len))
2935                return -EINVAL;
2936
2937        if (slots->generation != ghc->generation) {
2938                if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2939                        return -EFAULT;
2940        }
2941
2942        if (kvm_is_error_hva(ghc->hva))
2943                return -EFAULT;
2944
2945        if (unlikely(!ghc->memslot))
2946                return kvm_write_guest(kvm, gpa, data, len);
2947
2948        r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2949        if (r)
2950                return -EFAULT;
2951        mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
2952
2953        return 0;
2954}
2955EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2956
2957int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2958                           void *data, unsigned long len)
2959{
2960        return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2961}
2962EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
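
/*
 * Illustrative sketch only -- not part of upstream kvm_main.c.  The
 * init-then-write pattern used by callers such as steal-time accounting: the
 * cache records the memslot generation, so a later write transparently
 * re-resolves the hva if the memslots changed in between.  The structure,
 * helper name and call sites are assumptions for this example.
 */
struct example_shared_info {
        u64 sequence;
        u64 flags;
};

static int __maybe_unused example_publish_info(struct kvm *kvm,
                                               struct gfn_to_hva_cache *ghc,
                                               gpa_t gpa,
                                               struct example_shared_info *info)
{
        int r;

        /*
         * Real users do this once, when the guest configures the gpa, and
         * keep the ghc around for subsequent writes.
         */
        r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, sizeof(*info));
        if (r)
                return r;

        /* Fast path writes through the cached hva and marks the page dirty. */
        return kvm_write_guest_cached(kvm, ghc, info, sizeof(*info));
}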
2963
2964int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2965                                 void *data, unsigned int offset,
2966                                 unsigned long len)
2967{
2968        struct kvm_memslots *slots = kvm_memslots(kvm);
2969        int r;
2970        gpa_t gpa = ghc->gpa + offset;
2971
2972        if (WARN_ON_ONCE(len + offset > ghc->len))
2973                return -EINVAL;
2974
2975        if (slots->generation != ghc->generation) {
2976                if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2977                        return -EFAULT;
2978        }
2979
2980        if (kvm_is_error_hva(ghc->hva))
2981                return -EFAULT;
2982
2983        if (unlikely(!ghc->memslot))
2984                return kvm_read_guest(kvm, gpa, data, len);
2985
2986        r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
2987        if (r)
2988                return -EFAULT;
2989
2990        return 0;
2991}
2992EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
2993
2994int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2995                          void *data, unsigned long len)
2996{
2997        return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
2998}
2999EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3000
3001int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3002{
3003        const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3004        gfn_t gfn = gpa >> PAGE_SHIFT;
3005        int seg;
3006        int offset = offset_in_page(gpa);
3007        int ret;
3008
3009        while ((seg = next_segment(len, offset)) != 0) {
3010                ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
3011                if (ret < 0)
3012                        return ret;
3013                offset = 0;
3014                len -= seg;
3015                ++gfn;
3016        }
3017        return 0;
3018}
3019EXPORT_SYMBOL_GPL(kvm_clear_guest);
3020
3021void mark_page_dirty_in_slot(struct kvm *kvm,
3022                             struct kvm_memory_slot *memslot,
3023                             gfn_t gfn)
3024{
3025        if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3026                unsigned long rel_gfn = gfn - memslot->base_gfn;
3027                u32 slot = (memslot->as_id << 16) | memslot->id;
3028
3029                if (kvm->dirty_ring_size)
3030                        kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
3031                                            slot, rel_gfn);
3032                else
3033                        set_bit_le(rel_gfn, memslot->dirty_bitmap);
3034        }
3035}
3036EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3037
3038void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3039{
3040        struct kvm_memory_slot *memslot;
3041
3042        memslot = gfn_to_memslot(kvm, gfn);
3043        mark_page_dirty_in_slot(kvm, memslot, gfn);
3044}
3045EXPORT_SYMBOL_GPL(mark_page_dirty);
3046
3047void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3048{
3049        struct kvm_memory_slot *memslot;
3050
3051        memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3052        mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3053}
3054EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3055
3056void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3057{
3058        if (!vcpu->sigset_active)
3059                return;
3060
3061        /*
3062         * This does a lockless modification of ->real_blocked, which is fine
3063         * because only current can change ->real_blocked and all readers of
3064         * ->real_blocked don't care as long as ->real_blocked is always a subset
3065         * of ->blocked.
3066         */
3067        sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3068}
3069
3070void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3071{
3072        if (!vcpu->sigset_active)
3073                return;
3074
3075        sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3076        sigemptyset(&current->real_blocked);
3077}
3078
3079static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3080{
3081        unsigned int old, val, grow, grow_start;
3082
3083        old = val = vcpu->halt_poll_ns;
3084        grow_start = READ_ONCE(halt_poll_ns_grow_start);
3085        grow = READ_ONCE(halt_poll_ns_grow);
3086        if (!grow)
3087                goto out;
3088
3089        val *= grow;
3090        if (val < grow_start)
3091                val = grow_start;
3092
3093        if (val > vcpu->kvm->max_halt_poll_ns)
3094                val = vcpu->kvm->max_halt_poll_ns;
3095
3096        vcpu->halt_poll_ns = val;
3097out:
3098        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3099}
3100
3101static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3102{
3103        unsigned int old, val, shrink, grow_start;
3104
3105        old = val = vcpu->halt_poll_ns;
3106        shrink = READ_ONCE(halt_poll_ns_shrink);
3107        grow_start = READ_ONCE(halt_poll_ns_grow_start);
3108        if (shrink == 0)
3109                val = 0;
3110        else
3111                val /= shrink;
3112
3113        if (val < grow_start)
3114                val = 0;
3115
3116        vcpu->halt_poll_ns = val;
3117        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3118}
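
/*
 * Worked example with the default module parameters (halt_poll_ns_grow == 2,
 * halt_poll_ns_grow_start == 10000, halt_poll_ns_shrink == 0): a vCPU that
 * keeps waking up shortly after halting grows its poll window
 * 0 -> 10us -> 20us -> 40us -> ... capped at kvm->max_halt_poll_ns, while a
 * single overly long block resets the window straight back to 0, because a
 * shrink divisor of 0 means "reset" rather than "divide".
 */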
3119
3120static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3121{
3122        int ret = -EINTR;
3123        int idx = srcu_read_lock(&vcpu->kvm->srcu);
3124
3125        if (kvm_arch_vcpu_runnable(vcpu)) {
3126                kvm_make_request(KVM_REQ_UNHALT, vcpu);
3127                goto out;
3128        }
3129        if (kvm_cpu_has_pending_timer(vcpu))
3130                goto out;
3131        if (signal_pending(current))
3132                goto out;
3133        if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3134                goto out;
3135
3136        ret = 0;
3137out:
3138        srcu_read_unlock(&vcpu->kvm->srcu, idx);
3139        return ret;
3140}
3141
3142static inline void
3143update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3144{
3145        if (waited)
3146                vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
3147        else
3148                vcpu->stat.generic.halt_poll_success_ns += poll_ns;
3149}
3150
3151/*
3152 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
3153 */
3154void kvm_vcpu_block(struct kvm_vcpu *vcpu)
3155{
3156        ktime_t start, cur, poll_end;
3157        bool waited = false;
3158        u64 block_ns;
3159
3160        kvm_arch_vcpu_blocking(vcpu);
3161
3162        start = cur = poll_end = ktime_get();
3163        if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
3164                ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
3165
3166                ++vcpu->stat.generic.halt_attempted_poll;
3167                do {
3168                        /*
3169                         * This sets KVM_REQ_UNHALT if an interrupt
3170                         * arrives.
3171                         */
3172                        if (kvm_vcpu_check_block(vcpu) < 0) {
3173                                ++vcpu->stat.generic.halt_successful_poll;
3174                                if (!vcpu_valid_wakeup(vcpu))
3175                                        ++vcpu->stat.generic.halt_poll_invalid;
3176
3177                                KVM_STATS_LOG_HIST_UPDATE(
3178                                      vcpu->stat.generic.halt_poll_success_hist,
3179                                      ktime_to_ns(ktime_get()) -
3180                                      ktime_to_ns(start));
3181                                goto out;
3182                        }
3183                        cpu_relax();
3184                        poll_end = cur = ktime_get();
3185                } while (kvm_vcpu_can_poll(cur, stop));
3186
3187                KVM_STATS_LOG_HIST_UPDATE(
3188                                vcpu->stat.generic.halt_poll_fail_hist,
3189                                ktime_to_ns(ktime_get()) - ktime_to_ns(start));
3190        }
3191
3192
3193        prepare_to_rcuwait(&vcpu->wait);
3194        for (;;) {
3195                set_current_state(TASK_INTERRUPTIBLE);
3196
3197                if (kvm_vcpu_check_block(vcpu) < 0)
3198                        break;
3199
3200                waited = true;
3201                schedule();
3202        }
3203        finish_rcuwait(&vcpu->wait);
3204        cur = ktime_get();
3205        if (waited) {
3206                vcpu->stat.generic.halt_wait_ns +=
3207                        ktime_to_ns(cur) - ktime_to_ns(poll_end);
3208                KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3209                                ktime_to_ns(cur) - ktime_to_ns(poll_end));
3210        }
3211out:
3212        kvm_arch_vcpu_unblocking(vcpu);
3213        block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3214
3215        update_halt_poll_stats(
3216                vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3217
3218        if (!kvm_arch_no_poll(vcpu)) {
3219                if (!vcpu_valid_wakeup(vcpu)) {
3220                        shrink_halt_poll_ns(vcpu);
3221                } else if (vcpu->kvm->max_halt_poll_ns) {
3222                        if (block_ns <= vcpu->halt_poll_ns)
3223                                ;
3224                        /* we had a long block, shrink polling */
3225                        else if (vcpu->halt_poll_ns &&
3226                                        block_ns > vcpu->kvm->max_halt_poll_ns)
3227                                shrink_halt_poll_ns(vcpu);
3228                        /* we had a short halt and our poll time is too small */
3229                        else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3230                                        block_ns < vcpu->kvm->max_halt_poll_ns)
3231                                grow_halt_poll_ns(vcpu);
3232                } else {
3233                        vcpu->halt_poll_ns = 0;
3234                }
3235        }
3236
3237        trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3238        kvm_arch_vcpu_block_finish(vcpu);
3239}
3240EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3241
3242bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3243{
3244        struct rcuwait *waitp;
3245
3246        waitp = kvm_arch_vcpu_get_wait(vcpu);
3247        if (rcuwait_wake_up(waitp)) {
3248                WRITE_ONCE(vcpu->ready, true);
3249                ++vcpu->stat.generic.halt_wakeup;
3250                return true;
3251        }
3252
3253        return false;
3254}
3255EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3256
3257#ifndef CONFIG_S390
3258/*
3259 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3260 */
3261void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3262{
3263        int me, cpu;
3264
3265        if (kvm_vcpu_wake_up(vcpu))
3266                return;
3267
3268        /*
3269         * Note, the vCPU could get migrated to a different pCPU at any point
3270         * after kvm_arch_vcpu_should_kick(), which could result in sending an
3271         * IPI to the previous pCPU.  But, that's ok because the purpose of the
3272         * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3273         * vCPU also requires it to leave IN_GUEST_MODE.
3274         */
3275        me = get_cpu();
3276        if (kvm_arch_vcpu_should_kick(vcpu)) {
3277                cpu = READ_ONCE(vcpu->cpu);
3278                if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3279                        smp_send_reschedule(cpu);
3280        }
3281        put_cpu();
3282}
3283EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3284#endif /* !CONFIG_S390 */
3285
3286int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3287{
3288        struct pid *pid;
3289        struct task_struct *task = NULL;
3290        int ret = 0;
3291
3292        rcu_read_lock();
3293        pid = rcu_dereference(target->pid);
3294        if (pid)
3295                task = get_pid_task(pid, PIDTYPE_PID);
3296        rcu_read_unlock();
3297        if (!task)
3298                return ret;
3299        ret = yield_to(task, 1);
3300        put_task_struct(task);
3301
3302        return ret;
3303}
3304EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3305
3306/*
3307 * Helper that checks whether a VCPU is eligible for directed yield.
3308 * The most eligible candidate to yield to is chosen by the following heuristics:
3309 *
3310 *  (a) A VCPU which has not recently taken a PLE exit or had cpu-relax
3311 *  intercepted (a preempted lock holder), indicated by @in_spin_loop, which
3312 *  is set at the beginning and cleared at the end of the PLE handler.
3313 *
3314 *  (b) A VCPU which has taken a PLE exit / cpu-relax intercept but did not
3315 *  get a chance last time (it has most likely become eligible now, since we
3316 *  probably yielded to the lock holder in the last iteration).  This is done
3317 *  by toggling @dy_eligible each time a VCPU is checked for eligibility.
3318 *
3319 *  Yielding to a VCPU that recently PLE-exited or had cpu-relax intercepted,
3320 *  before yielding to the preempted lock holder, could result in the wrong
3321 *  VCPU being selected and in CPU burning.  Giving priority to a potential
3322 *  lock holder increases lock progress.
3323 *
3324 *  Since the algorithm is based on heuristics, accessing another VCPU's data
3325 *  without locking does no harm: at worst we try to yield to the same VCPU,
3326 *  fail, and continue with the next VCPU.
3327 */
3328static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3329{
3330#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3331        bool eligible;
3332
3333        eligible = !vcpu->spin_loop.in_spin_loop ||
3334                    vcpu->spin_loop.dy_eligible;
3335
3336        if (vcpu->spin_loop.in_spin_loop)
3337                kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3338
3339        return eligible;
3340#else
3341        return true;
3342#endif
3343}
3344
3345/*
3346 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3347 * a vcpu_load/vcpu_put pair.  However, for most architectures
3348 * kvm_arch_vcpu_runnable does not require vcpu_load.
3349 */
3350bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3351{
3352        return kvm_arch_vcpu_runnable(vcpu);
3353}
3354
3355static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3356{
3357        if (kvm_arch_dy_runnable(vcpu))
3358                return true;
3359
3360#ifdef CONFIG_KVM_ASYNC_PF
3361        if (!list_empty_careful(&vcpu->async_pf.done))
3362                return true;
3363#endif
3364
3365        return false;
3366}
3367
3368bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3369{
3370        return false;
3371}
3372
3373void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3374{
3375        struct kvm *kvm = me->kvm;
3376        struct kvm_vcpu *vcpu;
3377        int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3378        int yielded = 0;
3379        int try = 3;
3380        int pass;
3381        int i;
3382
3383        kvm_vcpu_set_in_spin_loop(me, true);
3384        /*
3385         * We boost the priority of a VCPU that is runnable but not
3386         * currently running, because it got preempted by something
3387         * else and called schedule in __vcpu_run.  Hopefully that
3388         * VCPU is holding the lock that we need and will release it.
3389         * We approximate round-robin by starting at the last boosted VCPU.
3390         */
3391        for (pass = 0; pass < 2 && !yielded && try; pass++) {
3392                kvm_for_each_vcpu(i, vcpu, kvm) {
3393                        if (!pass && i <= last_boosted_vcpu) {
3394                                i = last_boosted_vcpu;
3395                                continue;
3396                        } else if (pass && i > last_boosted_vcpu)
3397                                break;
3398                        if (!READ_ONCE(vcpu->ready))
3399                                continue;
3400                        if (vcpu == me)
3401                                continue;
3402                        if (rcuwait_active(&vcpu->wait) &&
3403                            !vcpu_dy_runnable(vcpu))
3404                                continue;
3405                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3406                            !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3407                            !kvm_arch_vcpu_in_kernel(vcpu))
3408                                continue;
3409                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3410                                continue;
3411
3412                        yielded = kvm_vcpu_yield_to(vcpu);
3413                        if (yielded > 0) {
3414                                kvm->last_boosted_vcpu = i;
3415                                break;
3416                        } else if (yielded < 0) {
3417                                try--;
3418                                if (!try)
3419                                        break;
3420                        }
3421                }
3422        }
3423        kvm_vcpu_set_in_spin_loop(me, false);
3424
3425        /* Ensure vcpu is not eligible during next spinloop */
3426        kvm_vcpu_set_dy_eligible(me, false);
3427}
3428EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
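
/*
 * Illustrative trace of the heuristic above (values hypothetical): a vCPU
 * currently in the PLE handler (in_spin_loop == true) with dy_eligible ==
 * false is skipped as a yield target, but the check toggles dy_eligible so
 * the next boosting pass may pick it; a vCPU outside a spin loop is always
 * eligible.
 */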
3429
3430static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3431{
3432#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3433        return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3434            (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3435             kvm->dirty_ring_size / PAGE_SIZE);
3436#else
3437        return false;
3438#endif
3439}
3440
3441static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3442{
3443        struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3444        struct page *page;
3445
3446        if (vmf->pgoff == 0)
3447                page = virt_to_page(vcpu->run);
3448#ifdef CONFIG_X86
3449        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3450                page = virt_to_page(vcpu->arch.pio_data);
3451#endif
3452#ifdef CONFIG_KVM_MMIO
3453        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3454                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3455#endif
3456        else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3457                page = kvm_dirty_ring_get_page(
3458                    &vcpu->dirty_ring,
3459                    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3460        else
3461                return kvm_arch_vcpu_fault(vcpu, vmf);
3462        get_page(page);
3463        vmf->page = page;
3464        return 0;
3465}
3466
3467static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3468        .fault = kvm_vcpu_fault,
3469};
3470
3471static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3472{
3473        struct kvm_vcpu *vcpu = file->private_data;
3474        unsigned long pages = vma_pages(vma);
3475
3476        if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3477             kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3478            ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3479                return -EINVAL;
3480
3481        vma->vm_ops = &kvm_vcpu_vm_ops;
3482        return 0;
3483}
3484
3485static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3486{
3487        struct kvm_vcpu *vcpu = filp->private_data;
3488
3489        kvm_put_kvm(vcpu->kvm);
3490        return 0;
3491}
3492
3493static struct file_operations kvm_vcpu_fops = {
3494        .release        = kvm_vcpu_release,
3495        .unlocked_ioctl = kvm_vcpu_ioctl,
3496        .mmap           = kvm_vcpu_mmap,
3497        .llseek         = noop_llseek,
3498        KVM_COMPAT(kvm_vcpu_compat_ioctl),
3499};
3500
3501/*
3502 * Allocates an inode for the vcpu.
3503 */
3504static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3505{
3506        char name[8 + 1 + ITOA_MAX_LEN + 1];
3507
3508        snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3509        return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3510}
3511
3512static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3513{
3514#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3515        struct dentry *debugfs_dentry;
3516        char dir_name[ITOA_MAX_LEN * 2];
3517
3518        if (!debugfs_initialized())
3519                return;
3520
3521        snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3522        debugfs_dentry = debugfs_create_dir(dir_name,
3523                                            vcpu->kvm->debugfs_dentry);
3524
3525        kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3526#endif
3527}
3528
3529/*
3530 * Creates some virtual cpus.  Good luck creating more than one.
3531 */
3532static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3533{
3534        int r;
3535        struct kvm_vcpu *vcpu;
3536        struct page *page;
3537
3538        if (id >= KVM_MAX_VCPU_IDS)
3539                return -EINVAL;
3540
3541        mutex_lock(&kvm->lock);
3542        if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3543                mutex_unlock(&kvm->lock);
3544                return -EINVAL;
3545        }
3546
3547        kvm->created_vcpus++;
3548        mutex_unlock(&kvm->lock);
3549
3550        r = kvm_arch_vcpu_precreate(kvm, id);
3551        if (r)
3552                goto vcpu_decrement;
3553
3554        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3555        if (!vcpu) {
3556                r = -ENOMEM;
3557                goto vcpu_decrement;
3558        }
3559
3560        BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3561        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3562        if (!page) {
3563                r = -ENOMEM;
3564                goto vcpu_free;
3565        }
3566        vcpu->run = page_address(page);
3567
3568        kvm_vcpu_init(vcpu, kvm, id);
3569
3570        r = kvm_arch_vcpu_create(vcpu);
3571        if (r)
3572                goto vcpu_free_run_page;
3573
3574        if (kvm->dirty_ring_size) {
3575                r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3576                                         id, kvm->dirty_ring_size);
3577                if (r)
3578                        goto arch_vcpu_destroy;
3579        }
3580
3581        mutex_lock(&kvm->lock);
3582        if (kvm_get_vcpu_by_id(kvm, id)) {
3583                r = -EEXIST;
3584                goto unlock_vcpu_destroy;
3585        }
3586
3587        vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3588        BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3589
3590        /* Fill the stats id string for the vcpu */
3591        snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3592                 task_pid_nr(current), id);
3593
3594        /* Now it's all set up, let userspace reach it */
3595        kvm_get_kvm(kvm);
3596        r = create_vcpu_fd(vcpu);
3597        if (r < 0) {
3598                kvm_put_kvm_no_destroy(kvm);
3599                goto unlock_vcpu_destroy;
3600        }
3601
3602        kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3603
3604        /*
3605         * Pairs with smp_rmb() in kvm_get_vcpu.  Ensure the store to
3606         * kvm->vcpus is visible before kvm->online_vcpus is incremented.
3607         */
3608        smp_wmb();
3609        atomic_inc(&kvm->online_vcpus);
3610
3611        mutex_unlock(&kvm->lock);
3612        kvm_arch_vcpu_postcreate(vcpu);
3613        kvm_create_vcpu_debugfs(vcpu);
3614        return r;
3615
3616unlock_vcpu_destroy:
3617        mutex_unlock(&kvm->lock);
3618        kvm_dirty_ring_free(&vcpu->dirty_ring);
3619arch_vcpu_destroy:
3620        kvm_arch_vcpu_destroy(vcpu);
3621vcpu_free_run_page:
3622        free_page((unsigned long)vcpu->run);
3623vcpu_free:
3624        kmem_cache_free(kvm_vcpu_cache, vcpu);
3625vcpu_decrement:
3626        mutex_lock(&kvm->lock);
3627        kvm->created_vcpus--;
3628        mutex_unlock(&kvm->lock);
3629        return r;
3630}
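
/*
 * Illustrative userspace sketch (hypothetical, wrapped in #if 0 so it is
 * never built; kvm_fd/vm_fd are assumed descriptors): KVM_CREATE_VCPU
 * returns a new file descriptor, and the shared kvm_run structure is reached
 * by mmap()ing that descriptor with the size reported by
 * KVM_GET_VCPU_MMAP_SIZE on /dev/kvm.
 */
#if 0
	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);		/* vcpu id 0 */
	long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu_fd, 0);
#endif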
3631
3632static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3633{
3634        if (sigset) {
3635                sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3636                vcpu->sigset_active = 1;
3637                vcpu->sigset = *sigset;
3638        } else
3639                vcpu->sigset_active = 0;
3640        return 0;
3641}
3642
3643static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3644                              size_t size, loff_t *offset)
3645{
3646        struct kvm_vcpu *vcpu = file->private_data;
3647
3648        return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3649                        &kvm_vcpu_stats_desc[0], &vcpu->stat,
3650                        sizeof(vcpu->stat), user_buffer, size, offset);
3651}
3652
3653static const struct file_operations kvm_vcpu_stats_fops = {
3654        .read = kvm_vcpu_stats_read,
3655        .llseek = noop_llseek,
3656};
3657
3658static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3659{
3660        int fd;
3661        struct file *file;
3662        char name[15 + ITOA_MAX_LEN + 1];
3663
3664        snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3665
3666        fd = get_unused_fd_flags(O_CLOEXEC);
3667        if (fd < 0)
3668                return fd;
3669
3670        file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3671        if (IS_ERR(file)) {
3672                put_unused_fd(fd);
3673                return PTR_ERR(file);
3674        }
3675        file->f_mode |= FMODE_PREAD;
3676        fd_install(fd, file);
3677
3678        return fd;
3679}
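
/*
 * Illustrative userspace sketch (hypothetical, wrapped in #if 0 so it is
 * never built): the binary stats descriptor is read-only and pread()-able,
 * so the header can be fetched once at offset 0 and the statistics data
 * re-read repeatedly at the offset the header advertises.
 */
#if 0
	int stats_fd = ioctl(vcpu_fd, KVM_GET_STATS_FD, NULL);
	struct kvm_stats_header hdr;

	pread(stats_fd, &hdr, sizeof(hdr), 0);
	/* descriptors live at hdr.desc_offset, values at hdr.data_offset */
#endif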
3680
3681static long kvm_vcpu_ioctl(struct file *filp,
3682                           unsigned int ioctl, unsigned long arg)
3683{
3684        struct kvm_vcpu *vcpu = filp->private_data;
3685        void __user *argp = (void __user *)arg;
3686        int r;
3687        struct kvm_fpu *fpu = NULL;
3688        struct kvm_sregs *kvm_sregs = NULL;
3689
3690        if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3691                return -EIO;
3692
3693        if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3694                return -EINVAL;
3695
3696        /*
3697         * Some architectures have vcpu ioctls that are asynchronous to vcpu
3698         * execution; mutex_lock() would break them.
3699         */
3700        r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3701        if (r != -ENOIOCTLCMD)
3702                return r;
3703
3704        if (mutex_lock_killable(&vcpu->mutex))
3705                return -EINTR;
3706        switch (ioctl) {
3707        case KVM_RUN: {
3708                struct pid *oldpid;
3709                r = -EINVAL;
3710                if (arg)
3711                        goto out;
3712                oldpid = rcu_access_pointer(vcpu->pid);
3713                if (unlikely(oldpid != task_pid(current))) {
3714                        /* The thread running this VCPU changed. */
3715                        struct pid *newpid;
3716
3717                        r = kvm_arch_vcpu_run_pid_change(vcpu);
3718                        if (r)
3719                                break;
3720
3721                        newpid = get_task_pid(current, PIDTYPE_PID);
3722                        rcu_assign_pointer(vcpu->pid, newpid);
3723                        if (oldpid)
3724                                synchronize_rcu();
3725                        put_pid(oldpid);
3726                }
3727                r = kvm_arch_vcpu_ioctl_run(vcpu);
3728                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3729                break;
3730        }
3731        case KVM_GET_REGS: {
3732                struct kvm_regs *kvm_regs;
3733
3734                r = -ENOMEM;
3735                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3736                if (!kvm_regs)
3737                        goto out;
3738                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3739                if (r)
3740                        goto out_free1;
3741                r = -EFAULT;
3742                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3743                        goto out_free1;
3744                r = 0;
3745out_free1:
3746                kfree(kvm_regs);
3747                break;
3748        }
3749        case KVM_SET_REGS: {
3750                struct kvm_regs *kvm_regs;
3751
3752                kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3753                if (IS_ERR(kvm_regs)) {
3754                        r = PTR_ERR(kvm_regs);
3755                        goto out;
3756                }
3757                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3758                kfree(kvm_regs);
3759                break;
3760        }
3761        case KVM_GET_SREGS: {
3762                kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3763                                    GFP_KERNEL_ACCOUNT);
3764                r = -ENOMEM;
3765                if (!kvm_sregs)
3766                        goto out;
3767                r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3768                if (r)
3769                        goto out;
3770                r = -EFAULT;
3771                if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3772                        goto out;
3773                r = 0;
3774                break;
3775        }
3776        case KVM_SET_SREGS: {
3777                kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3778                if (IS_ERR(kvm_sregs)) {
3779                        r = PTR_ERR(kvm_sregs);
3780                        kvm_sregs = NULL;
3781                        goto out;
3782                }
3783                r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3784                break;
3785        }
3786        case KVM_GET_MP_STATE: {
3787                struct kvm_mp_state mp_state;
3788
3789                r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3790                if (r)
3791                        goto out;
3792                r = -EFAULT;
3793                if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3794                        goto out;
3795                r = 0;
3796                break;
3797        }
3798        case KVM_SET_MP_STATE: {
3799                struct kvm_mp_state mp_state;
3800
3801                r = -EFAULT;
3802                if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3803                        goto out;
3804                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3805                break;
3806        }
3807        case KVM_TRANSLATE: {
3808                struct kvm_translation tr;
3809
3810                r = -EFAULT;
3811                if (copy_from_user(&tr, argp, sizeof(tr)))
3812                        goto out;
3813                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3814                if (r)
3815                        goto out;
3816                r = -EFAULT;
3817                if (copy_to_user(argp, &tr, sizeof(tr)))
3818                        goto out;
3819                r = 0;
3820                break;
3821        }
3822        case KVM_SET_GUEST_DEBUG: {
3823                struct kvm_guest_debug dbg;
3824
3825                r = -EFAULT;
3826                if (copy_from_user(&dbg, argp, sizeof(dbg)))
3827                        goto out;
3828                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3829                break;
3830        }
3831        case KVM_SET_SIGNAL_MASK: {
3832                struct kvm_signal_mask __user *sigmask_arg = argp;
3833                struct kvm_signal_mask kvm_sigmask;
3834                sigset_t sigset, *p;
3835
3836                p = NULL;
3837                if (argp) {
3838                        r = -EFAULT;
3839                        if (copy_from_user(&kvm_sigmask, argp,
3840                                           sizeof(kvm_sigmask)))
3841                                goto out;
3842                        r = -EINVAL;
3843                        if (kvm_sigmask.len != sizeof(sigset))
3844                                goto out;
3845                        r = -EFAULT;
3846                        if (copy_from_user(&sigset, sigmask_arg->sigset,
3847                                           sizeof(sigset)))
3848                                goto out;
3849                        p = &sigset;
3850                }
3851                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3852                break;
3853        }
3854        case KVM_GET_FPU: {
3855                fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3856                r = -ENOMEM;
3857                if (!fpu)
3858                        goto out;
3859                r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3860                if (r)
3861                        goto out;
3862                r = -EFAULT;
3863                if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3864                        goto out;
3865                r = 0;
3866                break;
3867        }
3868        case KVM_SET_FPU: {
3869                fpu = memdup_user(argp, sizeof(*fpu));
3870                if (IS_ERR(fpu)) {
3871                        r = PTR_ERR(fpu);
3872                        fpu = NULL;
3873                        goto out;
3874                }
3875                r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3876                break;
3877        }
3878        case KVM_GET_STATS_FD: {
3879                r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3880                break;
3881        }
3882        default:
3883                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3884        }
3885out:
3886        mutex_unlock(&vcpu->mutex);
3887        kfree(fpu);
3888        kfree(kvm_sregs);
3889        return r;
3890}
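
/*
 * Illustrative userspace sketch (hypothetical helper, wrapped in #if 0 so it
 * is never built): the canonical vCPU loop issues KVM_RUN with a zero
 * argument and dispatches on run->exit_reason once the ioctl returns.
 */
#if 0
static int example_run_loop(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;

		switch (run->exit_reason) {
		case KVM_EXIT_IO:
		case KVM_EXIT_MMIO:
			/* emulate the access, then re-enter the guest */
			break;
		case KVM_EXIT_SHUTDOWN:
			return 0;
		}
	}
}
#endif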
3891
3892#ifdef CONFIG_KVM_COMPAT
3893static long kvm_vcpu_compat_ioctl(struct file *filp,
3894                                  unsigned int ioctl, unsigned long arg)
3895{
3896        struct kvm_vcpu *vcpu = filp->private_data;
3897        void __user *argp = compat_ptr(arg);
3898        int r;
3899
3900        if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
3901                return -EIO;
3902
3903        switch (ioctl) {
3904        case KVM_SET_SIGNAL_MASK: {
3905                struct kvm_signal_mask __user *sigmask_arg = argp;
3906                struct kvm_signal_mask kvm_sigmask;
3907                sigset_t sigset;
3908
3909                if (argp) {
3910                        r = -EFAULT;
3911                        if (copy_from_user(&kvm_sigmask, argp,
3912                                           sizeof(kvm_sigmask)))
3913                                goto out;
3914                        r = -EINVAL;
3915                        if (kvm_sigmask.len != sizeof(compat_sigset_t))
3916                                goto out;
3917                        r = -EFAULT;
3918                        if (get_compat_sigset(&sigset,
3919                                              (compat_sigset_t __user *)sigmask_arg->sigset))
3920                                goto out;
3921                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3922                } else
3923                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3924                break;
3925        }
3926        default:
3927                r = kvm_vcpu_ioctl(filp, ioctl, arg);
3928        }
3929
3930out:
3931        return r;
3932}
3933#endif
3934
3935static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3936{
3937        struct kvm_device *dev = filp->private_data;
3938
3939        if (dev->ops->mmap)
3940                return dev->ops->mmap(dev, vma);
3941
3942        return -ENODEV;
3943}
3944
3945static int kvm_device_ioctl_attr(struct kvm_device *dev,
3946                                 int (*accessor)(struct kvm_device *dev,
3947                                                 struct kvm_device_attr *attr),
3948                                 unsigned long arg)
3949{
3950        struct kvm_device_attr attr;
3951
3952        if (!accessor)
3953                return -EPERM;
3954
3955        if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3956                return -EFAULT;
3957
3958        return accessor(dev, &attr);
3959}
3960
3961static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3962                             unsigned long arg)
3963{
3964        struct kvm_device *dev = filp->private_data;
3965
3966        if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
3967                return -EIO;
3968
3969        switch (ioctl) {
3970        case KVM_SET_DEVICE_ATTR:
3971                return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3972        case KVM_GET_DEVICE_ATTR:
3973                return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3974        case KVM_HAS_DEVICE_ATTR:
3975                return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3976        default:
3977                if (dev->ops->ioctl)
3978                        return dev->ops->ioctl(dev, ioctl, arg);
3979
3980                return -ENOTTY;
3981        }
3982}
3983
3984static int kvm_device_release(struct inode *inode, struct file *filp)
3985{
3986        struct kvm_device *dev = filp->private_data;
3987        struct kvm *kvm = dev->kvm;
3988
3989        if (dev->ops->release) {
3990                mutex_lock(&kvm->lock);
3991                list_del(&dev->vm_node);
3992                dev->ops->release(dev);
3993                mutex_unlock(&kvm->lock);
3994        }
3995
3996        kvm_put_kvm(kvm);
3997        return 0;
3998}
3999
4000static const struct file_operations kvm_device_fops = {
4001        .unlocked_ioctl = kvm_device_ioctl,
4002        .release = kvm_device_release,
4003        KVM_COMPAT(kvm_device_ioctl),
4004        .mmap = kvm_device_mmap,
4005};
4006
4007struct kvm_device *kvm_device_from_filp(struct file *filp)
4008{
4009        if (filp->f_op != &kvm_device_fops)
4010                return NULL;
4011
4012        return filp->private_data;
4013}
4014
4015static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4016#ifdef CONFIG_KVM_MPIC
4017        [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
4018        [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
4019#endif
4020};
4021
4022int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4023{
4024        if (type >= ARRAY_SIZE(kvm_device_ops_table))
4025                return -ENOSPC;
4026
4027        if (kvm_device_ops_table[type] != NULL)
4028                return -EEXIST;
4029
4030        kvm_device_ops_table[type] = ops;
4031        return 0;
4032}
4033
4034void kvm_unregister_device_ops(u32 type)
4035{
4036        if (kvm_device_ops_table[type] != NULL)
4037                kvm_device_ops_table[type] = NULL;
4038}
4039
4040static int kvm_ioctl_create_device(struct kvm *kvm,
4041                                   struct kvm_create_device *cd)
4042{
4043        const struct kvm_device_ops *ops = NULL;
4044        struct kvm_device *dev;
4045        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4046        int type;
4047        int ret;
4048
4049        if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4050                return -ENODEV;
4051
4052        type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4053        ops = kvm_device_ops_table[type];
4054        if (ops == NULL)
4055                return -ENODEV;
4056
4057        if (test)
4058                return 0;
4059
4060        dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4061        if (!dev)
4062                return -ENOMEM;
4063
4064        dev->ops = ops;
4065        dev->kvm = kvm;
4066
4067        mutex_lock(&kvm->lock);
4068        ret = ops->create(dev, type);
4069        if (ret < 0) {
4070                mutex_unlock(&kvm->lock);
4071                kfree(dev);
4072                return ret;
4073        }
4074        list_add(&dev->vm_node, &kvm->devices);
4075        mutex_unlock(&kvm->lock);
4076
4077        if (ops->init)
4078                ops->init(dev);
4079
4080        kvm_get_kvm(kvm);
4081        ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4082        if (ret < 0) {
4083                kvm_put_kvm_no_destroy(kvm);
4084                mutex_lock(&kvm->lock);
4085                list_del(&dev->vm_node);
4086                mutex_unlock(&kvm->lock);
4087                ops->destroy(dev);
4088                return ret;
4089        }
4090
4091        cd->fd = ret;
4092        return 0;
4093}
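
/*
 * Illustrative userspace sketch (hypothetical, wrapped in #if 0 so it is
 * never built): KVM_CREATE_DEVICE_TEST probes whether a device type is
 * supported without instantiating it; without the flag, cd.fd is filled in
 * with the new device descriptor.
 */
#if 0
	struct kvm_create_device cd = {
		.type  = KVM_DEV_TYPE_VFIO,
		.flags = KVM_CREATE_DEVICE_TEST,	/* probe only */
	};

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0)
		/* supported; repeat without the flag to obtain cd.fd */;
#endif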
4094
4095static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4096{
4097        switch (arg) {
4098        case KVM_CAP_USER_MEMORY:
4099        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4100        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4101        case KVM_CAP_INTERNAL_ERROR_DATA:
4102#ifdef CONFIG_HAVE_KVM_MSI
4103        case KVM_CAP_SIGNAL_MSI:
4104#endif
4105#ifdef CONFIG_HAVE_KVM_IRQFD
4106        case KVM_CAP_IRQFD:
4107        case KVM_CAP_IRQFD_RESAMPLE:
4108#endif
4109        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4110        case KVM_CAP_CHECK_EXTENSION_VM:
4111        case KVM_CAP_ENABLE_CAP_VM:
4112        case KVM_CAP_HALT_POLL:
4113                return 1;
4114#ifdef CONFIG_KVM_MMIO
4115        case KVM_CAP_COALESCED_MMIO:
4116                return KVM_COALESCED_MMIO_PAGE_OFFSET;
4117        case KVM_CAP_COALESCED_PIO:
4118                return 1;
4119#endif
4120#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4121        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4122                return KVM_DIRTY_LOG_MANUAL_CAPS;
4123#endif
4124#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4125        case KVM_CAP_IRQ_ROUTING:
4126                return KVM_MAX_IRQ_ROUTES;
4127#endif
4128#if KVM_ADDRESS_SPACE_NUM > 1
4129        case KVM_CAP_MULTI_ADDRESS_SPACE:
4130                return KVM_ADDRESS_SPACE_NUM;
4131#endif
4132        case KVM_CAP_NR_MEMSLOTS:
4133                return KVM_USER_MEM_SLOTS;
4134        case KVM_CAP_DIRTY_LOG_RING:
4135#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4136                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4137#else
4138                return 0;
4139#endif
4140        case KVM_CAP_BINARY_STATS_FD:
4141                return 1;
4142        default:
4143                break;
4144        }
4145        return kvm_vm_ioctl_check_extension(kvm, arg);
4146}
4147
4148static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4149{
4150        int r;
4151
4152        if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4153                return -EINVAL;
4154
4155        /* the size must be a power of 2 */
4156        if (!size || (size & (size - 1)))
4157                return -EINVAL;
4158
4159        /* Must cover the reserved entries and be at least a page */
4160        if (size < kvm_dirty_ring_get_rsvd_entries() *
4161            sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4162                return -EINVAL;
4163
4164        if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4165            sizeof(struct kvm_dirty_gfn))
4166                return -E2BIG;
4167
4168        /* The ring size may only be set once */
4169        if (kvm->dirty_ring_size)
4170                return -EINVAL;
4171
4172        mutex_lock(&kvm->lock);
4173
4174        if (kvm->created_vcpus) {
4175                /* The size cannot be changed after vCPUs have been created */
4176                r = -EINVAL;
4177        } else {
4178                kvm->dirty_ring_size = size;
4179                r = 0;
4180        }
4181
4182        mutex_unlock(&kvm->lock);
4183        return r;
4184}
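
/*
 * Illustrative userspace sketch (hypothetical, wrapped in #if 0 so it is
 * never built; the size is an example value): the ring size is passed in
 * bytes through KVM_ENABLE_CAP before any vCPU exists, and must satisfy the
 * power-of-two, minimum and maximum checks above.
 */
#if 0
	struct kvm_enable_cap cap = {
		.cap	 = KVM_CAP_DIRTY_LOG_RING,
		.args[0] = 65536,	/* bytes, power of two */
	};

	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
#endif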
4185
4186static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4187{
4188        int i;
4189        struct kvm_vcpu *vcpu;
4190        int cleared = 0;
4191
4192        if (!kvm->dirty_ring_size)
4193                return -EINVAL;
4194
4195        mutex_lock(&kvm->slots_lock);
4196
4197        kvm_for_each_vcpu(i, vcpu, kvm)
4198                cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4199
4200        mutex_unlock(&kvm->slots_lock);
4201
4202        if (cleared)
4203                kvm_flush_remote_tlbs(kvm);
4204
4205        return cleared;
4206}
4207
4208int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4209                                                  struct kvm_enable_cap *cap)
4210{
4211        return -EINVAL;
4212}
4213
4214static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4215                                           struct kvm_enable_cap *cap)
4216{
4217        switch (cap->cap) {
4218#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4219        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4220                u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4221
4222                if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4223                        allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4224
4225                if (cap->flags || (cap->args[0] & ~allowed_options))
4226                        return -EINVAL;
4227                kvm->manual_dirty_log_protect = cap->args[0];
4228                return 0;
4229        }
4230#endif
4231        case KVM_CAP_HALT_POLL: {
4232                if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4233                        return -EINVAL;
4234
4235                kvm->max_halt_poll_ns = cap->args[0];
4236                return 0;
4237        }
4238        case KVM_CAP_DIRTY_LOG_RING:
4239                return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4240        default:
4241                return kvm_vm_ioctl_enable_cap(kvm, cap);
4242        }
4243}
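
/*
 * Illustrative userspace sketch (hypothetical, wrapped in #if 0 so it is
 * never built; 200000 is an example value): the per-VM halt-polling ceiling
 * consumed by grow_halt_poll_ns() is set through KVM_ENABLE_CAP; args[0] is
 * in nanoseconds, must fit in an unsigned int, and 0 disables polling for
 * the VM.
 */
#if 0
	struct kvm_enable_cap cap = {
		.cap	 = KVM_CAP_HALT_POLL,
		.args[0] = 200000,	/* 200us ceiling */
	};

	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
#endif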
4244
4245static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4246                              size_t size, loff_t *offset)
4247{
4248        struct kvm *kvm = file->private_data;
4249
4250        return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4251                                &kvm_vm_stats_desc[0], &kvm->stat,
4252                                sizeof(kvm->stat), user_buffer, size, offset);
4253}
4254
4255static const struct file_operations kvm_vm_stats_fops = {
4256        .read = kvm_vm_stats_read,
4257        .llseek = noop_llseek,
4258};
4259
4260static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4261{
4262        int fd;
4263        struct file *file;
4264
4265        fd = get_unused_fd_flags(O_CLOEXEC);
4266        if (fd < 0)
4267                return fd;
4268
4269        file = anon_inode_getfile("kvm-vm-stats",
4270                        &kvm_vm_stats_fops, kvm, O_RDONLY);
4271        if (IS_ERR(file)) {
4272                put_unused_fd(fd);
4273                return PTR_ERR(file);
4274        }
4275        file->f_mode |= FMODE_PREAD;
4276        fd_install(fd, file);
4277
4278        return fd;
4279}
4280
4281static long kvm_vm_ioctl(struct file *filp,
4282                           unsigned int ioctl, unsigned long arg)
4283{
4284        struct kvm *kvm = filp->private_data;
4285        void __user *argp = (void __user *)arg;
4286        int r;
4287
4288        if (kvm->mm != current->mm || kvm->vm_dead)
4289                return -EIO;
4290        switch (ioctl) {
4291        case KVM_CREATE_VCPU:
4292                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4293                break;
4294        case KVM_ENABLE_CAP: {
4295                struct kvm_enable_cap cap;
4296
4297                r = -EFAULT;
4298                if (copy_from_user(&cap, argp, sizeof(cap)))
4299                        goto out;
4300                r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4301                break;
4302        }
4303        case KVM_SET_USER_MEMORY_REGION: {
4304                struct kvm_userspace_memory_region kvm_userspace_mem;
4305
4306                r = -EFAULT;
4307                if (copy_from_user(&kvm_userspace_mem, argp,
4308                                                sizeof(kvm_userspace_mem)))
4309                        goto out;
4310
4311                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4312                break;
4313        }
4314        case KVM_GET_DIRTY_LOG: {
4315                struct kvm_dirty_log log;
4316
4317                r = -EFAULT;
4318                if (copy_from_user(&log, argp, sizeof(log)))
4319                        goto out;
4320                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4321                break;
4322        }
4323#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4324        case KVM_CLEAR_DIRTY_LOG: {
4325                struct kvm_clear_dirty_log log;
4326
4327                r = -EFAULT;
4328                if (copy_from_user(&log, argp, sizeof(log)))
4329                        goto out;
4330                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4331                break;
4332        }
4333#endif
4334#ifdef CONFIG_KVM_MMIO
4335        case KVM_REGISTER_COALESCED_MMIO: {
4336                struct kvm_coalesced_mmio_zone zone;
4337
4338                r = -EFAULT;
4339                if (copy_from_user(&zone, argp, sizeof(zone)))
4340                        goto out;
4341                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4342                break;
4343        }
4344        case KVM_UNREGISTER_COALESCED_MMIO: {
4345                struct kvm_coalesced_mmio_zone zone;
4346
4347                r = -EFAULT;
4348                if (copy_from_user(&zone, argp, sizeof(zone)))
4349                        goto out;
4350                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4351                break;
4352        }
4353#endif
4354        case KVM_IRQFD: {
4355                struct kvm_irqfd data;
4356
4357                r = -EFAULT;
4358                if (copy_from_user(&data, argp, sizeof(data)))
4359                        goto out;
4360                r = kvm_irqfd(kvm, &data);
4361                break;
4362        }
4363        case KVM_IOEVENTFD: {
4364                struct kvm_ioeventfd data;
4365
4366                r = -EFAULT;
4367                if (copy_from_user(&data, argp, sizeof(data)))
4368                        goto out;
4369                r = kvm_ioeventfd(kvm, &data);
4370                break;
4371        }
4372#ifdef CONFIG_HAVE_KVM_MSI
4373        case KVM_SIGNAL_MSI: {
4374                struct kvm_msi msi;
4375
4376                r = -EFAULT;
4377                if (copy_from_user(&msi, argp, sizeof(msi)))
4378                        goto out;
4379                r = kvm_send_userspace_msi(kvm, &msi);
4380                break;
4381        }
4382#endif
4383#ifdef __KVM_HAVE_IRQ_LINE
4384        case KVM_IRQ_LINE_STATUS:
4385        case KVM_IRQ_LINE: {
4386                struct kvm_irq_level irq_event;
4387
4388                r = -EFAULT;
4389                if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4390                        goto out;
4391
4392                r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4393                                        ioctl == KVM_IRQ_LINE_STATUS);
4394                if (r)
4395                        goto out;
4396
4397                r = -EFAULT;
4398                if (ioctl == KVM_IRQ_LINE_STATUS) {
4399                        if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4400                                goto out;
4401                }
4402
4403                r = 0;
4404                break;
4405        }
4406#endif
4407#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4408        case KVM_SET_GSI_ROUTING: {
4409                struct kvm_irq_routing routing;
4410                struct kvm_irq_routing __user *urouting;
4411                struct kvm_irq_routing_entry *entries = NULL;
4412
4413                r = -EFAULT;
4414                if (copy_from_user(&routing, argp, sizeof(routing)))
4415                        goto out;
4416                r = -EINVAL;
4417                if (!kvm_arch_can_set_irq_routing(kvm))
4418                        goto out;
4419                if (routing.nr > KVM_MAX_IRQ_ROUTES)
4420                        goto out;
4421                if (routing.flags)
4422                        goto out;
4423                if (routing.nr) {
4424                        urouting = argp;
4425                        entries = vmemdup_user(urouting->entries,
4426                                               array_size(sizeof(*entries),
4427                                                          routing.nr));
4428                        if (IS_ERR(entries)) {
4429                                r = PTR_ERR(entries);
4430                                goto out;
4431                        }
4432                }
4433                r = kvm_set_irq_routing(kvm, entries, routing.nr,
4434                                        routing.flags);
4435                kvfree(entries);
4436                break;
4437        }
4438#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4439        case KVM_CREATE_DEVICE: {
4440                struct kvm_create_device cd;
4441
4442                r = -EFAULT;
4443                if (copy_from_user(&cd, argp, sizeof(cd)))
4444                        goto out;
4445
4446                r = kvm_ioctl_create_device(kvm, &cd);
4447                if (r)
4448                        goto out;
4449
4450                r = -EFAULT;
4451                if (copy_to_user(argp, &cd, sizeof(cd)))
4452                        goto out;
4453
4454                r = 0;
4455                break;
4456        }
4457        case KVM_CHECK_EXTENSION:
4458                r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4459                break;
4460        case KVM_RESET_DIRTY_RINGS:
4461                r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4462                break;
4463        case KVM_GET_STATS_FD:
4464                r = kvm_vm_ioctl_get_stats_fd(kvm);
4465                break;
4466        default:
4467                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4468        }
4469out:
4470        return r;
4471}
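
/*
 * Illustrative userspace sketch (hypothetical, wrapped in #if 0 so it is
 * never built; the 2 MiB size is an example): guest physical memory is wired
 * up by handing an anonymous host mapping to KVM_SET_USER_MEMORY_REGION, and
 * KVM_MEM_LOG_DIRTY_PAGES enables dirty tracking for the slot.
 */
#if 0
	void *host_mem = mmap(NULL, 0x200000, PROT_READ | PROT_WRITE,
			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0,
		.memory_size     = 0x200000,
		.userspace_addr  = (unsigned long)host_mem,
	};

	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
#endif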
4472
4473#ifdef CONFIG_KVM_COMPAT
4474struct compat_kvm_dirty_log {
4475        __u32 slot;
4476        __u32 padding1;
4477        union {
4478                compat_uptr_t dirty_bitmap; /* one bit per page */
4479                __u64 padding2;
4480        };
4481};
4482
4483struct compat_kvm_clear_dirty_log {
4484        __u32 slot;
4485        __u32 num_pages;
4486        __u64 first_page;
4487        union {
4488                compat_uptr_t dirty_bitmap; /* one bit per page */
4489                __u64 padding2;
4490        };
4491};
4492
4493static long kvm_vm_compat_ioctl(struct file *filp,
4494                           unsigned int ioctl, unsigned long arg)
4495{
4496        struct kvm *kvm = filp->private_data;
4497        int r;
4498
4499        if (kvm->mm != current->mm || kvm->vm_dead)
4500                return -EIO;
4501        switch (ioctl) {
4502#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4503        case KVM_CLEAR_DIRTY_LOG: {
4504                struct compat_kvm_clear_dirty_log compat_log;
4505                struct kvm_clear_dirty_log log;
4506
4507                if (copy_from_user(&compat_log, (void __user *)arg,
4508                                   sizeof(compat_log)))
4509                        return -EFAULT;
4510                log.slot         = compat_log.slot;
4511                log.num_pages    = compat_log.num_pages;
4512                log.first_page   = compat_log.first_page;
4513                log.padding2     = compat_log.padding2;
4514                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4515
4516                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4517                break;
4518        }
4519#endif
4520        case KVM_GET_DIRTY_LOG: {
4521                struct compat_kvm_dirty_log compat_log;
4522                struct kvm_dirty_log log;
4523
4524                if (copy_from_user(&compat_log, (void __user *)arg,
4525                                   sizeof(compat_log)))
4526                        return -EFAULT;
4527                log.slot         = compat_log.slot;
4528                log.padding1     = compat_log.padding1;
4529                log.padding2     = compat_log.padding2;
4530                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4531
4532                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4533                break;
4534        }
4535        default:
4536                r = kvm_vm_ioctl(filp, ioctl, arg);
4537        }
4538        return r;
4539}
4540#endif
4541
4542static struct file_operations kvm_vm_fops = {
4543        .release        = kvm_vm_release,
4544        .unlocked_ioctl = kvm_vm_ioctl,
4545        .llseek         = noop_llseek,
4546        KVM_COMPAT(kvm_vm_compat_ioctl),
4547};
4548
4549bool file_is_kvm(struct file *file)
4550{
4551        return file && file->f_op == &kvm_vm_fops;
4552}
4553EXPORT_SYMBOL_GPL(file_is_kvm);
4554
4555static int kvm_dev_ioctl_create_vm(unsigned long type)
4556{
4557        int r;
4558        struct kvm *kvm;
4559        struct file *file;
4560
4561        kvm = kvm_create_vm(type);
4562        if (IS_ERR(kvm))
4563                return PTR_ERR(kvm);
4564#ifdef CONFIG_KVM_MMIO
4565        r = kvm_coalesced_mmio_init(kvm);
4566        if (r < 0)
4567                goto put_kvm;
4568#endif
4569        r = get_unused_fd_flags(O_CLOEXEC);
4570        if (r < 0)
4571                goto put_kvm;
4572
4573        snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4574                        "kvm-%d", task_pid_nr(current));
4575
4576        file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4577        if (IS_ERR(file)) {
4578                put_unused_fd(r);
4579                r = PTR_ERR(file);
4580                goto put_kvm;
4581        }
4582
4583        /*
4584         * Don't call kvm_put_kvm anymore at this point; file->f_op is
4585         * already set, with ->release() being kvm_vm_release().  In error
4586         * cases it will be called by the final fput(file) and will take
4587         * care of doing kvm_put_kvm(kvm).
4588         */
4589        if (kvm_create_vm_debugfs(kvm, r) < 0) {
4590                put_unused_fd(r);
4591                fput(file);
4592                return -ENOMEM;
4593        }
4594        kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4595
4596        fd_install(r, file);
4597        return r;
4598
4599put_kvm:
4600        kvm_put_kvm(kvm);
4601        return r;
4602}
4603
4604static long kvm_dev_ioctl(struct file *filp,
4605                          unsigned int ioctl, unsigned long arg)
4606{
4607        long r = -EINVAL;
4608
4609        switch (ioctl) {
4610        case KVM_GET_API_VERSION:
4611                if (arg)
4612                        goto out;
4613                r = KVM_API_VERSION;
4614                break;
4615        case KVM_CREATE_VM:
4616                r = kvm_dev_ioctl_create_vm(arg);
4617                break;
4618        case KVM_CHECK_EXTENSION:
4619                r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4620                break;
4621        case KVM_GET_VCPU_MMAP_SIZE:
4622                if (arg)
4623                        goto out;
4624                r = PAGE_SIZE;     /* struct kvm_run */
4625#ifdef CONFIG_X86
4626                r += PAGE_SIZE;    /* pio data page */
4627#endif
4628#ifdef CONFIG_KVM_MMIO
4629                r += PAGE_SIZE;    /* coalesced mmio ring page */
4630#endif
4631                break;
4632        case KVM_TRACE_ENABLE:
4633        case KVM_TRACE_PAUSE:
4634        case KVM_TRACE_DISABLE:
4635                r = -EOPNOTSUPP;
4636                break;
4637        default:
4638                return kvm_arch_dev_ioctl(filp, ioctl, arg);
4639        }
4640out:
4641        return r;
4642}
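
/*
 * Illustrative userspace sketch (hypothetical, wrapped in #if 0 so it is
 * never built): /dev/kvm only answers global queries; per-VM and per-vCPU
 * ioctls are issued on the descriptors it hands out.
 */
#if 0
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return -1;	/* incompatible kernel */

	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);	/* type 0: default VM */
#endif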
4643
4644static struct file_operations kvm_chardev_ops = {
4645        .unlocked_ioctl = kvm_dev_ioctl,
4646        .llseek         = noop_llseek,
4647        KVM_COMPAT(kvm_dev_ioctl),
4648};
4649
4650static struct miscdevice kvm_dev = {
4651        KVM_MINOR,
4652        "kvm",
4653        &kvm_chardev_ops,
4654};
4655
4656static void hardware_enable_nolock(void *junk)
4657{
4658        int cpu = raw_smp_processor_id();
4659        int r;
4660
4661        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4662                return;
4663
4664        cpumask_set_cpu(cpu, cpus_hardware_enabled);
4665
4666        r = kvm_arch_hardware_enable();
4667
4668        if (r) {
4669                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4670                atomic_inc(&hardware_enable_failed);
4671                pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4672        }
4673}
4674
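    /*
     * CPU hotplug callbacks, registered in kvm_init() for the
     * CPUHP_AP_KVM_STARTING state: if any VMs exist, virtualization is
     * re-enabled on a CPU coming online and disabled on one going down.
     */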
4675static int kvm_starting_cpu(unsigned int cpu)
4676{
4677        raw_spin_lock(&kvm_count_lock);
4678        if (kvm_usage_count)
4679                hardware_enable_nolock(NULL);
4680        raw_spin_unlock(&kvm_count_lock);
4681        return 0;
4682}
4683
4684static void hardware_disable_nolock(void *junk)
4685{
4686        int cpu = raw_smp_processor_id();
4687
4688        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4689                return;
4690        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4691        kvm_arch_hardware_disable();
4692}
4693
4694static int kvm_dying_cpu(unsigned int cpu)
4695{
4696        raw_spin_lock(&kvm_count_lock);
4697        if (kvm_usage_count)
4698                hardware_disable_nolock(NULL);
4699        raw_spin_unlock(&kvm_count_lock);
4700        return 0;
4701}
4702
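    /*
     * kvm_usage_count tracks the number of existing VMs: each VM creation
     * calls hardware_enable_all() and each destruction hardware_disable_all(),
     * so virtualization is enabled on all CPUs only while at least one VM
     * exists.
     */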
4703static void hardware_disable_all_nolock(void)
4704{
4705        BUG_ON(!kvm_usage_count);
4706
4707        kvm_usage_count--;
4708        if (!kvm_usage_count)
4709                on_each_cpu(hardware_disable_nolock, NULL, 1);
4710}
4711
4712static void hardware_disable_all(void)
4713{
4714        raw_spin_lock(&kvm_count_lock);
4715        hardware_disable_all_nolock();
4716        raw_spin_unlock(&kvm_count_lock);
4717}
4718
4719static int hardware_enable_all(void)
4720{
4721        int r = 0;
4722
4723        raw_spin_lock(&kvm_count_lock);
4724
4725        kvm_usage_count++;
4726        if (kvm_usage_count == 1) {
4727                atomic_set(&hardware_enable_failed, 0);
4728                on_each_cpu(hardware_enable_nolock, NULL, 1);
4729
4730                if (atomic_read(&hardware_enable_failed)) {
4731                        hardware_disable_all_nolock();
4732                        r = -EBUSY;
4733                }
4734        }
4735
4736        raw_spin_unlock(&kvm_count_lock);
4737
4738        return r;
4739}
4740
4741static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4742                      void *v)
4743{
4744        /*
4745         * Some (well, at least mine) BIOSes hang on reboot if the CPU is
4746         * still in VMX root mode.
4747         *
4748         * Intel TXT also requires VMX to be off on all CPUs at shutdown.
4749         */
4750        pr_info("kvm: exiting hardware virtualization\n");
4751        kvm_rebooting = true;
4752        on_each_cpu(hardware_disable_nolock, NULL, 1);
4753        return NOTIFY_OK;
4754}
4755
4756static struct notifier_block kvm_reboot_notifier = {
4757        .notifier_call = kvm_reboot,
4758        .priority = 0,
4759};
4760
4761static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4762{
4763        int i;
4764
4765        for (i = 0; i < bus->dev_count; i++) {
4766                struct kvm_io_device *pos = bus->range[i].dev;
4767
4768                kvm_iodevice_destructor(pos);
4769        }
4770        kfree(bus);
4771}
4772
4773static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4774                                 const struct kvm_io_range *r2)
4775{
4776        gpa_t addr1 = r1->addr;
4777        gpa_t addr2 = r2->addr;
4778
4779        if (addr1 < addr2)
4780                return -1;
4781
4782        /*
4783         * If r2->len == 0, match the exact address.  If r2->len != 0, accept
4784         * any overlapping write.  Any order is fine for overlapping ranges,
4785         * because kvm_io_bus_get_first_dev ensures we process all of them.
4786         */
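            /*
             * Example (illustrative values): r1 = { .addr = 0x104, .len = 2 }
             * and r2 = { .addr = 0x100, .len = 8 } compare equal because r1
             * lies within r2, while r1 = { .addr = 0xfe, .len = 4 } sorts
             * before that r2.
             */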
4787        if (r2->len) {
4788                addr1 += r1->len;
4789                addr2 += r2->len;
4790        }
4791
4792        if (addr1 > addr2)
4793                return 1;
4794
4795        return 0;
4796}
4797
4798static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4799{
4800        return kvm_io_bus_cmp(p1, p2);
4801}
4802
4803static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4804                             gpa_t addr, int len)
4805{
4806        struct kvm_io_range *range, key;
4807        int off;
4808
4809        key = (struct kvm_io_range) {
4810                .addr = addr,
4811                .len = len,
4812        };
4813
4814        range = bsearch(&key, bus->range, bus->dev_count,
4815                        sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4816        if (range == NULL)
4817                return -ENOENT;
4818
4819        off = range - bus->range;
4820
4821        while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4822                off--;
4823
4824        return off;
4825}
4826
4827static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4828                              struct kvm_io_range *range, const void *val)
4829{
4830        int idx;
4831
4832        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4833        if (idx < 0)
4834                return -EOPNOTSUPP;
4835
4836        while (idx < bus->dev_count &&
4837                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4838                if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4839                                        range->len, val))
4840                        return idx;
4841                idx++;
4842        }
4843
4844        return -EOPNOTSUPP;
4845}
4846
4847/* kvm_io_bus_write - called under kvm->slots_lock */
4848int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4849                     int len, const void *val)
4850{
4851        struct kvm_io_bus *bus;
4852        struct kvm_io_range range;
4853        int r;
4854
4855        range = (struct kvm_io_range) {
4856                .addr = addr,
4857                .len = len,
4858        };
4859
4860        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4861        if (!bus)
4862                return -ENOMEM;
4863        r = __kvm_io_bus_write(vcpu, bus, &range, val);
4864        return r < 0 ? r : 0;
4865}
4866EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4867
4868/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
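    /*
     * The cookie is the bus->range[] index returned by a previous successful
     * write; when it still refers to a matching device, the bsearch in
     * __kvm_io_bus_write() is skipped.
     */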
4869int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4870                            gpa_t addr, int len, const void *val, long cookie)
4871{
4872        struct kvm_io_bus *bus;
4873        struct kvm_io_range range;
4874
4875        range = (struct kvm_io_range) {
4876                .addr = addr,
4877                .len = len,
4878        };
4879
4880        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4881        if (!bus)
4882                return -ENOMEM;
4883
4884        /* First try the device referenced by cookie. */
4885        if ((cookie >= 0) && (cookie < bus->dev_count) &&
4886            (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4887                if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4888                                        val))
4889                        return cookie;
4890
4891        /*
4892         * cookie contained garbage; fall back to search and return the
4893         * correct cookie value.
4894         */
4895        return __kvm_io_bus_write(vcpu, bus, &range, val);
4896}
4897
4898static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4899                             struct kvm_io_range *range, void *val)
4900{
4901        int idx;
4902
4903        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4904        if (idx < 0)
4905                return -EOPNOTSUPP;
4906
4907        while (idx < bus->dev_count &&
4908                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4909                if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4910                                       range->len, val))
4911                        return idx;
4912                idx++;
4913        }
4914
4915        return -EOPNOTSUPP;
4916}
4917
4918/* kvm_io_bus_read - called under kvm->slots_lock */
4919int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4920                    int len, void *val)
4921{
4922        struct kvm_io_bus *bus;
4923        struct kvm_io_range range;
4924        int r;
4925
4926        range = (struct kvm_io_range) {
4927                .addr = addr,
4928                .len = len,
4929        };
4930
4931        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4932        if (!bus)
4933                return -ENOMEM;
4934        r = __kvm_io_bus_read(vcpu, bus, &range, val);
4935        return r < 0 ? r : 0;
4936}
4937
4938/* Caller must hold slots_lock. */
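    /*
     * Buses are never modified in place: a new array with the device inserted
     * in sort order is allocated, published with rcu_assign_pointer(), and
     * SRCU readers are flushed before the old copy is freed.
     */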
4939int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4940                            int len, struct kvm_io_device *dev)
4941{
4942        int i;
4943        struct kvm_io_bus *new_bus, *bus;
4944        struct kvm_io_range range;
4945
4946        bus = kvm_get_bus(kvm, bus_idx);
4947        if (!bus)
4948                return -ENOMEM;
4949
4950        /* exclude ioeventfds, which are already bounded by the maximum number of fds */
4951        if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
4952                return -ENOSPC;
4953
4954        new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
4955                          GFP_KERNEL_ACCOUNT);
4956        if (!new_bus)
4957                return -ENOMEM;
4958
4959        range = (struct kvm_io_range) {
4960                .addr = addr,
4961                .len = len,
4962                .dev = dev,
4963        };
4964
4965        for (i = 0; i < bus->dev_count; i++)
4966                if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
4967                        break;
4968
4969        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
4970        new_bus->dev_count++;
4971        new_bus->range[i] = range;
4972        memcpy(new_bus->range + i + 1, bus->range + i,
4973                (bus->dev_count - i) * sizeof(struct kvm_io_range));
4974        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
4975        synchronize_srcu_expedited(&kvm->srcu);
4976        kfree(bus);
4977
4978        return 0;
4979}
4980
4981int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
4982                              struct kvm_io_device *dev)
4983{
4984        int i, j;
4985        struct kvm_io_bus *new_bus, *bus;
4986
4987        lockdep_assert_held(&kvm->slots_lock);
4988
4989        bus = kvm_get_bus(kvm, bus_idx);
4990        if (!bus)
4991                return 0;
4992
4993        for (i = 0; i < bus->dev_count; i++) {
4994                if (bus->range[i].dev == dev) {
4995                        break;
4996                }
4997        }
4998
4999        if (i == bus->dev_count)
5000                return 0;
5001
5002        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5003                          GFP_KERNEL_ACCOUNT);
5004        if (new_bus) {
5005                memcpy(new_bus, bus, struct_size(bus, range, i));
5006                new_bus->dev_count--;
5007                memcpy(new_bus->range + i, bus->range + i + 1,
5008                                flex_array_size(new_bus, range, new_bus->dev_count - i));
5009        }
5010
5011        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5012        synchronize_srcu_expedited(&kvm->srcu);
5013
5014        /* Destroy the old bus _after_ installing the (null) bus. */
5015        if (!new_bus) {
5016                pr_err("kvm: failed to shrink bus, removing it completely\n");
5017                for (j = 0; j < bus->dev_count; j++) {
5018                        if (j == i)
5019                                continue;
5020                        kvm_iodevice_destructor(bus->range[j].dev);
5021                }
5022        }
5023
5024        kfree(bus);
5025        return new_bus ? 0 : -ENOMEM;
5026}
5027
5028struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5029                                         gpa_t addr)
5030{
5031        struct kvm_io_bus *bus;
5032        int dev_idx, srcu_idx;
5033        struct kvm_io_device *iodev = NULL;
5034
5035        srcu_idx = srcu_read_lock(&kvm->srcu);
5036
5037        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5038        if (!bus)
5039                goto out_unlock;
5040
5041        dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5042        if (dev_idx < 0)
5043                goto out_unlock;
5044
5045        iodev = bus->range[dev_idx].dev;
5046
5047out_unlock:
5048        srcu_read_unlock(&kvm->srcu, srcu_idx);
5049
5050        return iodev;
5051}
5052EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5053
5054static int kvm_debugfs_open(struct inode *inode, struct file *file,
5055                           int (*get)(void *, u64 *), int (*set)(void *, u64),
5056                           const char *fmt)
5057{
5058        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5059                                          inode->i_private;
5060
5061        /*
5062         * The debugfs files hold a reference to the kvm struct, which is
5063         * still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe()
5064         * avoids the race between open and removal of the debugfs directory.
5065         */
5066        if (!kvm_get_kvm_safe(stat_data->kvm))
5067                return -ENOENT;
5068
5069        if (simple_attr_open(inode, file, get,
5070                    kvm_stats_debugfs_mode(stat_data->desc) & 0222
5071                    ? set : NULL,
5072                    fmt)) {
5073                kvm_put_kvm(stat_data->kvm);
5074                return -ENOMEM;
5075        }
5076
5077        return 0;
5078}
5079
5080static int kvm_debugfs_release(struct inode *inode, struct file *file)
5081{
5082        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5083                                          inode->i_private;
5084
5085        simple_attr_release(inode, file);
5086        kvm_put_kvm(stat_data->kvm);
5087
5088        return 0;
5089}
5090
5091static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5092{
5093        *val = *(u64 *)((void *)(&kvm->stat) + offset);
5094
5095        return 0;
5096}
5097
5098static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5099{
5100        *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5101
5102        return 0;
5103}
5104
5105static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5106{
5107        int i;
5108        struct kvm_vcpu *vcpu;
5109
5110        *val = 0;
5111
5112        kvm_for_each_vcpu(i, vcpu, kvm)
5113                *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5114
5115        return 0;
5116}
5117
5118static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5119{
5120        int i;
5121        struct kvm_vcpu *vcpu;
5122
5123        kvm_for_each_vcpu(i, vcpu, kvm)
5124                *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5125
5126        return 0;
5127}
5128
5129static int kvm_stat_data_get(void *data, u64 *val)
5130{
5131        int r = -EFAULT;
5132        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5133
5134        switch (stat_data->kind) {
5135        case KVM_STAT_VM:
5136                r = kvm_get_stat_per_vm(stat_data->kvm,
5137                                        stat_data->desc->desc.offset, val);
5138                break;
5139        case KVM_STAT_VCPU:
5140                r = kvm_get_stat_per_vcpu(stat_data->kvm,
5141                                          stat_data->desc->desc.offset, val);
5142                break;
5143        }
5144
5145        return r;
5146}
5147
5148static int kvm_stat_data_clear(void *data, u64 val)
5149{
5150        int r = -EFAULT;
5151        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5152
5153        if (val)
5154                return -EINVAL;
5155
5156        switch (stat_data->kind) {
5157        case KVM_STAT_VM:
5158                r = kvm_clear_stat_per_vm(stat_data->kvm,
5159                                          stat_data->desc->desc.offset);
5160                break;
5161        case KVM_STAT_VCPU:
5162                r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5163                                            stat_data->desc->desc.offset);
5164                break;
5165        }
5166
5167        return r;
5168}
5169
5170static int kvm_stat_data_open(struct inode *inode, struct file *file)
5171{
5172        __simple_attr_check_format("%llu\n", 0ull);
5173        return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5174                                kvm_stat_data_clear, "%llu\n");
5175}
5176
5177static const struct file_operations stat_fops_per_vm = {
5178        .owner = THIS_MODULE,
5179        .open = kvm_stat_data_open,
5180        .release = kvm_debugfs_release,
5181        .read = simple_attr_read,
5182        .write = simple_attr_write,
5183        .llseek = no_llseek,
5184};
5185
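    /*
     * Global statistics exposed directly under the debugfs "kvm" directory:
     * a read sums the corresponding per-VM (or per-vCPU) counter over every
     * VM on vm_list; writing 0 to a writable stat clears it for all VMs.
     */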
5186static int vm_stat_get(void *_offset, u64 *val)
5187{
5188        unsigned offset = (long)_offset;
5189        struct kvm *kvm;
5190        u64 tmp_val;
5191
5192        *val = 0;
5193        mutex_lock(&kvm_lock);
5194        list_for_each_entry(kvm, &vm_list, vm_list) {
5195                kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5196                *val += tmp_val;
5197        }
5198        mutex_unlock(&kvm_lock);
5199        return 0;
5200}
5201
5202static int vm_stat_clear(void *_offset, u64 val)
5203{
5204        unsigned offset = (long)_offset;
5205        struct kvm *kvm;
5206
5207        if (val)
5208                return -EINVAL;
5209
5210        mutex_lock(&kvm_lock);
5211        list_for_each_entry(kvm, &vm_list, vm_list) {
5212                kvm_clear_stat_per_vm(kvm, offset);
5213        }
5214        mutex_unlock(&kvm_lock);
5215
5216        return 0;
5217}
5218
5219DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5220DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5221
5222static int vcpu_stat_get(void *_offset, u64 *val)
5223{
5224        unsigned offset = (long)_offset;
5225        struct kvm *kvm;
5226        u64 tmp_val;
5227
5228        *val = 0;
5229        mutex_lock(&kvm_lock);
5230        list_for_each_entry(kvm, &vm_list, vm_list) {
5231                kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5232                *val += tmp_val;
5233        }
5234        mutex_unlock(&kvm_lock);
5235        return 0;
5236}
5237
5238static int vcpu_stat_clear(void *_offset, u64 val)
5239{
5240        unsigned offset = (long)_offset;
5241        struct kvm *kvm;
5242
5243        if (val)
5244                return -EINVAL;
5245
5246        mutex_lock(&kvm_lock);
5247        list_for_each_entry(kvm, &vm_list, vm_list) {
5248                kvm_clear_stat_per_vcpu(kvm, offset);
5249        }
5250        mutex_unlock(&kvm_lock);
5251
5252        return 0;
5253}
5254
5255DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5256                        "%llu\n");
5257DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5258
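    /*
     * Notify userspace of VM creation/destruction via a KOBJ_CHANGE uevent
     * on the kvm misc device.  The environment carries EVENT, CREATED (total
     * VMs created), COUNT (currently active VMs), PID and, when available,
     * the VM's debugfs STATS_PATH.
     */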
5259static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5260{
5261        struct kobj_uevent_env *env;
5262        unsigned long long created, active;
5263
5264        if (!kvm_dev.this_device || !kvm)
5265                return;
5266
5267        mutex_lock(&kvm_lock);
5268        if (type == KVM_EVENT_CREATE_VM) {
5269                kvm_createvm_count++;
5270                kvm_active_vms++;
5271        } else if (type == KVM_EVENT_DESTROY_VM) {
5272                kvm_active_vms--;
5273        }
5274        created = kvm_createvm_count;
5275        active = kvm_active_vms;
5276        mutex_unlock(&kvm_lock);
5277
5278        env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5279        if (!env)
5280                return;
5281
5282        add_uevent_var(env, "CREATED=%llu", created);
5283        add_uevent_var(env, "COUNT=%llu", active);
5284
5285        if (type == KVM_EVENT_CREATE_VM) {
5286                add_uevent_var(env, "EVENT=create");
5287                kvm->userspace_pid = task_pid_nr(current);
5288        } else if (type == KVM_EVENT_DESTROY_VM) {
5289                add_uevent_var(env, "EVENT=destroy");
5290        }
5291        add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5292
5293        if (kvm->debugfs_dentry) {
5294                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5295
5296                if (p) {
5297                        tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5298                        if (!IS_ERR(tmp))
5299                                add_uevent_var(env, "STATS_PATH=%s", tmp);
5300                        kfree(p);
5301                }
5302        }
5303        /* no need for checks, since we add at most 5 keys */
5304        env->envp[env->envp_idx++] = NULL;
5305        kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5306        kfree(env);
5307}
5308
5309static void kvm_init_debug(void)
5310{
5311        const struct file_operations *fops;
5312        const struct _kvm_stats_desc *pdesc;
5313        int i;
5314
5315        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5316
5317        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5318                pdesc = &kvm_vm_stats_desc[i];
5319                if (kvm_stats_debugfs_mode(pdesc) & 0222)
5320                        fops = &vm_stat_fops;
5321                else
5322                        fops = &vm_stat_readonly_fops;
5323                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5324                                kvm_debugfs_dir,
5325                                (void *)(long)pdesc->desc.offset, fops);
5326        }
5327
5328        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5329                pdesc = &kvm_vcpu_stats_desc[i];
5330                if (kvm_stats_debugfs_mode(pdesc) & 0222)
5331                        fops = &vcpu_stat_fops;
5332                else
5333                        fops = &vcpu_stat_readonly_fops;
5334                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5335                                kvm_debugfs_dir,
5336                                (void *)(long)pdesc->desc.offset, fops);
5337        }
5338}
5339
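    /*
     * Syscore ops run late in suspend and early in resume, with a single CPU
     * online and interrupts disabled, hence the _nolock variants and the
     * absence of locking here.
     */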
5340static int kvm_suspend(void)
5341{
5342        if (kvm_usage_count)
5343                hardware_disable_nolock(NULL);
5344        return 0;
5345}
5346
5347static void kvm_resume(void)
5348{
5349        if (kvm_usage_count) {
5350#ifdef CONFIG_LOCKDEP
5351                WARN_ON(lockdep_is_held(&kvm_count_lock));
5352#endif
5353                hardware_enable_nolock(NULL);
5354        }
5355}
5356
5357static struct syscore_ops kvm_syscore_ops = {
5358        .suspend = kvm_suspend,
5359        .resume = kvm_resume,
5360};
5361
5362static inline
5363struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5364{
5365        return container_of(pn, struct kvm_vcpu, preempt_notifier);
5366}
5367
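    /*
     * Preempt notifier hooks, wired up to kvm_preempt_ops in kvm_init():
     * they put/load the architecture vcpu state when the task running a vcpu
     * is scheduled out and back in, and keep kvm_running_vcpu up to date.
     */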
5368static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5369{
5370        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5371
5372        WRITE_ONCE(vcpu->preempted, false);
5373        WRITE_ONCE(vcpu->ready, false);
5374
5375        __this_cpu_write(kvm_running_vcpu, vcpu);
5376        kvm_arch_sched_in(vcpu, cpu);
5377        kvm_arch_vcpu_load(vcpu, cpu);
5378}
5379
5380static void kvm_sched_out(struct preempt_notifier *pn,
5381                          struct task_struct *next)
5382{
5383        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5384
5385        if (current->on_rq) {
5386                WRITE_ONCE(vcpu->preempted, true);
5387                WRITE_ONCE(vcpu->ready, true);
5388        }
5389        kvm_arch_vcpu_put(vcpu);
5390        __this_cpu_write(kvm_running_vcpu, NULL);
5391}
5392
5393/**
5394 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5395 *
5396 * It is safe to disable preemption only around the per-CPU read and to use
5397 * the resolved vcpu pointer after re-enabling preemption: even if the
5398 * current thread migrates to another CPU, the preempt notifier handlers
5399 * update the per-CPU variable on every context switch, so a later read
5400 * returns the same vcpu.
5401 */
5402struct kvm_vcpu *kvm_get_running_vcpu(void)
5403{
5404        struct kvm_vcpu *vcpu;
5405
5406        preempt_disable();
5407        vcpu = __this_cpu_read(kvm_running_vcpu);
5408        preempt_enable();
5409
5410        return vcpu;
5411}
5412EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5413
5414/**
5415 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5416 */
5417struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5418{
5419        return &kvm_running_vcpu;
5420}
5421
5422struct kvm_cpu_compat_check {
5423        void *opaque;
5424        int *ret;
5425};
5426
5427static void check_processor_compat(void *data)
5428{
5429        struct kvm_cpu_compat_check *c = data;
5430
5431        *c->ret = kvm_arch_check_processor_compat(c->opaque);
5432}
5433
5434int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5435                  struct module *module)
5436{
5437        struct kvm_cpu_compat_check c;
5438        int r;
5439        int cpu;
5440
5441        r = kvm_arch_init(opaque);
5442        if (r)
5443                goto out_fail;
5444
5445        /*
5446         * kvm_arch_init makes sure there's at most one caller
5447         * for architectures that support multiple implementations,
5448         * like Intel and AMD on x86.
5449         * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5450         * conflicts in case kvm is already set up for another implementation.
5451         */
5452        r = kvm_irqfd_init();
5453        if (r)
5454                goto out_irqfd;
5455
5456        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5457                r = -ENOMEM;
5458                goto out_free_0;
5459        }
5460
5461        r = kvm_arch_hardware_setup(opaque);
5462        if (r < 0)
5463                goto out_free_1;
5464
5465        c.ret = &r;
5466        c.opaque = opaque;
5467        for_each_online_cpu(cpu) {
5468                smp_call_function_single(cpu, check_processor_compat, &c, 1);
5469                if (r < 0)
5470                        goto out_free_2;
5471        }
5472
5473        r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5474                                      kvm_starting_cpu, kvm_dying_cpu);
5475        if (r)
5476                goto out_free_2;
5477        register_reboot_notifier(&kvm_reboot_notifier);
5478
5479        /* A kmem cache lets us meet the alignment requirements of fx_save. */
5480        if (!vcpu_align)
5481                vcpu_align = __alignof__(struct kvm_vcpu);
5482        kvm_vcpu_cache =
5483                kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5484                                           SLAB_ACCOUNT,
5485                                           offsetof(struct kvm_vcpu, arch),
5486                                           offsetofend(struct kvm_vcpu, stats_id)
5487                                           - offsetof(struct kvm_vcpu, arch),
5488                                           NULL);
5489        if (!kvm_vcpu_cache) {
5490                r = -ENOMEM;
5491                goto out_free_3;
5492        }
5493
5494        for_each_possible_cpu(cpu) {
5495                if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
5496                                            GFP_KERNEL, cpu_to_node(cpu))) {
5497                        r = -ENOMEM;
5498                        goto out_free_4;
5499                }
5500        }
5501
5502        r = kvm_async_pf_init();
5503        if (r)
5504                goto out_free_5;
5505
5506        kvm_chardev_ops.owner = module;
5507        kvm_vm_fops.owner = module;
5508        kvm_vcpu_fops.owner = module;
5509
5510        r = misc_register(&kvm_dev);
5511        if (r) {
5512                pr_err("kvm: misc device register failed\n");
5513                goto out_unreg;
5514        }
5515
5516        register_syscore_ops(&kvm_syscore_ops);
5517
5518        kvm_preempt_ops.sched_in = kvm_sched_in;
5519        kvm_preempt_ops.sched_out = kvm_sched_out;
5520
5521        kvm_init_debug();
5522
5523        r = kvm_vfio_ops_init();
5524        WARN_ON(r);
5525
5526        return 0;
5527
5528out_unreg:
5529        kvm_async_pf_deinit();
5530out_free_5:
5531        for_each_possible_cpu(cpu)
5532                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5533out_free_4:
5534        kmem_cache_destroy(kvm_vcpu_cache);
5535out_free_3:
5536        unregister_reboot_notifier(&kvm_reboot_notifier);
5537        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5538out_free_2:
5539        kvm_arch_hardware_unsetup();
5540out_free_1:
5541        free_cpumask_var(cpus_hardware_enabled);
5542out_free_0:
5543        kvm_irqfd_exit();
5544out_irqfd:
5545        kvm_arch_exit();
5546out_fail:
5547        return r;
5548}
5549EXPORT_SYMBOL_GPL(kvm_init);
5550
5551void kvm_exit(void)
5552{
5553        int cpu;
5554
5555        debugfs_remove_recursive(kvm_debugfs_dir);
5556        misc_deregister(&kvm_dev);
5557        for_each_possible_cpu(cpu)
5558                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
5559        kmem_cache_destroy(kvm_vcpu_cache);
5560        kvm_async_pf_deinit();
5561        unregister_syscore_ops(&kvm_syscore_ops);
5562        unregister_reboot_notifier(&kvm_reboot_notifier);
5563        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5564        on_each_cpu(hardware_disable_nolock, NULL, 1);
5565        kvm_arch_hardware_unsetup();
5566        kvm_arch_exit();
5567        kvm_irqfd_exit();
5568        free_cpumask_var(cpus_hardware_enabled);
5569        kvm_vfio_ops_exit();
5570}
5571EXPORT_SYMBOL_GPL(kvm_exit);
5572
5573struct kvm_vm_worker_thread_context {
5574        struct kvm *kvm;
5575        struct task_struct *parent;
5576        struct completion init_done;
5577        kvm_vm_thread_fn_t thread_fn;
5578        uintptr_t data;
5579        int err;
5580};
5581
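    /*
     * Start-up handshake with kvm_vm_create_worker_thread(): the worker marks
     * itself to be parked, attaches to the parent's cgroups, inherits the
     * parent's nice level, reports the result through init_done, then parks
     * until the spawner unparks it, at which point thread_fn runs.
     */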
5582static int kvm_vm_worker_thread(void *context)
5583{
5584        /*
5585         * The init_context is allocated on the stack of the parent thread, so
5586         * we must make a local copy of anything needed beyond initialization.
5587         */
5588        struct kvm_vm_worker_thread_context *init_context = context;
5589        struct kvm *kvm = init_context->kvm;
5590        kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5591        uintptr_t data = init_context->data;
5592        int err;
5593
5594        err = kthread_park(current);
5595        /* kthread_park(current) is never supposed to return an error */
5596        WARN_ON(err != 0);
5597        if (err)
5598                goto init_complete;
5599
5600        err = cgroup_attach_task_all(init_context->parent, current);
5601        if (err) {
5602                kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5603                        __func__, err);
5604                goto init_complete;
5605        }
5606
5607        set_user_nice(current, task_nice(init_context->parent));
5608
5609init_complete:
5610        init_context->err = err;
5611        complete(&init_context->init_done);
5612        init_context = NULL;
5613
5614        if (err)
5615                return err;
5616
5617        /* Wait to be woken up by the spawner before proceeding. */
5618        kthread_parkme();
5619
5620        if (!kthread_should_stop())
5621                err = thread_fn(kvm, data);
5622
5623        return err;
5624}
5625
5626int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5627                                uintptr_t data, const char *name,
5628                                struct task_struct **thread_ptr)
5629{
5630        struct kvm_vm_worker_thread_context init_context = {};
5631        struct task_struct *thread;
5632
5633        *thread_ptr = NULL;
5634        init_context.kvm = kvm;
5635        init_context.parent = current;
5636        init_context.thread_fn = thread_fn;
5637        init_context.data = data;
5638        init_completion(&init_context.init_done);
5639
5640        thread = kthread_run(kvm_vm_worker_thread, &init_context,
5641                             "%s-%d", name, task_pid_nr(current));
5642        if (IS_ERR(thread))
5643                return PTR_ERR(thread);
5644
5645        /* kthread_run is never supposed to return NULL */
5646        WARN_ON(thread == NULL);
5647
5648        wait_for_completion(&init_context.init_done);
5649
5650        if (!init_context.err)
5651                *thread_ptr = thread;
5652
5653        return init_context.err;
5654}
5655