linux/virt/kvm/kvm_main.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Kernel-based Virtual Machine driver for Linux
   4 *
   5 * This module enables machines with Intel VT-x extensions to run virtual
   6 * machines without emulation or binary translation.
   7 *
   8 * Copyright (C) 2006 Qumranet, Inc.
   9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10 *
  11 * Authors:
  12 *   Avi Kivity   <avi@qumranet.com>
  13 *   Yaniv Kamay  <yaniv@qumranet.com>
  14 */
  15
  16#include <kvm/iodev.h>
  17
  18#include <linux/kvm_host.h>
  19#include <linux/kvm.h>
  20#include <linux/module.h>
  21#include <linux/errno.h>
  22#include <linux/percpu.h>
  23#include <linux/mm.h>
  24#include <linux/miscdevice.h>
  25#include <linux/vmalloc.h>
  26#include <linux/reboot.h>
  27#include <linux/debugfs.h>
  28#include <linux/highmem.h>
  29#include <linux/file.h>
  30#include <linux/syscore_ops.h>
  31#include <linux/cpu.h>
  32#include <linux/sched/signal.h>
  33#include <linux/sched/mm.h>
  34#include <linux/sched/stat.h>
  35#include <linux/cpumask.h>
  36#include <linux/smp.h>
  37#include <linux/anon_inodes.h>
  38#include <linux/profile.h>
  39#include <linux/kvm_para.h>
  40#include <linux/pagemap.h>
  41#include <linux/mman.h>
  42#include <linux/swap.h>
  43#include <linux/bitops.h>
  44#include <linux/spinlock.h>
  45#include <linux/compat.h>
  46#include <linux/srcu.h>
  47#include <linux/hugetlb.h>
  48#include <linux/slab.h>
  49#include <linux/sort.h>
  50#include <linux/bsearch.h>
  51#include <linux/io.h>
  52#include <linux/lockdep.h>
  53#include <linux/kthread.h>
  54#include <linux/suspend.h>
  55
  56#include <asm/processor.h>
  57#include <asm/ioctl.h>
  58#include <linux/uaccess.h>
  59
  60#include "coalesced_mmio.h"
  61#include "async_pf.h"
  62#include "mmu_lock.h"
  63#include "vfio.h"
  64
  65#define CREATE_TRACE_POINTS
  66#include <trace/events/kvm.h>
  67
  68#include <linux/kvm_dirty_ring.h>
  69
  70/* Worst case buffer size needed for holding an integer. */
  71#define ITOA_MAX_LEN 12
  72
  73MODULE_AUTHOR("Qumranet");
  74MODULE_LICENSE("GPL");
  75
  76/* Architectures should define their poll value according to the halt latency */
  77unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
  78module_param(halt_poll_ns, uint, 0644);
  79EXPORT_SYMBOL_GPL(halt_poll_ns);
  80
  81/* Default doubles per-vcpu halt_poll_ns. */
  82unsigned int halt_poll_ns_grow = 2;
  83module_param(halt_poll_ns_grow, uint, 0644);
  84EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
  85
  86/* The start value to grow halt_poll_ns from */
  87unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
  88module_param(halt_poll_ns_grow_start, uint, 0644);
  89EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
  90
  91/* Default resets per-vcpu halt_poll_ns . */
  92unsigned int halt_poll_ns_shrink;
  93module_param(halt_poll_ns_shrink, uint, 0644);
  94EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  95
  96/*
  97 * Ordering of locks:
  98 *
  99 *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 100 */
 101
 102DEFINE_MUTEX(kvm_lock);
 103static DEFINE_RAW_SPINLOCK(kvm_count_lock);
 104LIST_HEAD(vm_list);
 105
 106static cpumask_var_t cpus_hardware_enabled;
 107static int kvm_usage_count;
 108static atomic_t hardware_enable_failed;
 109
 110static struct kmem_cache *kvm_vcpu_cache;
 111
 112static __read_mostly struct preempt_ops kvm_preempt_ops;
 113static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 114
 115struct dentry *kvm_debugfs_dir;
 116EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 117
 118static const struct file_operations stat_fops_per_vm;
 119
 120static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 121                           unsigned long arg);
 122#ifdef CONFIG_KVM_COMPAT
 123static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
 124                                  unsigned long arg);
 125#define KVM_COMPAT(c)   .compat_ioctl   = (c)
 126#else
 127/*
 128 * For architectures that don't implement a compat infrastructure,
 129 * adopt a double line of defense:
 130 * - Prevent a compat task from opening /dev/kvm
 131 * - If the open has been done by a 64bit task, and the KVM fd
 132 *   passed to a compat task, let the ioctls fail.
 133 */
 134static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
 135                                unsigned long arg) { return -EINVAL; }
 136
 137static int kvm_no_compat_open(struct inode *inode, struct file *file)
 138{
 139        return is_compat_task() ? -ENODEV : 0;
 140}
 141#define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
 142                        .open           = kvm_no_compat_open
 143#endif
 144static int hardware_enable_all(void);
 145static void hardware_disable_all(void);
 146
 147static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 148
 149__visible bool kvm_rebooting;
 150EXPORT_SYMBOL_GPL(kvm_rebooting);
 151
 152#define KVM_EVENT_CREATE_VM 0
 153#define KVM_EVENT_DESTROY_VM 1
 154static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 155static unsigned long long kvm_createvm_count;
 156static unsigned long long kvm_active_vms;
 157
 158__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 159                                                   unsigned long start, unsigned long end)
 160{
 161}
 162
 163bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
 164{
 165        /*
 166         * The metadata used by is_zone_device_page() to determine whether or
 167         * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
 168         * the device has been pinned, e.g. by get_user_pages().  WARN if the
 169         * page_count() is zero to help detect bad usage of this helper.
 170         */
 171        if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
 172                return false;
 173
 174        return is_zone_device_page(pfn_to_page(pfn));
 175}
 176
 177bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 178{
 179        /*
 180         * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
 181         * perspective they are "normal" pages, albeit with slightly different
 182         * usage rules.
 183         */
 184        if (pfn_valid(pfn))
 185                return PageReserved(pfn_to_page(pfn)) &&
 186                       !is_zero_pfn(pfn) &&
 187                       !kvm_is_zone_device_pfn(pfn);
 188
 189        return true;
 190}
 191
 192/*
 193 * Switches to specified vcpu, until a matching vcpu_put()
 194 */
 195void vcpu_load(struct kvm_vcpu *vcpu)
 196{
 197        int cpu = get_cpu();
 198
 199        __this_cpu_write(kvm_running_vcpu, vcpu);
 200        preempt_notifier_register(&vcpu->preempt_notifier);
 201        kvm_arch_vcpu_load(vcpu, cpu);
 202        put_cpu();
 203}
 204EXPORT_SYMBOL_GPL(vcpu_load);
 205
 206void vcpu_put(struct kvm_vcpu *vcpu)
 207{
 208        preempt_disable();
 209        kvm_arch_vcpu_put(vcpu);
 210        preempt_notifier_unregister(&vcpu->preempt_notifier);
 211        __this_cpu_write(kvm_running_vcpu, NULL);
 212        preempt_enable();
 213}
 214EXPORT_SYMBOL_GPL(vcpu_put);
 215
 216/* TODO: merge with kvm_arch_vcpu_should_kick */
 217static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 218{
 219        int mode = kvm_vcpu_exiting_guest_mode(vcpu);
 220
 221        /*
 222         * We need to wait for the VCPU to reenable interrupts and get out of
 223         * READING_SHADOW_PAGE_TABLES mode.
 224         */
 225        if (req & KVM_REQUEST_WAIT)
 226                return mode != OUTSIDE_GUEST_MODE;
 227
 228        /*
 229         * Need to kick a running VCPU, but otherwise there is nothing to do.
 230         */
 231        return mode == IN_GUEST_MODE;
 232}
 233
 234static void ack_flush(void *_completed)
 235{
 236}
 237
 238static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
 239{
 240        const struct cpumask *cpus;
 241
 242        if (likely(cpumask_available(tmp)))
 243                cpus = tmp;
 244        else
 245                cpus = cpu_online_mask;
 246
 247        if (cpumask_empty(cpus))
 248                return false;
 249
 250        smp_call_function_many(cpus, ack_flush, NULL, wait);
 251        return true;
 252}
 253
 254bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 255                                 struct kvm_vcpu *except,
 256                                 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
 257{
 258        int i, cpu, me;
 259        struct kvm_vcpu *vcpu;
 260        bool called;
 261
 262        me = get_cpu();
 263
 264        kvm_for_each_vcpu(i, vcpu, kvm) {
 265                if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
 266                    vcpu == except)
 267                        continue;
 268
 269                kvm_make_request(req, vcpu);
 270
 271                if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
 272                        continue;
 273
 274                /*
 275                 * tmp can be "unavailable" if cpumasks are allocated off stack
 276                 * as allocation of the mask is deliberately not fatal and is
 277                 * handled by falling back to kicking all online CPUs.
 278                 */
 279                if (!cpumask_available(tmp))
 280                        continue;
 281
 282                /*
 283                 * Note, the vCPU could get migrated to a different pCPU at any
 284                 * point after kvm_request_needs_ipi(), which could result in
 285                 * sending an IPI to the previous pCPU.  But, that's ok because
 286                 * the purpose of the IPI is to ensure the vCPU returns to
 287                 * OUTSIDE_GUEST_MODE, which is satisfied if the vCPU migrates.
 288                 * Entering READING_SHADOW_PAGE_TABLES after this point is also
 289                 * ok, as the requirement is only that KVM wait for vCPUs that
 290                 * were reading SPTEs _before_ any changes were finalized.  See
 291                 * kvm_vcpu_kick() for more details on handling requests.
 292                 */
 293                if (kvm_request_needs_ipi(vcpu, req)) {
 294                        cpu = READ_ONCE(vcpu->cpu);
 295                        if (cpu != -1 && cpu != me)
 296                                __cpumask_set_cpu(cpu, tmp);
 297                }
 298        }
 299
 300        called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
 301        put_cpu();
 302
 303        return called;
 304}
 305
 306bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 307                                      struct kvm_vcpu *except)
 308{
 309        cpumask_var_t cpus;
 310        bool called;
 311
 312        zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 313
 314        called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
 315
 316        free_cpumask_var(cpus);
 317        return called;
 318}
 319
 320bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 321{
 322        return kvm_make_all_cpus_request_except(kvm, req, NULL);
 323}
 324EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 325
 326#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 327void kvm_flush_remote_tlbs(struct kvm *kvm)
 328{
 329        ++kvm->stat.generic.remote_tlb_flush_requests;
 330
 331        /*
 332         * We want to publish modifications to the page tables before reading
 333         * mode. Pairs with a memory barrier in arch-specific code.
 334         * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
 335         * and smp_mb in walk_shadow_page_lockless_begin/end.
 336         * - powerpc: smp_mb in kvmppc_prepare_to_enter.
 337         *
 338         * There is already an smp_mb__after_atomic() before
 339         * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 340         * barrier here.
 341         */
 342        if (!kvm_arch_flush_remote_tlb(kvm)
 343            || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 344                ++kvm->stat.generic.remote_tlb_flush;
 345}
 346EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 347#endif
 348
 349void kvm_reload_remote_mmus(struct kvm *kvm)
 350{
 351        kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 352}
 353
 354#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 355static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 356                                               gfp_t gfp_flags)
 357{
 358        gfp_flags |= mc->gfp_zero;
 359
 360        if (mc->kmem_cache)
 361                return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
 362        else
 363                return (void *)__get_free_page(gfp_flags);
 364}
 365
 366int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 367{
 368        void *obj;
 369
 370        if (mc->nobjs >= min)
 371                return 0;
 372        while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
 373                obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
 374                if (!obj)
 375                        return mc->nobjs >= min ? 0 : -ENOMEM;
 376                mc->objects[mc->nobjs++] = obj;
 377        }
 378        return 0;
 379}
 380
 381int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 382{
 383        return mc->nobjs;
 384}
 385
 386void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 387{
 388        while (mc->nobjs) {
 389                if (mc->kmem_cache)
 390                        kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
 391                else
 392                        free_page((unsigned long)mc->objects[--mc->nobjs]);
 393        }
 394}
 395
 396void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 397{
 398        void *p;
 399
 400        if (WARN_ON(!mc->nobjs))
 401                p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
 402        else
 403                p = mc->objects[--mc->nobjs];
 404        BUG_ON(!p);
 405        return p;
 406}
 407#endif
 408
 409static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 410{
 411        mutex_init(&vcpu->mutex);
 412        vcpu->cpu = -1;
 413        vcpu->kvm = kvm;
 414        vcpu->vcpu_id = id;
 415        vcpu->pid = NULL;
 416        rcuwait_init(&vcpu->wait);
 417        kvm_async_pf_vcpu_init(vcpu);
 418
 419        vcpu->pre_pcpu = -1;
 420        INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
 421
 422        kvm_vcpu_set_in_spin_loop(vcpu, false);
 423        kvm_vcpu_set_dy_eligible(vcpu, false);
 424        vcpu->preempted = false;
 425        vcpu->ready = false;
 426        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 427        vcpu->last_used_slot = 0;
 428}
 429
 430void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 431{
 432        kvm_dirty_ring_free(&vcpu->dirty_ring);
 433        kvm_arch_vcpu_destroy(vcpu);
 434
 435        /*
 436         * No need for rcu_read_lock as VCPU_RUN is the only place that changes
 437         * the vcpu->pid pointer, and at destruction time all file descriptors
 438         * are already gone.
 439         */
 440        put_pid(rcu_dereference_protected(vcpu->pid, 1));
 441
 442        free_page((unsigned long)vcpu->run);
 443        kmem_cache_free(kvm_vcpu_cache, vcpu);
 444}
 445EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
 446
 447#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 448static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 449{
 450        return container_of(mn, struct kvm, mmu_notifier);
 451}
 452
 453static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 454                                              struct mm_struct *mm,
 455                                              unsigned long start, unsigned long end)
 456{
 457        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 458        int idx;
 459
 460        idx = srcu_read_lock(&kvm->srcu);
 461        kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
 462        srcu_read_unlock(&kvm->srcu, idx);
 463}
 464
 465typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
 466
 467typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
 468                             unsigned long end);
 469
 470struct kvm_hva_range {
 471        unsigned long start;
 472        unsigned long end;
 473        pte_t pte;
 474        hva_handler_t handler;
 475        on_lock_fn_t on_lock;
 476        bool flush_on_ret;
 477        bool may_block;
 478};
 479
 480/*
 481 * Use a dedicated stub instead of NULL to indicate that there is no callback
 482 * function/handler.  The compiler technically can't guarantee that a real
 483 * function will have a non-zero address, and so it will generate code to
 484 * check for !NULL, whereas comparing against a stub will be elided at compile
 485 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 486 */
 487static void kvm_null_fn(void)
 488{
 489
 490}
 491#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
 492
 493static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 494                                                  const struct kvm_hva_range *range)
 495{
 496        bool ret = false, locked = false;
 497        struct kvm_gfn_range gfn_range;
 498        struct kvm_memory_slot *slot;
 499        struct kvm_memslots *slots;
 500        int i, idx;
 501
 502        /* A null handler is allowed if and only if on_lock() is provided. */
 503        if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
 504                         IS_KVM_NULL_FN(range->handler)))
 505                return 0;
 506
 507        idx = srcu_read_lock(&kvm->srcu);
 508
 509        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 510                slots = __kvm_memslots(kvm, i);
 511                kvm_for_each_memslot(slot, slots) {
 512                        unsigned long hva_start, hva_end;
 513
 514                        hva_start = max(range->start, slot->userspace_addr);
 515                        hva_end = min(range->end, slot->userspace_addr +
 516                                                  (slot->npages << PAGE_SHIFT));
 517                        if (hva_start >= hva_end)
 518                                continue;
 519
 520                        /*
 521                         * To optimize for the likely case where the address
 522                         * range is covered by zero or one memslots, don't
 523                         * bother making these conditional (to avoid writes on
 524                         * the second or later invocation of the handler).
 525                         */
 526                        gfn_range.pte = range->pte;
 527                        gfn_range.may_block = range->may_block;
 528
 529                        /*
 530                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
 531                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 532                         */
 533                        gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
 534                        gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
 535                        gfn_range.slot = slot;
 536
 537                        if (!locked) {
 538                                locked = true;
 539                                KVM_MMU_LOCK(kvm);
 540                                if (!IS_KVM_NULL_FN(range->on_lock))
 541                                        range->on_lock(kvm, range->start, range->end);
 542                                if (IS_KVM_NULL_FN(range->handler))
 543                                        break;
 544                        }
 545                        ret |= range->handler(kvm, &gfn_range);
 546                }
 547        }
 548
 549        if (range->flush_on_ret && ret)
 550                kvm_flush_remote_tlbs(kvm);
 551
 552        if (locked)
 553                KVM_MMU_UNLOCK(kvm);
 554
 555        srcu_read_unlock(&kvm->srcu, idx);
 556
 557        /* The notifiers are averse to booleans. :-( */
 558        return (int)ret;
 559}
 560
 561static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 562                                                unsigned long start,
 563                                                unsigned long end,
 564                                                pte_t pte,
 565                                                hva_handler_t handler)
 566{
 567        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 568        const struct kvm_hva_range range = {
 569                .start          = start,
 570                .end            = end,
 571                .pte            = pte,
 572                .handler        = handler,
 573                .on_lock        = (void *)kvm_null_fn,
 574                .flush_on_ret   = true,
 575                .may_block      = false,
 576        };
 577
 578        return __kvm_handle_hva_range(kvm, &range);
 579}
 580
 581static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
 582                                                         unsigned long start,
 583                                                         unsigned long end,
 584                                                         hva_handler_t handler)
 585{
 586        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 587        const struct kvm_hva_range range = {
 588                .start          = start,
 589                .end            = end,
 590                .pte            = __pte(0),
 591                .handler        = handler,
 592                .on_lock        = (void *)kvm_null_fn,
 593                .flush_on_ret   = false,
 594                .may_block      = false,
 595        };
 596
 597        return __kvm_handle_hva_range(kvm, &range);
 598}
 599static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
 600                                        struct mm_struct *mm,
 601                                        unsigned long address,
 602                                        pte_t pte)
 603{
 604        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 605
 606        trace_kvm_set_spte_hva(address);
 607
 608        /*
 609         * .change_pte() must be surrounded by .invalidate_range_{start,end}().
 610         * If mmu_notifier_count is zero, then no in-progress invalidations,
 611         * including this one, found a relevant memslot at start(); rechecking
 612         * memslots here is unnecessary.  Note, a false positive (count elevated
 613         * by a different invalidation) is sub-optimal but functionally ok.
 614         */
 615        WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
 616        if (!READ_ONCE(kvm->mmu_notifier_count))
 617                return;
 618
 619        kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 620}
 621
 622void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
 623                                   unsigned long end)
 624{
 625        /*
 626         * The count increase must become visible at unlock time as no
 627         * spte can be established without taking the mmu_lock and
 628         * count is also read inside the mmu_lock critical section.
 629         */
 630        kvm->mmu_notifier_count++;
 631        if (likely(kvm->mmu_notifier_count == 1)) {
 632                kvm->mmu_notifier_range_start = start;
 633                kvm->mmu_notifier_range_end = end;
 634        } else {
 635                /*
 636                 * Fully tracking multiple concurrent ranges has dimishing
 637                 * returns. Keep things simple and just find the minimal range
 638                 * which includes the current and new ranges. As there won't be
 639                 * enough information to subtract a range after its invalidate
 640                 * completes, any ranges invalidated concurrently will
 641                 * accumulate and persist until all outstanding invalidates
 642                 * complete.
 643                 */
 644                kvm->mmu_notifier_range_start =
 645                        min(kvm->mmu_notifier_range_start, start);
 646                kvm->mmu_notifier_range_end =
 647                        max(kvm->mmu_notifier_range_end, end);
 648        }
 649}
 650
 651static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 652                                        const struct mmu_notifier_range *range)
 653{
 654        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 655        const struct kvm_hva_range hva_range = {
 656                .start          = range->start,
 657                .end            = range->end,
 658                .pte            = __pte(0),
 659                .handler        = kvm_unmap_gfn_range,
 660                .on_lock        = kvm_inc_notifier_count,
 661                .flush_on_ret   = true,
 662                .may_block      = mmu_notifier_range_blockable(range),
 663        };
 664
 665        trace_kvm_unmap_hva_range(range->start, range->end);
 666
 667        /*
 668         * Prevent memslot modification between range_start() and range_end()
 669         * so that conditionally locking provides the same result in both
 670         * functions.  Without that guarantee, the mmu_notifier_count
 671         * adjustments will be imbalanced.
 672         *
 673         * Pairs with the decrement in range_end().
 674         */
 675        spin_lock(&kvm->mn_invalidate_lock);
 676        kvm->mn_active_invalidate_count++;
 677        spin_unlock(&kvm->mn_invalidate_lock);
 678
 679        __kvm_handle_hva_range(kvm, &hva_range);
 680
 681        return 0;
 682}
 683
 684void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
 685                                   unsigned long end)
 686{
 687        /*
 688         * This sequence increase will notify the kvm page fault that
 689         * the page that is going to be mapped in the spte could have
 690         * been freed.
 691         */
 692        kvm->mmu_notifier_seq++;
 693        smp_wmb();
 694        /*
 695         * The above sequence increase must be visible before the
 696         * below count decrease, which is ensured by the smp_wmb above
 697         * in conjunction with the smp_rmb in mmu_notifier_retry().
 698         */
 699        kvm->mmu_notifier_count--;
 700}
 701
 702static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 703                                        const struct mmu_notifier_range *range)
 704{
 705        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 706        const struct kvm_hva_range hva_range = {
 707                .start          = range->start,
 708                .end            = range->end,
 709                .pte            = __pte(0),
 710                .handler        = (void *)kvm_null_fn,
 711                .on_lock        = kvm_dec_notifier_count,
 712                .flush_on_ret   = false,
 713                .may_block      = mmu_notifier_range_blockable(range),
 714        };
 715        bool wake;
 716
 717        __kvm_handle_hva_range(kvm, &hva_range);
 718
 719        /* Pairs with the increment in range_start(). */
 720        spin_lock(&kvm->mn_invalidate_lock);
 721        wake = (--kvm->mn_active_invalidate_count == 0);
 722        spin_unlock(&kvm->mn_invalidate_lock);
 723
 724        /*
 725         * There can only be one waiter, since the wait happens under
 726         * slots_lock.
 727         */
 728        if (wake)
 729                rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
 730
 731        BUG_ON(kvm->mmu_notifier_count < 0);
 732}
 733
 734static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 735                                              struct mm_struct *mm,
 736                                              unsigned long start,
 737                                              unsigned long end)
 738{
 739        trace_kvm_age_hva(start, end);
 740
 741        return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
 742}
 743
 744static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 745                                        struct mm_struct *mm,
 746                                        unsigned long start,
 747                                        unsigned long end)
 748{
 749        trace_kvm_age_hva(start, end);
 750
 751        /*
 752         * Even though we do not flush TLB, this will still adversely
 753         * affect performance on pre-Haswell Intel EPT, where there is
 754         * no EPT Access Bit to clear so that we have to tear down EPT
 755         * tables instead. If we find this unacceptable, we can always
 756         * add a parameter to kvm_age_hva so that it effectively doesn't
 757         * do anything on clear_young.
 758         *
 759         * Also note that currently we never issue secondary TLB flushes
 760         * from clear_young, leaving this job up to the regular system
 761         * cadence. If we find this inaccurate, we might come up with a
 762         * more sophisticated heuristic later.
 763         */
 764        return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 765}
 766
 767static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 768                                       struct mm_struct *mm,
 769                                       unsigned long address)
 770{
 771        trace_kvm_test_age_hva(address);
 772
 773        return kvm_handle_hva_range_no_flush(mn, address, address + 1,
 774                                             kvm_test_age_gfn);
 775}
 776
 777static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 778                                     struct mm_struct *mm)
 779{
 780        struct kvm *kvm = mmu_notifier_to_kvm(mn);
 781        int idx;
 782
 783        idx = srcu_read_lock(&kvm->srcu);
 784        kvm_arch_flush_shadow_all(kvm);
 785        srcu_read_unlock(&kvm->srcu, idx);
 786}
 787
 788static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 789        .invalidate_range       = kvm_mmu_notifier_invalidate_range,
 790        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
 791        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
 792        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
 793        .clear_young            = kvm_mmu_notifier_clear_young,
 794        .test_young             = kvm_mmu_notifier_test_young,
 795        .change_pte             = kvm_mmu_notifier_change_pte,
 796        .release                = kvm_mmu_notifier_release,
 797};
 798
 799static int kvm_init_mmu_notifier(struct kvm *kvm)
 800{
 801        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
 802        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 803}
 804
 805#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 806
 807static int kvm_init_mmu_notifier(struct kvm *kvm)
 808{
 809        return 0;
 810}
 811
 812#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 813
 814#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 815static int kvm_pm_notifier_call(struct notifier_block *bl,
 816                                unsigned long state,
 817                                void *unused)
 818{
 819        struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 820
 821        return kvm_arch_pm_notifier(kvm, state);
 822}
 823
 824static void kvm_init_pm_notifier(struct kvm *kvm)
 825{
 826        kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
 827        /* Suspend KVM before we suspend ftrace, RCU, etc. */
 828        kvm->pm_notifier.priority = INT_MAX;
 829        register_pm_notifier(&kvm->pm_notifier);
 830}
 831
 832static void kvm_destroy_pm_notifier(struct kvm *kvm)
 833{
 834        unregister_pm_notifier(&kvm->pm_notifier);
 835}
 836#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
 837static void kvm_init_pm_notifier(struct kvm *kvm)
 838{
 839}
 840
 841static void kvm_destroy_pm_notifier(struct kvm *kvm)
 842{
 843}
 844#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 845
 846static struct kvm_memslots *kvm_alloc_memslots(void)
 847{
 848        int i;
 849        struct kvm_memslots *slots;
 850
 851        slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
 852        if (!slots)
 853                return NULL;
 854
 855        for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
 856                slots->id_to_index[i] = -1;
 857
 858        return slots;
 859}
 860
 861static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 862{
 863        if (!memslot->dirty_bitmap)
 864                return;
 865
 866        kvfree(memslot->dirty_bitmap);
 867        memslot->dirty_bitmap = NULL;
 868}
 869
 870static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 871{
 872        kvm_destroy_dirty_bitmap(slot);
 873
 874        kvm_arch_free_memslot(kvm, slot);
 875
 876        slot->flags = 0;
 877        slot->npages = 0;
 878}
 879
 880static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 881{
 882        struct kvm_memory_slot *memslot;
 883
 884        if (!slots)
 885                return;
 886
 887        kvm_for_each_memslot(memslot, slots)
 888                kvm_free_memslot(kvm, memslot);
 889
 890        kvfree(slots);
 891}
 892
 893static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
 894{
 895        switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
 896        case KVM_STATS_TYPE_INSTANT:
 897                return 0444;
 898        case KVM_STATS_TYPE_CUMULATIVE:
 899        case KVM_STATS_TYPE_PEAK:
 900        default:
 901                return 0644;
 902        }
 903}
 904
 905
 906static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 907{
 908        int i;
 909        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
 910                                      kvm_vcpu_stats_header.num_desc;
 911
 912        if (!kvm->debugfs_dentry)
 913                return;
 914
 915        debugfs_remove_recursive(kvm->debugfs_dentry);
 916
 917        if (kvm->debugfs_stat_data) {
 918                for (i = 0; i < kvm_debugfs_num_entries; i++)
 919                        kfree(kvm->debugfs_stat_data[i]);
 920                kfree(kvm->debugfs_stat_data);
 921        }
 922}
 923
 924static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 925{
 926        static DEFINE_MUTEX(kvm_debugfs_lock);
 927        struct dentry *dent;
 928        char dir_name[ITOA_MAX_LEN * 2];
 929        struct kvm_stat_data *stat_data;
 930        const struct _kvm_stats_desc *pdesc;
 931        int i, ret;
 932        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
 933                                      kvm_vcpu_stats_header.num_desc;
 934
 935        if (!debugfs_initialized())
 936                return 0;
 937
 938        snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
 939        mutex_lock(&kvm_debugfs_lock);
 940        dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
 941        if (dent) {
 942                pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
 943                dput(dent);
 944                mutex_unlock(&kvm_debugfs_lock);
 945                return 0;
 946        }
 947        dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
 948        mutex_unlock(&kvm_debugfs_lock);
 949        if (IS_ERR(dent))
 950                return 0;
 951
 952        kvm->debugfs_dentry = dent;
 953        kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
 954                                         sizeof(*kvm->debugfs_stat_data),
 955                                         GFP_KERNEL_ACCOUNT);
 956        if (!kvm->debugfs_stat_data)
 957                return -ENOMEM;
 958
 959        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
 960                pdesc = &kvm_vm_stats_desc[i];
 961                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 962                if (!stat_data)
 963                        return -ENOMEM;
 964
 965                stat_data->kvm = kvm;
 966                stat_data->desc = pdesc;
 967                stat_data->kind = KVM_STAT_VM;
 968                kvm->debugfs_stat_data[i] = stat_data;
 969                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
 970                                    kvm->debugfs_dentry, stat_data,
 971                                    &stat_fops_per_vm);
 972        }
 973
 974        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
 975                pdesc = &kvm_vcpu_stats_desc[i];
 976                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 977                if (!stat_data)
 978                        return -ENOMEM;
 979
 980                stat_data->kvm = kvm;
 981                stat_data->desc = pdesc;
 982                stat_data->kind = KVM_STAT_VCPU;
 983                kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
 984                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
 985                                    kvm->debugfs_dentry, stat_data,
 986                                    &stat_fops_per_vm);
 987        }
 988
 989        ret = kvm_arch_create_vm_debugfs(kvm);
 990        if (ret) {
 991                kvm_destroy_vm_debugfs(kvm);
 992                return i;
 993        }
 994
 995        return 0;
 996}
 997
 998/*
 999 * Called after the VM is otherwise initialized, but just before adding it to
1000 * the vm_list.
1001 */
1002int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1003{
1004        return 0;
1005}
1006
1007/*
1008 * Called just after removing the VM from the vm_list, but before doing any
1009 * other destruction.
1010 */
1011void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1012{
1013}
1014
1015/*
1016 * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
1017 * be setup already, so we can create arch-specific debugfs entries under it.
1018 * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1019 * a per-arch destroy interface is not needed.
1020 */
1021int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1022{
1023        return 0;
1024}
1025
1026static struct kvm *kvm_create_vm(unsigned long type)
1027{
1028        struct kvm *kvm = kvm_arch_alloc_vm();
1029        int r = -ENOMEM;
1030        int i;
1031
1032        if (!kvm)
1033                return ERR_PTR(-ENOMEM);
1034
1035        KVM_MMU_LOCK_INIT(kvm);
1036        mmgrab(current->mm);
1037        kvm->mm = current->mm;
1038        kvm_eventfd_init(kvm);
1039        mutex_init(&kvm->lock);
1040        mutex_init(&kvm->irq_lock);
1041        mutex_init(&kvm->slots_lock);
1042        mutex_init(&kvm->slots_arch_lock);
1043        spin_lock_init(&kvm->mn_invalidate_lock);
1044        rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1045
1046        INIT_LIST_HEAD(&kvm->devices);
1047
1048        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1049
1050        if (init_srcu_struct(&kvm->srcu))
1051                goto out_err_no_srcu;
1052        if (init_srcu_struct(&kvm->irq_srcu))
1053                goto out_err_no_irq_srcu;
1054
1055        refcount_set(&kvm->users_count, 1);
1056        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1057                struct kvm_memslots *slots = kvm_alloc_memslots();
1058
1059                if (!slots)
1060                        goto out_err_no_arch_destroy_vm;
1061                /* Generations must be different for each address space. */
1062                slots->generation = i;
1063                rcu_assign_pointer(kvm->memslots[i], slots);
1064        }
1065
1066        for (i = 0; i < KVM_NR_BUSES; i++) {
1067                rcu_assign_pointer(kvm->buses[i],
1068                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1069                if (!kvm->buses[i])
1070                        goto out_err_no_arch_destroy_vm;
1071        }
1072
1073        kvm->max_halt_poll_ns = halt_poll_ns;
1074
1075        r = kvm_arch_init_vm(kvm, type);
1076        if (r)
1077                goto out_err_no_arch_destroy_vm;
1078
1079        r = hardware_enable_all();
1080        if (r)
1081                goto out_err_no_disable;
1082
1083#ifdef CONFIG_HAVE_KVM_IRQFD
1084        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1085#endif
1086
1087        r = kvm_init_mmu_notifier(kvm);
1088        if (r)
1089                goto out_err_no_mmu_notifier;
1090
1091        r = kvm_arch_post_init_vm(kvm);
1092        if (r)
1093                goto out_err;
1094
1095        mutex_lock(&kvm_lock);
1096        list_add(&kvm->vm_list, &vm_list);
1097        mutex_unlock(&kvm_lock);
1098
1099        preempt_notifier_inc();
1100        kvm_init_pm_notifier(kvm);
1101
1102        return kvm;
1103
1104out_err:
1105#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1106        if (kvm->mmu_notifier.ops)
1107                mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1108#endif
1109out_err_no_mmu_notifier:
1110        hardware_disable_all();
1111out_err_no_disable:
1112        kvm_arch_destroy_vm(kvm);
1113out_err_no_arch_destroy_vm:
1114        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1115        for (i = 0; i < KVM_NR_BUSES; i++)
1116                kfree(kvm_get_bus(kvm, i));
1117        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1118                kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1119        cleanup_srcu_struct(&kvm->irq_srcu);
1120out_err_no_irq_srcu:
1121        cleanup_srcu_struct(&kvm->srcu);
1122out_err_no_srcu:
1123        kvm_arch_free_vm(kvm);
1124        mmdrop(current->mm);
1125        return ERR_PTR(r);
1126}
1127
1128static void kvm_destroy_devices(struct kvm *kvm)
1129{
1130        struct kvm_device *dev, *tmp;
1131
1132        /*
1133         * We do not need to take the kvm->lock here, because nobody else
1134         * has a reference to the struct kvm at this point and therefore
1135         * cannot access the devices list anyhow.
1136         */
1137        list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1138                list_del(&dev->vm_node);
1139                dev->ops->destroy(dev);
1140        }
1141}
1142
1143static void kvm_destroy_vm(struct kvm *kvm)
1144{
1145        int i;
1146        struct mm_struct *mm = kvm->mm;
1147
1148        kvm_destroy_pm_notifier(kvm);
1149        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1150        kvm_destroy_vm_debugfs(kvm);
1151        kvm_arch_sync_events(kvm);
1152        mutex_lock(&kvm_lock);
1153        list_del(&kvm->vm_list);
1154        mutex_unlock(&kvm_lock);
1155        kvm_arch_pre_destroy_vm(kvm);
1156
1157        kvm_free_irq_routing(kvm);
1158        for (i = 0; i < KVM_NR_BUSES; i++) {
1159                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
1160
1161                if (bus)
1162                        kvm_io_bus_destroy(bus);
1163                kvm->buses[i] = NULL;
1164        }
1165        kvm_coalesced_mmio_free(kvm);
1166#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1167        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
1168        /*
1169         * At this point, pending calls to invalidate_range_start()
1170         * have completed but no more MMU notifiers will run, so
1171         * mn_active_invalidate_count may remain unbalanced.
1172         * No threads can be waiting in install_new_memslots as the
1173         * last reference on KVM has been dropped, but freeing
1174         * memslots would deadlock without this manual intervention.
1175         */
1176        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1177        kvm->mn_active_invalidate_count = 0;
1178#else
1179        kvm_arch_flush_shadow_all(kvm);
1180#endif
1181        kvm_arch_destroy_vm(kvm);
1182        kvm_destroy_devices(kvm);
1183        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1184                kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1185        cleanup_srcu_struct(&kvm->irq_srcu);
1186        cleanup_srcu_struct(&kvm->srcu);
1187        kvm_arch_free_vm(kvm);
1188        preempt_notifier_dec();
1189        hardware_disable_all();
1190        mmdrop(mm);
1191}
1192
1193void kvm_get_kvm(struct kvm *kvm)
1194{
1195        refcount_inc(&kvm->users_count);
1196}
1197EXPORT_SYMBOL_GPL(kvm_get_kvm);
1198
1199/*
1200 * Make sure the vm is not during destruction, which is a safe version of
1201 * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
1202 */
1203bool kvm_get_kvm_safe(struct kvm *kvm)
1204{
1205        return refcount_inc_not_zero(&kvm->users_count);
1206}
1207EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1208
1209void kvm_put_kvm(struct kvm *kvm)
1210{
1211        if (refcount_dec_and_test(&kvm->users_count))
1212                kvm_destroy_vm(kvm);
1213}
1214EXPORT_SYMBOL_GPL(kvm_put_kvm);
1215
1216/*
1217 * Used to put a reference that was taken on behalf of an object associated
1218 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
1219 * of the new file descriptor fails and the reference cannot be transferred to
1220 * its final owner.  In such cases, the caller is still actively using @kvm and
1221 * will fail miserably if the refcount unexpectedly hits zero.
1222 */
1223void kvm_put_kvm_no_destroy(struct kvm *kvm)
1224{
1225        WARN_ON(refcount_dec_and_test(&kvm->users_count));
1226}
1227EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1228
1229static int kvm_vm_release(struct inode *inode, struct file *filp)
1230{
1231        struct kvm *kvm = filp->private_data;
1232
1233        kvm_irqfd_release(kvm);
1234
1235        kvm_put_kvm(kvm);
1236        return 0;
1237}
1238
1239/*
1240 * Allocation size is twice as large as the actual dirty bitmap size.
1241 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1242 */
1243static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1244{
1245        unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
1246
1247        memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
1248        if (!memslot->dirty_bitmap)
1249                return -ENOMEM;
1250
1251        return 0;
1252}
1253
1254/*
1255 * Delete a memslot by decrementing the number of used slots and shifting all
1256 * other entries in the array forward one spot.
1257 */
1258static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1259                                      struct kvm_memory_slot *memslot)
1260{
1261        struct kvm_memory_slot *mslots = slots->memslots;
1262        int i;
1263
1264        if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1265                return;
1266
1267        slots->used_slots--;
1268
1269        if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
1270                atomic_set(&slots->last_used_slot, 0);
1271
1272        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1273                mslots[i] = mslots[i + 1];
1274                slots->id_to_index[mslots[i].id] = i;
1275        }
1276        mslots[i] = *memslot;
1277        slots->id_to_index[memslot->id] = -1;
1278}
1279
1280/*
1281 * "Insert" a new memslot by incrementing the number of used slots.  Returns
1282 * the new slot's initial index into the memslots array.
1283 */
1284static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
1285{
1286        return slots->used_slots++;
1287}
1288
1289/*
1290 * Move a changed memslot backwards in the array by shifting existing slots
1291 * with a higher GFN toward the front of the array.  Note, the changed memslot
1292 * itself is not preserved in the array, i.e. not swapped at this time, only
1293 * its new index into the array is tracked.  Returns the changed memslot's
1294 * current index into the memslots array.
1295 */
1296static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1297                                            struct kvm_memory_slot *memslot)
1298{
1299        struct kvm_memory_slot *mslots = slots->memslots;
1300        int i;
1301
1302        if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1303            WARN_ON_ONCE(!slots->used_slots))
1304                return -1;
1305
1306        /*
1307         * Move the target memslot backward in the array by shifting existing
1308         * memslots with a higher GFN (than the target memslot) towards the
1309         * front of the array.
1310         */
1311        for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1312                if (memslot->base_gfn > mslots[i + 1].base_gfn)
1313                        break;
1314
1315                WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1316
1317                /* Shift the next memslot forward one and update its index. */
1318                mslots[i] = mslots[i + 1];
1319                slots->id_to_index[mslots[i].id] = i;
1320        }
1321        return i;
1322}
1323
1324/*
1325 * Move a changed memslot forwards in the array by shifting existing slots with
1326 * a lower GFN toward the back of the array.  Note, the changed memslot itself
1327 * is not preserved in the array, i.e. not swapped at this time, only its new
1328 * index into the array is tracked.  Returns the changed memslot's final index
1329 * into the memslots array.
1330 */
1331static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1332                                           struct kvm_memory_slot *memslot,
1333                                           int start)
1334{
1335        struct kvm_memory_slot *mslots = slots->memslots;
1336        int i;
1337
1338        for (i = start; i > 0; i--) {
1339                if (memslot->base_gfn < mslots[i - 1].base_gfn)
1340                        break;
1341
1342                WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1343
1344                /* Shift the next memslot back one and update its index. */
1345                mslots[i] = mslots[i - 1];
1346                slots->id_to_index[mslots[i].id] = i;
1347        }
1348        return i;
1349}
1350
1351/*
1352 * Re-sort memslots based on their GFN to account for an added, deleted, or
1353 * moved memslot.  Sorting memslots by GFN allows using a binary search during
1354 * memslot lookup.
1355 *
1356 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
1357 * at memslots[0] has the highest GFN.
1358 *
1359 * The sorting algorithm takes advantage of having initially sorted memslots
1360 * and knowing the position of the changed memslot.  Sorting is also optimized
1361 * by not swapping the updated memslot and instead only shifting other memslots
1362 * and tracking the new index for the update memslot.  Only once its final
1363 * index is known is the updated memslot copied into its position in the array.
1364 *
1365 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
1366 *    the end of the array.
1367 *
1368 *  - When creating a memslot, the algorithm "inserts" the new memslot at the
1369 *    end of the array and then it forward to its correct location.
1370 *
1371 *  - When moving a memslot, the algorithm first moves the updated memslot
1372 *    backward to handle the scenario where the memslot's GFN was changed to a
1373 *    lower value.  update_memslots() then falls through and runs the same flow
1374 *    as creating a memslot to move the memslot forward to handle the scenario
1375 *    where its GFN was changed to a higher value.
1376 *
1377 * Note, slots are sorted from highest->lowest instead of lowest->highest for
1378 * historical reasons.  Originally, invalid memslots where denoted by having
1379 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
1380 * to the end of the array.  The current algorithm uses dedicated logic to
1381 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1382 *
1383 * The other historical motiviation for highest->lowest was to improve the
1384 * performance of memslot lookup.  KVM originally used a linear search starting
1385 * at memslots[0].  On x86, the largest memslot usually has one of the highest,
1386 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
1387 * single memslot above the 4gb boundary.  As the largest memslot is also the
1388 * most likely to be referenced, sorting it to the front of the array was
1389 * advantageous.  The current binary search starts from the middle of the array
1390 * and uses an LRU pointer to improve performance for all memslots and GFNs.
1391 */
1392static void update_memslots(struct kvm_memslots *slots,
1393                            struct kvm_memory_slot *memslot,
1394                            enum kvm_mr_change change)
1395{
1396        int i;
1397
1398        if (change == KVM_MR_DELETE) {
1399                kvm_memslot_delete(slots, memslot);
1400        } else {
1401                if (change == KVM_MR_CREATE)
1402                        i = kvm_memslot_insert_back(slots);
1403                else
1404                        i = kvm_memslot_move_backward(slots, memslot);
1405                i = kvm_memslot_move_forward(slots, memslot, i);
1406
1407                /*
1408                 * Copy the memslot to its new position in memslots and update
1409                 * its index accordingly.
1410                 */
1411                slots->memslots[i] = *memslot;
1412                slots->id_to_index[memslot->id] = i;
1413        }
1414}
1415
1416static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1417{
1418        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1419
1420#ifdef __KVM_HAVE_READONLY_MEM
1421        valid_flags |= KVM_MEM_READONLY;
1422#endif
1423
1424        if (mem->flags & ~valid_flags)
1425                return -EINVAL;
1426
1427        return 0;
1428}
1429
1430static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
1431                int as_id, struct kvm_memslots *slots)
1432{
1433        struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
1434        u64 gen = old_memslots->generation;
1435
1436        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1437        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1438
1439        /*
1440         * Do not store the new memslots while there are invalidations in
1441         * progress, otherwise the locking in invalidate_range_start and
1442         * invalidate_range_end will be unbalanced.
1443         */
1444        spin_lock(&kvm->mn_invalidate_lock);
1445        prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
1446        while (kvm->mn_active_invalidate_count) {
1447                set_current_state(TASK_UNINTERRUPTIBLE);
1448                spin_unlock(&kvm->mn_invalidate_lock);
1449                schedule();
1450                spin_lock(&kvm->mn_invalidate_lock);
1451        }
1452        finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
1453        rcu_assign_pointer(kvm->memslots[as_id], slots);
1454        spin_unlock(&kvm->mn_invalidate_lock);
1455
1456        /*
1457         * Acquired in kvm_set_memslot. Must be released before synchronize
1458         * SRCU below in order to avoid deadlock with another thread
1459         * acquiring the slots_arch_lock in an srcu critical section.
1460         */
1461        mutex_unlock(&kvm->slots_arch_lock);
1462
1463        synchronize_srcu_expedited(&kvm->srcu);
1464
1465        /*
1466         * Increment the new memslot generation a second time, dropping the
1467         * update in-progress flag and incrementing the generation based on
1468         * the number of address spaces.  This provides a unique and easily
1469         * identifiable generation number while the memslots are in flux.
1470         */
1471        gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1472
1473        /*
1474         * Generations must be unique even across address spaces.  We do not need
1475         * a global counter for that, instead the generation space is evenly split
1476         * across address spaces.  For example, with two address spaces, address
1477         * space 0 will use generations 0, 2, 4, ... while address space 1 will
1478         * use generations 1, 3, 5, ...
1479         */
1480        gen += KVM_ADDRESS_SPACE_NUM;
1481
1482        kvm_arch_memslots_updated(kvm, gen);
1483
1484        slots->generation = gen;
1485
1486        return old_memslots;
1487}
1488
1489static size_t kvm_memslots_size(int slots)
1490{
1491        return sizeof(struct kvm_memslots) +
1492               (sizeof(struct kvm_memory_slot) * slots);
1493}
1494
1495static void kvm_copy_memslots(struct kvm_memslots *to,
1496                              struct kvm_memslots *from)
1497{
1498        memcpy(to, from, kvm_memslots_size(from->used_slots));
1499}
1500
1501/*
1502 * Note, at a minimum, the current number of used slots must be allocated, even
1503 * when deleting a memslot, as we need a complete duplicate of the memslots for
1504 * use when invalidating a memslot prior to deleting/moving the memslot.
1505 */
1506static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1507                                             enum kvm_mr_change change)
1508{
1509        struct kvm_memslots *slots;
1510        size_t new_size;
1511
1512        if (change == KVM_MR_CREATE)
1513                new_size = kvm_memslots_size(old->used_slots + 1);
1514        else
1515                new_size = kvm_memslots_size(old->used_slots);
1516
1517        slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1518        if (likely(slots))
1519                kvm_copy_memslots(slots, old);
1520
1521        return slots;
1522}
1523
1524static int kvm_set_memslot(struct kvm *kvm,
1525                           const struct kvm_userspace_memory_region *mem,
1526                           struct kvm_memory_slot *old,
1527                           struct kvm_memory_slot *new, int as_id,
1528                           enum kvm_mr_change change)
1529{
1530        struct kvm_memory_slot *slot;
1531        struct kvm_memslots *slots;
1532        int r;
1533
1534        /*
1535         * Released in install_new_memslots.
1536         *
1537         * Must be held from before the current memslots are copied until
1538         * after the new memslots are installed with rcu_assign_pointer,
1539         * then released before the synchronize srcu in install_new_memslots.
1540         *
1541         * When modifying memslots outside of the slots_lock, must be held
1542         * before reading the pointer to the current memslots until after all
1543         * changes to those memslots are complete.
1544         *
1545         * These rules ensure that installing new memslots does not lose
1546         * changes made to the previous memslots.
1547         */
1548        mutex_lock(&kvm->slots_arch_lock);
1549
1550        slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
1551        if (!slots) {
1552                mutex_unlock(&kvm->slots_arch_lock);
1553                return -ENOMEM;
1554        }
1555
1556        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1557                /*
1558                 * Note, the INVALID flag needs to be in the appropriate entry
1559                 * in the freshly allocated memslots, not in @old or @new.
1560                 */
1561                slot = id_to_memslot(slots, old->id);
1562                slot->flags |= KVM_MEMSLOT_INVALID;
1563
1564                /*
1565                 * We can re-use the memory from the old memslots.
1566                 * It will be overwritten with a copy of the new memslots
1567                 * after reacquiring the slots_arch_lock below.
1568                 */
1569                slots = install_new_memslots(kvm, as_id, slots);
1570
1571                /* From this point no new shadow pages pointing to a deleted,
1572                 * or moved, memslot will be created.
1573                 *
1574                 * validation of sp->gfn happens in:
1575                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1576                 *      - kvm_is_visible_gfn (mmu_check_root)
1577                 */
1578                kvm_arch_flush_shadow_memslot(kvm, slot);
1579
1580                /* Released in install_new_memslots. */
1581                mutex_lock(&kvm->slots_arch_lock);
1582
1583                /*
1584                 * The arch-specific fields of the memslots could have changed
1585                 * between releasing the slots_arch_lock in
1586                 * install_new_memslots and here, so get a fresh copy of the
1587                 * slots.
1588                 */
1589                kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
1590        }
1591
1592        r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
1593        if (r)
1594                goto out_slots;
1595
1596        update_memslots(slots, new, change);
1597        slots = install_new_memslots(kvm, as_id, slots);
1598
1599        kvm_arch_commit_memory_region(kvm, mem, old, new, change);
1600
1601        kvfree(slots);
1602        return 0;
1603
1604out_slots:
1605        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
1606                slot = id_to_memslot(slots, old->id);
1607                slot->flags &= ~KVM_MEMSLOT_INVALID;
1608                slots = install_new_memslots(kvm, as_id, slots);
1609        } else {
1610                mutex_unlock(&kvm->slots_arch_lock);
1611        }
1612        kvfree(slots);
1613        return r;
1614}
1615
1616static int kvm_delete_memslot(struct kvm *kvm,
1617                              const struct kvm_userspace_memory_region *mem,
1618                              struct kvm_memory_slot *old, int as_id)
1619{
1620        struct kvm_memory_slot new;
1621        int r;
1622
1623        if (!old->npages)
1624                return -EINVAL;
1625
1626        memset(&new, 0, sizeof(new));
1627        new.id = old->id;
1628        /*
1629         * This is only for debugging purpose; it should never be referenced
1630         * for a removed memslot.
1631         */
1632        new.as_id = as_id;
1633
1634        r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
1635        if (r)
1636                return r;
1637
1638        kvm_free_memslot(kvm, old);
1639        return 0;
1640}
1641
1642/*
1643 * Allocate some memory and give it an address in the guest physical address
1644 * space.
1645 *
1646 * Discontiguous memory is allowed, mostly for framebuffers.
1647 *
1648 * Must be called holding kvm->slots_lock for write.
1649 */
1650int __kvm_set_memory_region(struct kvm *kvm,
1651                            const struct kvm_userspace_memory_region *mem)
1652{
1653        struct kvm_memory_slot old, new;
1654        struct kvm_memory_slot *tmp;
1655        enum kvm_mr_change change;
1656        int as_id, id;
1657        int r;
1658
1659        r = check_memory_region_flags(mem);
1660        if (r)
1661                return r;
1662
1663        as_id = mem->slot >> 16;
1664        id = (u16)mem->slot;
1665
1666        /* General sanity checks */
1667        if (mem->memory_size & (PAGE_SIZE - 1))
1668                return -EINVAL;
1669        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1670                return -EINVAL;
1671        /* We can read the guest memory with __xxx_user() later on. */
1672        if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1673            (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
1674             !access_ok((void __user *)(unsigned long)mem->userspace_addr,
1675                        mem->memory_size))
1676                return -EINVAL;
1677        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1678                return -EINVAL;
1679        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1680                return -EINVAL;
1681
1682        /*
1683         * Make a full copy of the old memslot, the pointer will become stale
1684         * when the memslots are re-sorted by update_memslots(), and the old
1685         * memslot needs to be referenced after calling update_memslots(), e.g.
1686         * to free its resources and for arch specific behavior.
1687         */
1688        tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1689        if (tmp) {
1690                old = *tmp;
1691                tmp = NULL;
1692        } else {
1693                memset(&old, 0, sizeof(old));
1694                old.id = id;
1695        }
1696
1697        if (!mem->memory_size)
1698                return kvm_delete_memslot(kvm, mem, &old, as_id);
1699
1700        new.as_id = as_id;
1701        new.id = id;
1702        new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1703        new.npages = mem->memory_size >> PAGE_SHIFT;
1704        new.flags = mem->flags;
1705        new.userspace_addr = mem->userspace_addr;
1706
1707        if (new.npages > KVM_MEM_MAX_NR_PAGES)
1708                return -EINVAL;
1709
1710        if (!old.npages) {
1711                change = KVM_MR_CREATE;
1712                new.dirty_bitmap = NULL;
1713                memset(&new.arch, 0, sizeof(new.arch));
1714        } else { /* Modify an existing slot. */
1715                if ((new.userspace_addr != old.userspace_addr) ||
1716                    (new.npages != old.npages) ||
1717                    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1718                        return -EINVAL;
1719
1720                if (new.base_gfn != old.base_gfn)
1721                        change = KVM_MR_MOVE;
1722                else if (new.flags != old.flags)
1723                        change = KVM_MR_FLAGS_ONLY;
1724                else /* Nothing to change. */
1725                        return 0;
1726
1727                /* Copy dirty_bitmap and arch from the current memslot. */
1728                new.dirty_bitmap = old.dirty_bitmap;
1729                memcpy(&new.arch, &old.arch, sizeof(new.arch));
1730        }
1731
1732        if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1733                /* Check for overlaps */
1734                kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
1735                        if (tmp->id == id)
1736                                continue;
1737                        if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
1738                              (new.base_gfn >= tmp->base_gfn + tmp->npages)))
1739                                return -EEXIST;
1740                }
1741        }
1742
1743        /* Allocate/free page dirty bitmap as needed */
1744        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1745                new.dirty_bitmap = NULL;
1746        else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
1747                r = kvm_alloc_dirty_bitmap(&new);
1748                if (r)
1749                        return r;
1750
1751                if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1752                        bitmap_set(new.dirty_bitmap, 0, new.npages);
1753        }
1754
1755        r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
1756        if (r)
1757                goto out_bitmap;
1758
1759        if (old.dirty_bitmap && !new.dirty_bitmap)
1760                kvm_destroy_dirty_bitmap(&old);
1761        return 0;
1762
1763out_bitmap:
1764        if (new.dirty_bitmap && !old.dirty_bitmap)
1765                kvm_destroy_dirty_bitmap(&new);
1766        return r;
1767}
1768EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1769
1770int kvm_set_memory_region(struct kvm *kvm,
1771                          const struct kvm_userspace_memory_region *mem)
1772{
1773        int r;
1774
1775        mutex_lock(&kvm->slots_lock);
1776        r = __kvm_set_memory_region(kvm, mem);
1777        mutex_unlock(&kvm->slots_lock);
1778        return r;
1779}
1780EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1781
1782static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1783                                          struct kvm_userspace_memory_region *mem)
1784{
1785        if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1786                return -EINVAL;
1787
1788        return kvm_set_memory_region(kvm, mem);
1789}
1790
1791#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1792/**
1793 * kvm_get_dirty_log - get a snapshot of dirty pages
1794 * @kvm:        pointer to kvm instance
1795 * @log:        slot id and address to which we copy the log
1796 * @is_dirty:   set to '1' if any dirty pages were found
1797 * @memslot:    set to the associated memslot, always valid on success
1798 */
1799int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1800                      int *is_dirty, struct kvm_memory_slot **memslot)
1801{
1802        struct kvm_memslots *slots;
1803        int i, as_id, id;
1804        unsigned long n;
1805        unsigned long any = 0;
1806
1807        /* Dirty ring tracking is exclusive to dirty log tracking */
1808        if (kvm->dirty_ring_size)
1809                return -ENXIO;
1810
1811        *memslot = NULL;
1812        *is_dirty = 0;
1813
1814        as_id = log->slot >> 16;
1815        id = (u16)log->slot;
1816        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1817                return -EINVAL;
1818
1819        slots = __kvm_memslots(kvm, as_id);
1820        *memslot = id_to_memslot(slots, id);
1821        if (!(*memslot) || !(*memslot)->dirty_bitmap)
1822                return -ENOENT;
1823
1824        kvm_arch_sync_dirty_log(kvm, *memslot);
1825
1826        n = kvm_dirty_bitmap_bytes(*memslot);
1827
1828        for (i = 0; !any && i < n/sizeof(long); ++i)
1829                any = (*memslot)->dirty_bitmap[i];
1830
1831        if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1832                return -EFAULT;
1833
1834        if (any)
1835                *is_dirty = 1;
1836        return 0;
1837}
1838EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1839
1840#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
1841/**
1842 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
1843 *      and reenable dirty page tracking for the corresponding pages.
1844 * @kvm:        pointer to kvm instance
1845 * @log:        slot id and address to which we copy the log
1846 *
1847 * We need to keep it in mind that VCPU threads can write to the bitmap
1848 * concurrently. So, to avoid losing track of dirty pages we keep the
1849 * following order:
1850 *
1851 *    1. Take a snapshot of the bit and clear it if needed.
1852 *    2. Write protect the corresponding page.
1853 *    3. Copy the snapshot to the userspace.
1854 *    4. Upon return caller flushes TLB's if needed.
1855 *
1856 * Between 2 and 4, the guest may write to the page using the remaining TLB
1857 * entry.  This is not a problem because the page is reported dirty using
1858 * the snapshot taken before and step 4 ensures that writes done after
1859 * exiting to userspace will be logged for the next call.
1860 *
1861 */
1862static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
1863{
1864        struct kvm_memslots *slots;
1865        struct kvm_memory_slot *memslot;
1866        int i, as_id, id;
1867        unsigned long n;
1868        unsigned long *dirty_bitmap;
1869        unsigned long *dirty_bitmap_buffer;
1870        bool flush;
1871
1872        /* Dirty ring tracking is exclusive to dirty log tracking */
1873        if (kvm->dirty_ring_size)
1874                return -ENXIO;
1875
1876        as_id = log->slot >> 16;
1877        id = (u16)log->slot;
1878        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1879                return -EINVAL;
1880
1881        slots = __kvm_memslots(kvm, as_id);
1882        memslot = id_to_memslot(slots, id);
1883        if (!memslot || !memslot->dirty_bitmap)
1884                return -ENOENT;
1885
1886        dirty_bitmap = memslot->dirty_bitmap;
1887
1888        kvm_arch_sync_dirty_log(kvm, memslot);
1889
1890        n = kvm_dirty_bitmap_bytes(memslot);
1891        flush = false;
1892        if (kvm->manual_dirty_log_protect) {
1893                /*
1894                 * Unlike kvm_get_dirty_log, we always return false in *flush,
1895                 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
1896                 * is some code duplication between this function and
1897                 * kvm_get_dirty_log, but hopefully all architecture
1898                 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
1899                 * can be eliminated.
1900                 */
1901                dirty_bitmap_buffer = dirty_bitmap;
1902        } else {
1903                dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1904                memset(dirty_bitmap_buffer, 0, n);
1905
1906                KVM_MMU_LOCK(kvm);
1907                for (i = 0; i < n / sizeof(long); i++) {
1908                        unsigned long mask;
1909                        gfn_t offset;
1910
1911                        if (!dirty_bitmap[i])
1912                                continue;
1913
1914                        flush = true;
1915                        mask = xchg(&dirty_bitmap[i], 0);
1916                        dirty_bitmap_buffer[i] = mask;
1917
1918                        offset = i * BITS_PER_LONG;
1919                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1920                                                                offset, mask);
1921                }
1922                KVM_MMU_UNLOCK(kvm);
1923        }
1924
1925        if (flush)
1926                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
1927
1928        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1929                return -EFAULT;
1930        return 0;
1931}
1932
1933
1934/**
1935 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
1936 * @kvm: kvm instance
1937 * @log: slot id and address to which we copy the log
1938 *
1939 * Steps 1-4 below provide general overview of dirty page logging. See
1940 * kvm_get_dirty_log_protect() function description for additional details.
1941 *
1942 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
1943 * always flush the TLB (step 4) even if previous step failed  and the dirty
1944 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
1945 * does not preclude user space subsequent dirty log read. Flushing TLB ensures
1946 * writes will be marked dirty for next log read.
1947 *
1948 *   1. Take a snapshot of the bit and clear it if needed.
1949 *   2. Write protect the corresponding page.
1950 *   3. Copy the snapshot to the userspace.
1951 *   4. Flush TLB's if needed.
1952 */
1953static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1954                                      struct kvm_dirty_log *log)
1955{
1956        int r;
1957
1958        mutex_lock(&kvm->slots_lock);
1959
1960        r = kvm_get_dirty_log_protect(kvm, log);
1961
1962        mutex_unlock(&kvm->slots_lock);
1963        return r;
1964}
1965
1966/**
1967 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
1968 *      and reenable dirty page tracking for the corresponding pages.
1969 * @kvm:        pointer to kvm instance
1970 * @log:        slot id and address from which to fetch the bitmap of dirty pages
1971 */
1972static int kvm_clear_dirty_log_protect(struct kvm *kvm,
1973                                       struct kvm_clear_dirty_log *log)
1974{
1975        struct kvm_memslots *slots;
1976        struct kvm_memory_slot *memslot;
1977        int as_id, id;
1978        gfn_t offset;
1979        unsigned long i, n;
1980        unsigned long *dirty_bitmap;
1981        unsigned long *dirty_bitmap_buffer;
1982        bool flush;
1983
1984        /* Dirty ring tracking is exclusive to dirty log tracking */
1985        if (kvm->dirty_ring_size)
1986                return -ENXIO;
1987
1988        as_id = log->slot >> 16;
1989        id = (u16)log->slot;
1990        if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1991                return -EINVAL;
1992
1993        if (log->first_page & 63)
1994                return -EINVAL;
1995
1996        slots = __kvm_memslots(kvm, as_id);
1997        memslot = id_to_memslot(slots, id);
1998        if (!memslot || !memslot->dirty_bitmap)
1999                return -ENOENT;
2000
2001        dirty_bitmap = memslot->dirty_bitmap;
2002
2003        n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
2004
2005        if (log->first_page > memslot->npages ||
2006            log->num_pages > memslot->npages - log->first_page ||
2007            (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
2008            return -EINVAL;
2009
2010        kvm_arch_sync_dirty_log(kvm, memslot);
2011
2012        flush = false;
2013        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2014        if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
2015                return -EFAULT;
2016
2017        KVM_MMU_LOCK(kvm);
2018        for (offset = log->first_page, i = offset / BITS_PER_LONG,
2019                 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2020             i++, offset += BITS_PER_LONG) {
2021                unsigned long mask = *dirty_bitmap_buffer++;
2022                atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
2023                if (!mask)
2024                        continue;
2025
2026                mask &= atomic_long_fetch_andnot(mask, p);
2027
2028                /*
2029                 * mask contains the bits that really have been cleared.  This
2030                 * never includes any bits beyond the length of the memslot (if
2031                 * the length is not aligned to 64 pages), therefore it is not
2032                 * a problem if userspace sets them in log->dirty_bitmap.
2033                */
2034                if (mask) {
2035                        flush = true;
2036                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
2037                                                                offset, mask);
2038                }
2039        }
2040        KVM_MMU_UNLOCK(kvm);
2041
2042        if (flush)
2043                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
2044
2045        return 0;
2046}
2047
2048static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2049                                        struct kvm_clear_dirty_log *log)
2050{
2051        int r;
2052
2053        mutex_lock(&kvm->slots_lock);
2054
2055        r = kvm_clear_dirty_log_protect(kvm, log);
2056
2057        mutex_unlock(&kvm->slots_lock);
2058        return r;
2059}
2060#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2061
2062struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2063{
2064        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2065}
2066EXPORT_SYMBOL_GPL(gfn_to_memslot);
2067
2068struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2069{
2070        struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2071        struct kvm_memory_slot *slot;
2072        int slot_index;
2073
2074        slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
2075        if (slot)
2076                return slot;
2077
2078        /*
2079         * Fall back to searching all memslots. We purposely use
2080         * search_memslots() instead of __gfn_to_memslot() to avoid
2081         * thrashing the VM-wide last_used_index in kvm_memslots.
2082         */
2083        slot = search_memslots(slots, gfn, &slot_index);
2084        if (slot) {
2085                vcpu->last_used_slot = slot_index;
2086                return slot;
2087        }
2088
2089        return NULL;
2090}
2091EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);
2092
2093bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2094{
2095        struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2096
2097        return kvm_is_visible_memslot(memslot);
2098}
2099EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2100
2101bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2102{
2103        struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2104
2105        return kvm_is_visible_memslot(memslot);
2106}
2107EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2108
2109unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2110{
2111        struct vm_area_struct *vma;
2112        unsigned long addr, size;
2113
2114        size = PAGE_SIZE;
2115
2116        addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2117        if (kvm_is_error_hva(addr))
2118                return PAGE_SIZE;
2119
2120        mmap_read_lock(current->mm);
2121        vma = find_vma(current->mm, addr);
2122        if (!vma)
2123                goto out;
2124
2125        size = vma_kernel_pagesize(vma);
2126
2127out:
2128        mmap_read_unlock(current->mm);
2129
2130        return size;
2131}
2132
2133static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2134{
2135        return slot->flags & KVM_MEM_READONLY;
2136}
2137
2138static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2139                                       gfn_t *nr_pages, bool write)
2140{
2141        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2142                return KVM_HVA_ERR_BAD;
2143
2144        if (memslot_is_readonly(slot) && write)
2145                return KVM_HVA_ERR_RO_BAD;
2146
2147        if (nr_pages)
2148                *nr_pages = slot->npages - (gfn - slot->base_gfn);
2149
2150        return __gfn_to_hva_memslot(slot, gfn);
2151}
2152
2153static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2154                                     gfn_t *nr_pages)
2155{
2156        return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2157}
2158
2159unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2160                                        gfn_t gfn)
2161{
2162        return gfn_to_hva_many(slot, gfn, NULL);
2163}
2164EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2165
2166unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2167{
2168        return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2169}
2170EXPORT_SYMBOL_GPL(gfn_to_hva);
2171
2172unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2173{
2174        return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2175}
2176EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2177
2178/*
2179 * Return the hva of a @gfn and the R/W attribute if possible.
2180 *
2181 * @slot: the kvm_memory_slot which contains @gfn
2182 * @gfn: the gfn to be translated
2183 * @writable: used to return the read/write attribute of the @slot if the hva
2184 * is valid and @writable is not NULL
2185 */
2186unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2187                                      gfn_t gfn, bool *writable)
2188{
2189        unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2190
2191        if (!kvm_is_error_hva(hva) && writable)
2192                *writable = !memslot_is_readonly(slot);
2193
2194        return hva;
2195}
2196
2197unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2198{
2199        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2200
2201        return gfn_to_hva_memslot_prot(slot, gfn, writable);
2202}
2203
2204unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2205{
2206        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2207
2208        return gfn_to_hva_memslot_prot(slot, gfn, writable);
2209}
2210
2211static inline int check_user_page_hwpoison(unsigned long addr)
2212{
2213        int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2214
2215        rc = get_user_pages(addr, 1, flags, NULL, NULL);
2216        return rc == -EHWPOISON;
2217}
2218
2219/*
2220 * The fast path to get the writable pfn which will be stored in @pfn,
2221 * true indicates success, otherwise false is returned.  It's also the
2222 * only part that runs if we can in atomic context.
2223 */
2224static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2225                            bool *writable, kvm_pfn_t *pfn)
2226{
2227        struct page *page[1];
2228
2229        /*
2230         * Fast pin a writable pfn only if it is a write fault request
2231         * or the caller allows to map a writable pfn for a read fault
2232         * request.
2233         */
2234        if (!(write_fault || writable))
2235                return false;
2236
2237        if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2238                *pfn = page_to_pfn(page[0]);
2239
2240                if (writable)
2241                        *writable = true;
2242                return true;
2243        }
2244
2245        return false;
2246}
2247
2248/*
2249 * The slow path to get the pfn of the specified host virtual address,
2250 * 1 indicates success, -errno is returned if error is detected.
2251 */
2252static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2253                           bool *writable, kvm_pfn_t *pfn)
2254{
2255        unsigned int flags = FOLL_HWPOISON;
2256        struct page *page;
2257        int npages = 0;
2258
2259        might_sleep();
2260
2261        if (writable)
2262                *writable = write_fault;
2263
2264        if (write_fault)
2265                flags |= FOLL_WRITE;
2266        if (async)
2267                flags |= FOLL_NOWAIT;
2268
2269        npages = get_user_pages_unlocked(addr, 1, &page, flags);
2270        if (npages != 1)
2271                return npages;
2272
2273        /* map read fault as writable if possible */
2274        if (unlikely(!write_fault) && writable) {
2275                struct page *wpage;
2276
2277                if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2278                        *writable = true;
2279                        put_page(page);
2280                        page = wpage;
2281                }
2282        }
2283        *pfn = page_to_pfn(page);
2284        return npages;
2285}
2286
2287static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2288{
2289        if (unlikely(!(vma->vm_flags & VM_READ)))
2290                return false;
2291
2292        if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2293                return false;
2294
2295        return true;
2296}
2297
2298static int kvm_try_get_pfn(kvm_pfn_t pfn)
2299{
2300        if (kvm_is_reserved_pfn(pfn))
2301                return 1;
2302        return get_page_unless_zero(pfn_to_page(pfn));
2303}
2304
2305static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2306                               unsigned long addr, bool *async,
2307                               bool write_fault, bool *writable,
2308                               kvm_pfn_t *p_pfn)
2309{
2310        kvm_pfn_t pfn;
2311        pte_t *ptep;
2312        spinlock_t *ptl;
2313        int r;
2314
2315        r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2316        if (r) {
2317                /*
2318                 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2319                 * not call the fault handler, so do it here.
2320                 */
2321                bool unlocked = false;
2322                r = fixup_user_fault(current->mm, addr,
2323                                     (write_fault ? FAULT_FLAG_WRITE : 0),
2324                                     &unlocked);
2325                if (unlocked)
2326                        return -EAGAIN;
2327                if (r)
2328                        return r;
2329
2330                r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
2331                if (r)
2332                        return r;
2333        }
2334
2335        if (write_fault && !pte_write(*ptep)) {
2336                pfn = KVM_PFN_ERR_RO_FAULT;
2337                goto out;
2338        }
2339
2340        if (writable)
2341                *writable = pte_write(*ptep);
2342        pfn = pte_pfn(*ptep);
2343
2344        /*
2345         * Get a reference here because callers of *hva_to_pfn* and
2346         * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
2347         * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
2348         * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2349         * simply do nothing for reserved pfns.
2350         *
2351         * Whoever called remap_pfn_range is also going to call e.g.
2352         * unmap_mapping_range before the underlying pages are freed,
2353         * causing a call to our MMU notifier.
2354         *
2355         * Certain IO or PFNMAP mappings can be backed with valid
2356         * struct pages, but be allocated without refcounting e.g.,
2357         * tail pages of non-compound higher order allocations, which
2358         * would then underflow the refcount when the caller does the
2359         * required put_page. Don't allow those pages here.
2360         */ 
2361        if (!kvm_try_get_pfn(pfn))
2362                r = -EFAULT;
2363
2364out:
2365        pte_unmap_unlock(ptep, ptl);
2366        *p_pfn = pfn;
2367
2368        return r;
2369}
2370
2371/*
2372 * Pin guest page in memory and return its pfn.
2373 * @addr: host virtual address which maps memory to the guest
2374 * @atomic: whether this function can sleep
2375 * @async: whether this function need to wait IO complete if the
2376 *         host page is not in the memory
2377 * @write_fault: whether we should get a writable host page
2378 * @writable: whether it allows to map a writable host page for !@write_fault
2379 *
2380 * The function will map a writable host page for these two cases:
2381 * 1): @write_fault = true
2382 * 2): @write_fault = false && @writable, @writable will tell the caller
2383 *     whether the mapping is writable.
2384 */
2385static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
2386                        bool write_fault, bool *writable)
2387{
2388        struct vm_area_struct *vma;
2389        kvm_pfn_t pfn = 0;
2390        int npages, r;
2391
2392        /* we can do it either atomically or asynchronously, not both */
2393        BUG_ON(atomic && async);
2394
2395        if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
2396                return pfn;
2397
2398        if (atomic)
2399                return KVM_PFN_ERR_FAULT;
2400
2401        npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
2402        if (npages == 1)
2403                return pfn;
2404
2405        mmap_read_lock(current->mm);
2406        if (npages == -EHWPOISON ||
2407              (!async && check_user_page_hwpoison(addr))) {
2408                pfn = KVM_PFN_ERR_HWPOISON;
2409                goto exit;
2410        }
2411
2412retry:
2413        vma = vma_lookup(current->mm, addr);
2414
2415        if (vma == NULL)
2416                pfn = KVM_PFN_ERR_FAULT;
2417        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
2418                r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
2419                if (r == -EAGAIN)
2420                        goto retry;
2421                if (r < 0)
2422                        pfn = KVM_PFN_ERR_FAULT;
2423        } else {
2424                if (async && vma_is_valid(vma, write_fault))
2425                        *async = true;
2426                pfn = KVM_PFN_ERR_FAULT;
2427        }
2428exit:
2429        mmap_read_unlock(current->mm);
2430        return pfn;
2431}
2432
2433kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2434                               bool atomic, bool *async, bool write_fault,
2435                               bool *writable, hva_t *hva)
2436{
2437        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2438
2439        if (hva)
2440                *hva = addr;
2441
2442        if (addr == KVM_HVA_ERR_RO_BAD) {
2443                if (writable)
2444                        *writable = false;
2445                return KVM_PFN_ERR_RO_FAULT;
2446        }
2447
2448        if (kvm_is_error_hva(addr)) {
2449                if (writable)
2450                        *writable = false;
2451                return KVM_PFN_NOSLOT;
2452        }
2453
2454        /* Do not map writable pfn in the readonly memslot. */
2455        if (writable && memslot_is_readonly(slot)) {
2456                *writable = false;
2457                writable = NULL;
2458        }
2459
2460        return hva_to_pfn(addr, atomic, async, write_fault,
2461                          writable);
2462}
2463EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2464
2465kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2466                      bool *writable)
2467{
2468        return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2469                                    write_fault, writable, NULL);
2470}
2471EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2472
2473kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2474{
2475        return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2476}
2477EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2478
2479kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2480{
2481        return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2482}
2483EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2484
2485kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2486{
2487        return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2488}
2489EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2490
2491kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2492{
2493        return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2494}
2495EXPORT_SYMBOL_GPL(gfn_to_pfn);
2496
2497kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2498{
2499        return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2500}
2501EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2502
2503int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2504                            struct page **pages, int nr_pages)
2505{
2506        unsigned long addr;
2507        gfn_t entry = 0;
2508
2509        addr = gfn_to_hva_many(slot, gfn, &entry);
2510        if (kvm_is_error_hva(addr))
2511                return -1;
2512
2513        if (entry < nr_pages)
2514                return 0;
2515
2516        return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2517}
2518EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2519
2520static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2521{
2522        if (is_error_noslot_pfn(pfn))
2523                return KVM_ERR_PTR_BAD_PAGE;
2524
2525        if (kvm_is_reserved_pfn(pfn)) {
2526                WARN_ON(1);
2527                return KVM_ERR_PTR_BAD_PAGE;
2528        }
2529
2530        return pfn_to_page(pfn);
2531}
2532
2533struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2534{
2535        kvm_pfn_t pfn;
2536
2537        pfn = gfn_to_pfn(kvm, gfn);
2538
2539        return kvm_pfn_to_page(pfn);
2540}
2541EXPORT_SYMBOL_GPL(gfn_to_page);
2542
2543void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
2544{
2545        if (pfn == 0)
2546                return;
2547
2548        if (cache)
2549                cache->pfn = cache->gfn = 0;
2550
2551        if (dirty)
2552                kvm_release_pfn_dirty(pfn);
2553        else
2554                kvm_release_pfn_clean(pfn);
2555}
2556
2557static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
2558                                 struct gfn_to_pfn_cache *cache, u64 gen)
2559{
2560        kvm_release_pfn(cache->pfn, cache->dirty, cache);
2561
2562        cache->pfn = gfn_to_pfn_memslot(slot, gfn);
2563        cache->gfn = gfn;
2564        cache->dirty = false;
2565        cache->generation = gen;
2566}
2567
2568static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
2569                         struct kvm_host_map *map,
2570                         struct gfn_to_pfn_cache *cache,
2571                         bool atomic)
2572{
2573        kvm_pfn_t pfn;
2574        void *hva = NULL;
2575        struct page *page = KVM_UNMAPPED_PAGE;
2576        struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
2577        u64 gen = slots->generation;
2578
2579        if (!map)
2580                return -EINVAL;
2581
2582        if (cache) {
2583                if (!cache->pfn || cache->gfn != gfn ||
2584                        cache->generation != gen) {
2585                        if (atomic)
2586                                return -EAGAIN;
2587                        kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
2588                }
2589                pfn = cache->pfn;
2590        } else {
2591                if (atomic)
2592                        return -EAGAIN;
2593                pfn = gfn_to_pfn_memslot(slot, gfn);
2594        }
2595        if (is_error_noslot_pfn(pfn))
2596                return -EINVAL;
2597
2598        if (pfn_valid(pfn)) {
2599                page = pfn_to_page(pfn);
2600                if (atomic)
2601                        hva = kmap_atomic(page);
2602                else
2603                        hva = kmap(page);
2604#ifdef CONFIG_HAS_IOMEM
2605        } else if (!atomic) {
2606                hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2607        } else {
2608                return -EINVAL;
2609#endif
2610        }
2611
2612        if (!hva)
2613                return -EFAULT;
2614
2615        map->page = page;
2616        map->hva = hva;
2617        map->pfn = pfn;
2618        map->gfn = gfn;
2619
2620        return 0;
2621}
2622
2623int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
2624                struct gfn_to_pfn_cache *cache, bool atomic)
2625{
2626        return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
2627                        cache, atomic);
2628}
2629EXPORT_SYMBOL_GPL(kvm_map_gfn);
2630
2631int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2632{
2633        return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
2634                NULL, false);
2635}
2636EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2637
2638static void __kvm_unmap_gfn(struct kvm *kvm,
2639                        struct kvm_memory_slot *memslot,
2640                        struct kvm_host_map *map,
2641                        struct gfn_to_pfn_cache *cache,
2642                        bool dirty, bool atomic)
2643{
2644        if (!map)
2645                return;
2646
2647        if (!map->hva)
2648                return;
2649
2650        if (map->page != KVM_UNMAPPED_PAGE) {
2651                if (atomic)
2652                        kunmap_atomic(map->hva);
2653                else
2654                        kunmap(map->page);
2655        }
2656#ifdef CONFIG_HAS_IOMEM
2657        else if (!atomic)
2658                memunmap(map->hva);
2659        else
2660                WARN_ONCE(1, "Unexpected unmapping in atomic context");
2661#endif
2662
2663        if (dirty)
2664                mark_page_dirty_in_slot(kvm, memslot, map->gfn);
2665
2666        if (cache)
2667                cache->dirty |= dirty;
2668        else
2669                kvm_release_pfn(map->pfn, dirty, NULL);
2670
2671        map->hva = NULL;
2672        map->page = NULL;
2673}
2674
2675int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 
2676                  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
2677{
2678        __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
2679                        cache, dirty, atomic);
2680        return 0;
2681}
2682EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2683
2684void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2685{
2686        __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
2687                        map, NULL, dirty, false);
2688}
2689EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2690
2691struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2692{
2693        kvm_pfn_t pfn;
2694
2695        pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2696
2697        return kvm_pfn_to_page(pfn);
2698}
2699EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2700
2701void kvm_release_page_clean(struct page *page)
2702{
2703        WARN_ON(is_error_page(page));
2704
2705        kvm_release_pfn_clean(page_to_pfn(page));
2706}
2707EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2708
2709void kvm_release_pfn_clean(kvm_pfn_t pfn)
2710{
2711        if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2712                put_page(pfn_to_page(pfn));
2713}
2714EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2715
2716void kvm_release_page_dirty(struct page *page)
2717{
2718        WARN_ON(is_error_page(page));
2719
2720        kvm_release_pfn_dirty(page_to_pfn(page));
2721}
2722EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2723
2724void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2725{
2726        kvm_set_pfn_dirty(pfn);
2727        kvm_release_pfn_clean(pfn);
2728}
2729EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2730
2731void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2732{
2733        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2734                SetPageDirty(pfn_to_page(pfn));
2735}
2736EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2737
2738void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2739{
2740        if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2741                mark_page_accessed(pfn_to_page(pfn));
2742}
2743EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2744
2745static int next_segment(unsigned long len, int offset)
2746{
2747        if (len > PAGE_SIZE - offset)
2748                return PAGE_SIZE - offset;
2749        else
2750                return len;
2751}
2752
2753static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2754                                 void *data, int offset, int len)
2755{
2756        int r;
2757        unsigned long addr;
2758
2759        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2760        if (kvm_is_error_hva(addr))
2761                return -EFAULT;
2762        r = __copy_from_user(data, (void __user *)addr + offset, len);
2763        if (r)
2764                return -EFAULT;
2765        return 0;
2766}
2767
2768int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2769                        int len)
2770{
2771        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2772
2773        return __kvm_read_guest_page(slot, gfn, data, offset, len);
2774}
2775EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2776
2777int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2778                             int offset, int len)
2779{
2780        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2781
2782        return __kvm_read_guest_page(slot, gfn, data, offset, len);
2783}
2784EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2785
2786int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2787{
2788        gfn_t gfn = gpa >> PAGE_SHIFT;
2789        int seg;
2790        int offset = offset_in_page(gpa);
2791        int ret;
2792
2793        while ((seg = next_segment(len, offset)) != 0) {
2794                ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2795                if (ret < 0)
2796                        return ret;
2797                offset = 0;
2798                len -= seg;
2799                data += seg;
2800                ++gfn;
2801        }
2802        return 0;
2803}
2804EXPORT_SYMBOL_GPL(kvm_read_guest);
2805
2806int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2807{
2808        gfn_t gfn = gpa >> PAGE_SHIFT;
2809        int seg;
2810        int offset = offset_in_page(gpa);
2811        int ret;
2812
2813        while ((seg = next_segment(len, offset)) != 0) {
2814                ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2815                if (ret < 0)
2816                        return ret;
2817                offset = 0;
2818                len -= seg;
2819                data += seg;
2820                ++gfn;
2821        }
2822        return 0;
2823}
2824EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2825
2826static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2827                                   void *data, int offset, unsigned long len)
2828{
2829        int r;
2830        unsigned long addr;
2831
2832        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2833        if (kvm_is_error_hva(addr))
2834                return -EFAULT;
2835        pagefault_disable();
2836        r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2837        pagefault_enable();
2838        if (r)
2839                return -EFAULT;
2840        return 0;
2841}
2842
2843int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2844                               void *data, unsigned long len)
2845{
2846        gfn_t gfn = gpa >> PAGE_SHIFT;
2847        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2848        int offset = offset_in_page(gpa);
2849
2850        return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2851}
2852EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2853
2854static int __kvm_write_guest_page(struct kvm *kvm,
2855                                  struct kvm_memory_slot *memslot, gfn_t gfn,
2856                                  const void *data, int offset, int len)
2857{
2858        int r;
2859        unsigned long addr;
2860
2861        addr = gfn_to_hva_memslot(memslot, gfn);
2862        if (kvm_is_error_hva(addr))
2863                return -EFAULT;
2864        r = __copy_to_user((void __user *)addr + offset, data, len);
2865        if (r)
2866                return -EFAULT;
2867        mark_page_dirty_in_slot(kvm, memslot, gfn);
2868        return 0;
2869}
2870
2871int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2872                         const void *data, int offset, int len)
2873{
2874        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2875
2876        return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2877}
2878EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2879
2880int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2881                              const void *data, int offset, int len)
2882{
2883        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2884
2885        return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2886}
2887EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2888
2889int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2890                    unsigned long len)
2891{
2892        gfn_t gfn = gpa >> PAGE_SHIFT;
2893        int seg;
2894        int offset = offset_in_page(gpa);
2895        int ret;
2896
2897        while ((seg = next_segment(len, offset)) != 0) {
2898                ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2899                if (ret < 0)
2900                        return ret;
2901                offset = 0;
2902                len -= seg;
2903                data += seg;
2904                ++gfn;
2905        }
2906        return 0;
2907}
2908EXPORT_SYMBOL_GPL(kvm_write_guest);
2909
2910int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2911                         unsigned long len)
2912{
2913        gfn_t gfn = gpa >> PAGE_SHIFT;
2914        int seg;
2915        int offset = offset_in_page(gpa);
2916        int ret;
2917
2918        while ((seg = next_segment(len, offset)) != 0) {
2919                ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2920                if (ret < 0)
2921                        return ret;
2922                offset = 0;
2923                len -= seg;
2924                data += seg;
2925                ++gfn;
2926        }
2927        return 0;
2928}
2929EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2930
2931static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2932                                       struct gfn_to_hva_cache *ghc,
2933                                       gpa_t gpa, unsigned long len)
2934{
2935        int offset = offset_in_page(gpa);
2936        gfn_t start_gfn = gpa >> PAGE_SHIFT;
2937        gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2938        gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2939        gfn_t nr_pages_avail;
2940
2941        /* Update ghc->generation before performing any error checks. */
2942        ghc->generation = slots->generation;
2943
2944        if (start_gfn > end_gfn) {
2945                ghc->hva = KVM_HVA_ERR_BAD;
2946                return -EINVAL;
2947        }
2948
2949        /*
2950         * If the requested region crosses two memslots, we still
2951         * verify that the entire region is valid here.
2952         */
2953        for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2954                ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2955                ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2956                                           &nr_pages_avail);
2957                if (kvm_is_error_hva(ghc->hva))
2958                        return -EFAULT;
2959        }
2960
2961        /* Use the slow path for cross page reads and writes. */
2962        if (nr_pages_needed == 1)
2963                ghc->hva += offset;
2964        else
2965                ghc->memslot = NULL;
2966
2967        ghc->gpa = gpa;
2968        ghc->len = len;
2969        return 0;
2970}
2971
2972int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2973                              gpa_t gpa, unsigned long len)
2974{
2975        struct kvm_memslots *slots = kvm_memslots(kvm);
2976        return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2977}
2978EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2979
2980int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2981                                  void *data, unsigned int offset,
2982                                  unsigned long len)
2983{
2984        struct kvm_memslots *slots = kvm_memslots(kvm);
2985        int r;
2986        gpa_t gpa = ghc->gpa + offset;
2987
2988        BUG_ON(len + offset > ghc->len);
2989
2990        if (slots->generation != ghc->generation) {
2991                if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
2992                        return -EFAULT;
2993        }
2994
2995        if (kvm_is_error_hva(ghc->hva))
2996                return -EFAULT;
2997
2998        if (unlikely(!ghc->memslot))
2999                return kvm_write_guest(kvm, gpa, data, len);
3000
3001        r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3002        if (r)
3003                return -EFAULT;
3004        mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3005
3006        return 0;
3007}
3008EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3009
3010int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3011                           void *data, unsigned long len)
3012{
3013        return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
3014}
3015EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3016
3017int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3018                                 void *data, unsigned int offset,
3019                                 unsigned long len)
3020{
3021        struct kvm_memslots *slots = kvm_memslots(kvm);
3022        int r;
3023        gpa_t gpa = ghc->gpa + offset;
3024
3025        BUG_ON(len + offset > ghc->len);
3026
3027        if (slots->generation != ghc->generation) {
3028                if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3029                        return -EFAULT;
3030        }
3031
3032        if (kvm_is_error_hva(ghc->hva))
3033                return -EFAULT;
3034
3035        if (unlikely(!ghc->memslot))
3036                return kvm_read_guest(kvm, gpa, data, len);
3037
3038        r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3039        if (r)
3040                return -EFAULT;
3041
3042        return 0;
3043}
3044EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3045
3046int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3047                          void *data, unsigned long len)
3048{
3049        return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
3050}
3051EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3052
3053int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3054{
3055        const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3056        gfn_t gfn = gpa >> PAGE_SHIFT;
3057        int seg;
3058        int offset = offset_in_page(gpa);
3059        int ret;
3060
3061        while ((seg = next_segment(len, offset)) != 0) {
3062                ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
3063                if (ret < 0)
3064                        return ret;
3065                offset = 0;
3066                len -= seg;
3067                ++gfn;
3068        }
3069        return 0;
3070}
3071EXPORT_SYMBOL_GPL(kvm_clear_guest);
3072
3073void mark_page_dirty_in_slot(struct kvm *kvm,
3074                             struct kvm_memory_slot *memslot,
3075                             gfn_t gfn)
3076{
3077        if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3078                unsigned long rel_gfn = gfn - memslot->base_gfn;
3079                u32 slot = (memslot->as_id << 16) | memslot->id;
3080
3081                if (kvm->dirty_ring_size)
3082                        kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
3083                                            slot, rel_gfn);
3084                else
3085                        set_bit_le(rel_gfn, memslot->dirty_bitmap);
3086        }
3087}
3088EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3089
3090void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3091{
3092        struct kvm_memory_slot *memslot;
3093
3094        memslot = gfn_to_memslot(kvm, gfn);
3095        mark_page_dirty_in_slot(kvm, memslot, gfn);
3096}
3097EXPORT_SYMBOL_GPL(mark_page_dirty);
3098
3099void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3100{
3101        struct kvm_memory_slot *memslot;
3102
3103        memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3104        mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3105}
3106EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3107
3108void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3109{
3110        if (!vcpu->sigset_active)
3111                return;
3112
3113        /*
3114         * This does a lockless modification of ->real_blocked, which is fine
3115         * because, only current can change ->real_blocked and all readers of
3116         * ->real_blocked don't care as long ->real_blocked is always a subset
3117         * of ->blocked.
3118         */
3119        sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3120}
3121
3122void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3123{
3124        if (!vcpu->sigset_active)
3125                return;
3126
3127        sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3128        sigemptyset(&current->real_blocked);
3129}
3130
3131static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3132{
3133        unsigned int old, val, grow, grow_start;
3134
3135        old = val = vcpu->halt_poll_ns;
3136        grow_start = READ_ONCE(halt_poll_ns_grow_start);
3137        grow = READ_ONCE(halt_poll_ns_grow);
3138        if (!grow)
3139                goto out;
3140
3141        val *= grow;
3142        if (val < grow_start)
3143                val = grow_start;
3144
3145        if (val > vcpu->kvm->max_halt_poll_ns)
3146                val = vcpu->kvm->max_halt_poll_ns;
3147
3148        vcpu->halt_poll_ns = val;
3149out:
3150        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3151}
3152
3153static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3154{
3155        unsigned int old, val, shrink, grow_start;
3156
3157        old = val = vcpu->halt_poll_ns;
3158        shrink = READ_ONCE(halt_poll_ns_shrink);
3159        grow_start = READ_ONCE(halt_poll_ns_grow_start);
3160        if (shrink == 0)
3161                val = 0;
3162        else
3163                val /= shrink;
3164
3165        if (val < grow_start)
3166                val = 0;
3167
3168        vcpu->halt_poll_ns = val;
3169        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3170}
3171
3172static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3173{
3174        int ret = -EINTR;
3175        int idx = srcu_read_lock(&vcpu->kvm->srcu);
3176
3177        if (kvm_arch_vcpu_runnable(vcpu)) {
3178                kvm_make_request(KVM_REQ_UNHALT, vcpu);
3179                goto out;
3180        }
3181        if (kvm_cpu_has_pending_timer(vcpu))
3182                goto out;
3183        if (signal_pending(current))
3184                goto out;
3185        if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3186                goto out;
3187
3188        ret = 0;
3189out:
3190        srcu_read_unlock(&vcpu->kvm->srcu, idx);
3191        return ret;
3192}
3193
3194static inline void
3195update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3196{
3197        if (waited)
3198                vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
3199        else
3200                vcpu->stat.generic.halt_poll_success_ns += poll_ns;
3201}
3202
3203/*
3204 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
3205 */
3206void kvm_vcpu_block(struct kvm_vcpu *vcpu)
3207{
3208        ktime_t start, cur, poll_end;
3209        bool waited = false;
3210        u64 block_ns;
3211
3212        kvm_arch_vcpu_blocking(vcpu);
3213
3214        start = cur = poll_end = ktime_get();
3215        if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
3216                ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
3217
3218                ++vcpu->stat.generic.halt_attempted_poll;
3219                do {
3220                        /*
3221                         * This sets KVM_REQ_UNHALT if an interrupt
3222                         * arrives.
3223                         */
3224                        if (kvm_vcpu_check_block(vcpu) < 0) {
3225                                ++vcpu->stat.generic.halt_successful_poll;
3226                                if (!vcpu_valid_wakeup(vcpu))
3227                                        ++vcpu->stat.generic.halt_poll_invalid;
3228
3229                                KVM_STATS_LOG_HIST_UPDATE(
3230                                      vcpu->stat.generic.halt_poll_success_hist,
3231                                      ktime_to_ns(ktime_get()) -
3232                                      ktime_to_ns(start));
3233                                goto out;
3234                        }
3235                        cpu_relax();
3236                        poll_end = cur = ktime_get();
3237                } while (kvm_vcpu_can_poll(cur, stop));
3238
3239                KVM_STATS_LOG_HIST_UPDATE(
3240                                vcpu->stat.generic.halt_poll_fail_hist,
3241                                ktime_to_ns(ktime_get()) - ktime_to_ns(start));
3242        }
3243
3244
3245        prepare_to_rcuwait(&vcpu->wait);
3246        for (;;) {
3247                set_current_state(TASK_INTERRUPTIBLE);
3248
3249                if (kvm_vcpu_check_block(vcpu) < 0)
3250                        break;
3251
3252                waited = true;
3253                schedule();
3254        }
3255        finish_rcuwait(&vcpu->wait);
3256        cur = ktime_get();
3257        if (waited) {
3258                vcpu->stat.generic.halt_wait_ns +=
3259                        ktime_to_ns(cur) - ktime_to_ns(poll_end);
3260                KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3261                                ktime_to_ns(cur) - ktime_to_ns(poll_end));
3262        }
3263out:
3264        kvm_arch_vcpu_unblocking(vcpu);
3265        block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3266
3267        update_halt_poll_stats(
3268                vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3269
3270        if (!kvm_arch_no_poll(vcpu)) {
3271                if (!vcpu_valid_wakeup(vcpu)) {
3272                        shrink_halt_poll_ns(vcpu);
3273                } else if (vcpu->kvm->max_halt_poll_ns) {
3274                        if (block_ns <= vcpu->halt_poll_ns)
3275                                ;
3276                        /* we had a long block, shrink polling */
3277                        else if (vcpu->halt_poll_ns &&
3278                                        block_ns > vcpu->kvm->max_halt_poll_ns)
3279                                shrink_halt_poll_ns(vcpu);
3280                        /* we had a short halt and our poll time is too small */
3281                        else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3282                                        block_ns < vcpu->kvm->max_halt_poll_ns)
3283                                grow_halt_poll_ns(vcpu);
3284                } else {
3285                        vcpu->halt_poll_ns = 0;
3286                }
3287        }
3288
3289        trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3290        kvm_arch_vcpu_block_finish(vcpu);
3291}
3292EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3293
3294bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3295{
3296        struct rcuwait *waitp;
3297
3298        waitp = kvm_arch_vcpu_get_wait(vcpu);
3299        if (rcuwait_wake_up(waitp)) {
3300                WRITE_ONCE(vcpu->ready, true);
3301                ++vcpu->stat.generic.halt_wakeup;
3302                return true;
3303        }
3304
3305        return false;
3306}
3307EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3308
3309#ifndef CONFIG_S390
3310/*
3311 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3312 */
3313void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3314{
3315        int me, cpu;
3316
3317        if (kvm_vcpu_wake_up(vcpu))
3318                return;
3319
3320        /*
3321         * Note, the vCPU could get migrated to a different pCPU at any point
3322         * after kvm_arch_vcpu_should_kick(), which could result in sending an
3323         * IPI to the previous pCPU.  But, that's ok because the purpose of the
3324         * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3325         * vCPU also requires it to leave IN_GUEST_MODE.
3326         */
3327        me = get_cpu();
3328        if (kvm_arch_vcpu_should_kick(vcpu)) {
3329                cpu = READ_ONCE(vcpu->cpu);
3330                if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3331                        smp_send_reschedule(cpu);
3332        }
3333        put_cpu();
3334}
3335EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3336#endif /* !CONFIG_S390 */
3337
3338int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3339{
3340        struct pid *pid;
3341        struct task_struct *task = NULL;
3342        int ret = 0;
3343
3344        rcu_read_lock();
3345        pid = rcu_dereference(target->pid);
3346        if (pid)
3347                task = get_pid_task(pid, PIDTYPE_PID);
3348        rcu_read_unlock();
3349        if (!task)
3350                return ret;
3351        ret = yield_to(task, 1);
3352        put_task_struct(task);
3353
3354        return ret;
3355}
3356EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3357
3358/*
3359 * Helper that checks whether a VCPU is eligible for directed yield.
3360 * Most eligible candidate to yield is decided by following heuristics:
3361 *
3362 *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3363 *  (preempted lock holder), indicated by @in_spin_loop.
3364 *  Set at the beginning and cleared at the end of interception/PLE handler.
3365 *
3366 *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3367 *  chance last time (mostly it has become eligible now since we have probably
3368 *  yielded to lockholder in last iteration. This is done by toggling
3369 *  @dy_eligible each time a VCPU checked for eligibility.)
3370 *
3371 *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3372 *  to preempted lock-holder could result in wrong VCPU selection and CPU
3373 *  burning. Giving priority for a potential lock-holder increases lock
3374 *  progress.
3375 *
3376 *  Since algorithm is based on heuristics, accessing another VCPU data without
3377 *  locking does not harm. It may result in trying to yield to  same VCPU, fail
3378 *  and continue with next VCPU and so on.
3379 */
3380static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3381{
3382#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3383        bool eligible;
3384
3385        eligible = !vcpu->spin_loop.in_spin_loop ||
3386                    vcpu->spin_loop.dy_eligible;
3387
3388        if (vcpu->spin_loop.in_spin_loop)
3389                kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3390
3391        return eligible;
3392#else
3393        return true;
3394#endif
3395}
3396
3397/*
3398 * Unlike kvm_arch_vcpu_runnable, this function is called outside
3399 * a vcpu_load/vcpu_put pair.  However, for most architectures
3400 * kvm_arch_vcpu_runnable does not require vcpu_load.
3401 */
3402bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3403{
3404        return kvm_arch_vcpu_runnable(vcpu);
3405}
3406
3407static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3408{
3409        if (kvm_arch_dy_runnable(vcpu))
3410                return true;
3411
3412#ifdef CONFIG_KVM_ASYNC_PF
3413        if (!list_empty_careful(&vcpu->async_pf.done))
3414                return true;
3415#endif
3416
3417        return false;
3418}
3419
3420bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3421{
3422        return false;
3423}
3424
3425void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3426{
3427        struct kvm *kvm = me->kvm;
3428        struct kvm_vcpu *vcpu;
3429        int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3430        int yielded = 0;
3431        int try = 3;
3432        int pass;
3433        int i;
3434
3435        kvm_vcpu_set_in_spin_loop(me, true);
3436        /*
3437         * We boost the priority of a VCPU that is runnable but not
3438         * currently running, because it got preempted by something
3439         * else and called schedule in __vcpu_run.  Hopefully that
3440         * VCPU is holding the lock that we need and will release it.
3441         * We approximate round-robin by starting at the last boosted VCPU.
3442         */
3443        for (pass = 0; pass < 2 && !yielded && try; pass++) {
3444                kvm_for_each_vcpu(i, vcpu, kvm) {
3445                        if (!pass && i <= last_boosted_vcpu) {
3446                                i = last_boosted_vcpu;
3447                                continue;
3448                        } else if (pass && i > last_boosted_vcpu)
3449                                break;
3450                        if (!READ_ONCE(vcpu->ready))
3451                                continue;
3452                        if (vcpu == me)
3453                                continue;
3454                        if (rcuwait_active(&vcpu->wait) &&
3455                            !vcpu_dy_runnable(vcpu))
3456                                continue;
3457                        if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3458                            !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3459                            !kvm_arch_vcpu_in_kernel(vcpu))
3460                                continue;
3461                        if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3462                                continue;
3463
3464                        yielded = kvm_vcpu_yield_to(vcpu);
3465                        if (yielded > 0) {
3466                                kvm->last_boosted_vcpu = i;
3467                                break;
3468                        } else if (yielded < 0) {
3469                                try--;
3470                                if (!try)
3471                                        break;
3472                        }
3473                }
3474        }
3475        kvm_vcpu_set_in_spin_loop(me, false);
3476
3477        /* Ensure vcpu is not eligible during next spinloop */
3478        kvm_vcpu_set_dy_eligible(me, false);
3479}
3480EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3481
3482static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3483{
3484#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3485        return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3486            (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3487             kvm->dirty_ring_size / PAGE_SIZE);
3488#else
3489        return false;
3490#endif
3491}
3492
3493static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3494{
3495        struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3496        struct page *page;
3497
3498        if (vmf->pgoff == 0)
3499                page = virt_to_page(vcpu->run);
3500#ifdef CONFIG_X86
3501        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3502                page = virt_to_page(vcpu->arch.pio_data);
3503#endif
3504#ifdef CONFIG_KVM_MMIO
3505        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3506                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3507#endif
3508        else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3509                page = kvm_dirty_ring_get_page(
3510                    &vcpu->dirty_ring,
3511                    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3512        else
3513                return kvm_arch_vcpu_fault(vcpu, vmf);
3514        get_page(page);
3515        vmf->page = page;
3516        return 0;
3517}
3518
3519static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3520        .fault = kvm_vcpu_fault,
3521};
3522
3523static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3524{
3525        struct kvm_vcpu *vcpu = file->private_data;
3526        unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3527
3528        if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3529             kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3530            ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3531                return -EINVAL;
3532
3533        vma->vm_ops = &kvm_vcpu_vm_ops;
3534        return 0;
3535}
3536
3537static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3538{
3539        struct kvm_vcpu *vcpu = filp->private_data;
3540
3541        kvm_put_kvm(vcpu->kvm);
3542        return 0;
3543}
3544
3545static struct file_operations kvm_vcpu_fops = {
3546        .release        = kvm_vcpu_release,
3547        .unlocked_ioctl = kvm_vcpu_ioctl,
3548        .mmap           = kvm_vcpu_mmap,
3549        .llseek         = noop_llseek,
3550        KVM_COMPAT(kvm_vcpu_compat_ioctl),
3551};
3552
3553/*
3554 * Allocates an inode for the vcpu.
3555 */
3556static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3557{
3558        char name[8 + 1 + ITOA_MAX_LEN + 1];
3559
3560        snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3561        return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3562}
3563
3564static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3565{
3566#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3567        struct dentry *debugfs_dentry;
3568        char dir_name[ITOA_MAX_LEN * 2];
3569
3570        if (!debugfs_initialized())
3571                return;
3572
3573        snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
3574        debugfs_dentry = debugfs_create_dir(dir_name,
3575                                            vcpu->kvm->debugfs_dentry);
3576
3577        kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3578#endif
3579}
3580
3581/*
3582 * Creates some virtual cpus.  Good luck creating more than one.
3583 */
3584static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3585{
3586        int r;
3587        struct kvm_vcpu *vcpu;
3588        struct page *page;
3589
3590        if (id >= KVM_MAX_VCPU_ID)
3591                return -EINVAL;
3592
3593        mutex_lock(&kvm->lock);
3594        if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3595                mutex_unlock(&kvm->lock);
3596                return -EINVAL;
3597        }
3598
3599        kvm->created_vcpus++;
3600        mutex_unlock(&kvm->lock);
3601
3602        r = kvm_arch_vcpu_precreate(kvm, id);
3603        if (r)
3604                goto vcpu_decrement;
3605
3606        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3607        if (!vcpu) {
3608                r = -ENOMEM;
3609                goto vcpu_decrement;
3610        }
3611
3612        BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3613        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3614        if (!page) {
3615                r = -ENOMEM;
3616                goto vcpu_free;
3617        }
3618        vcpu->run = page_address(page);
3619
3620        kvm_vcpu_init(vcpu, kvm, id);
3621
3622        r = kvm_arch_vcpu_create(vcpu);
3623        if (r)
3624                goto vcpu_free_run_page;
3625
3626        if (kvm->dirty_ring_size) {
3627                r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3628                                         id, kvm->dirty_ring_size);
3629                if (r)
3630                        goto arch_vcpu_destroy;
3631        }
3632
3633        mutex_lock(&kvm->lock);
3634        if (kvm_get_vcpu_by_id(kvm, id)) {
3635                r = -EEXIST;
3636                goto unlock_vcpu_destroy;
3637        }
3638
3639        vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3640        BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3641
3642        /* Fill the stats id string for the vcpu */
3643        snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3644                 task_pid_nr(current), id);
3645
3646        /* Now it's all set up, let userspace reach it */
3647        kvm_get_kvm(kvm);
3648        r = create_vcpu_fd(vcpu);
3649        if (r < 0) {
3650                kvm_put_kvm_no_destroy(kvm);
3651                goto unlock_vcpu_destroy;
3652        }
3653
3654        kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3655
3656        /*
3657         * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
3658         * before kvm->online_vcpu's incremented value.
3659         */
3660        smp_wmb();
3661        atomic_inc(&kvm->online_vcpus);
3662
3663        mutex_unlock(&kvm->lock);
3664        kvm_arch_vcpu_postcreate(vcpu);
3665        kvm_create_vcpu_debugfs(vcpu);
3666        return r;
3667
3668unlock_vcpu_destroy:
3669        mutex_unlock(&kvm->lock);
3670        kvm_dirty_ring_free(&vcpu->dirty_ring);
3671arch_vcpu_destroy:
3672        kvm_arch_vcpu_destroy(vcpu);
3673vcpu_free_run_page:
3674        free_page((unsigned long)vcpu->run);
3675vcpu_free:
3676        kmem_cache_free(kvm_vcpu_cache, vcpu);
3677vcpu_decrement:
3678        mutex_lock(&kvm->lock);
3679        kvm->created_vcpus--;
3680        mutex_unlock(&kvm->lock);
3681        return r;
3682}
3683
3684static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3685{
3686        if (sigset) {
3687                sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3688                vcpu->sigset_active = 1;
3689                vcpu->sigset = *sigset;
3690        } else
3691                vcpu->sigset_active = 0;
3692        return 0;
3693}
3694
3695static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3696                              size_t size, loff_t *offset)
3697{
3698        struct kvm_vcpu *vcpu = file->private_data;
3699
3700        return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3701                        &kvm_vcpu_stats_desc[0], &vcpu->stat,
3702                        sizeof(vcpu->stat), user_buffer, size, offset);
3703}
3704
3705static const struct file_operations kvm_vcpu_stats_fops = {
3706        .read = kvm_vcpu_stats_read,
3707        .llseek = noop_llseek,
3708};
3709
3710static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3711{
3712        int fd;
3713        struct file *file;
3714        char name[15 + ITOA_MAX_LEN + 1];
3715
3716        snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3717
3718        fd = get_unused_fd_flags(O_CLOEXEC);
3719        if (fd < 0)
3720                return fd;
3721
3722        file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3723        if (IS_ERR(file)) {
3724                put_unused_fd(fd);
3725                return PTR_ERR(file);
3726        }
3727        file->f_mode |= FMODE_PREAD;
3728        fd_install(fd, file);
3729
3730        return fd;
3731}
3732
3733static long kvm_vcpu_ioctl(struct file *filp,
3734                           unsigned int ioctl, unsigned long arg)
3735{
3736        struct kvm_vcpu *vcpu = filp->private_data;
3737        void __user *argp = (void __user *)arg;
3738        int r;
3739        struct kvm_fpu *fpu = NULL;
3740        struct kvm_sregs *kvm_sregs = NULL;
3741
3742        if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3743                return -EIO;
3744
3745        if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3746                return -EINVAL;
3747
3748        /*
3749         * Some architectures have vcpu ioctls that are asynchronous to vcpu
3750         * execution; mutex_lock() would break them.
3751         */
3752        r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3753        if (r != -ENOIOCTLCMD)
3754                return r;
3755
3756        if (mutex_lock_killable(&vcpu->mutex))
3757                return -EINTR;
3758        switch (ioctl) {
3759        case KVM_RUN: {
3760                struct pid *oldpid;
3761                r = -EINVAL;
3762                if (arg)
3763                        goto out;
3764                oldpid = rcu_access_pointer(vcpu->pid);
3765                if (unlikely(oldpid != task_pid(current))) {
3766                        /* The thread running this VCPU changed. */
3767                        struct pid *newpid;
3768
3769                        r = kvm_arch_vcpu_run_pid_change(vcpu);
3770                        if (r)
3771                                break;
3772
3773                        newpid = get_task_pid(current, PIDTYPE_PID);
3774                        rcu_assign_pointer(vcpu->pid, newpid);
3775                        if (oldpid)
3776                                synchronize_rcu();
3777                        put_pid(oldpid);
3778                }
3779                r = kvm_arch_vcpu_ioctl_run(vcpu);
3780                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3781                break;
3782        }
3783        case KVM_GET_REGS: {
3784                struct kvm_regs *kvm_regs;
3785
3786                r = -ENOMEM;
3787                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3788                if (!kvm_regs)
3789                        goto out;
3790                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3791                if (r)
3792                        goto out_free1;
3793                r = -EFAULT;
3794                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3795                        goto out_free1;
3796                r = 0;
3797out_free1:
3798                kfree(kvm_regs);
3799                break;
3800        }
3801        case KVM_SET_REGS: {
3802                struct kvm_regs *kvm_regs;
3803
3804                kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3805                if (IS_ERR(kvm_regs)) {
3806                        r = PTR_ERR(kvm_regs);
3807                        goto out;
3808                }
3809                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3810                kfree(kvm_regs);
3811                break;
3812        }
3813        case KVM_GET_SREGS: {
3814                kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3815                                    GFP_KERNEL_ACCOUNT);
3816                r = -ENOMEM;
3817                if (!kvm_sregs)
3818                        goto out;
3819                r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3820                if (r)
3821                        goto out;
3822                r = -EFAULT;
3823                if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3824                        goto out;
3825                r = 0;
3826                break;
3827        }
3828        case KVM_SET_SREGS: {
3829                kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3830                if (IS_ERR(kvm_sregs)) {
3831                        r = PTR_ERR(kvm_sregs);
3832                        kvm_sregs = NULL;
3833                        goto out;
3834                }
3835                r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3836                break;
3837        }
3838        case KVM_GET_MP_STATE: {
3839                struct kvm_mp_state mp_state;
3840
3841                r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3842                if (r)
3843                        goto out;
3844                r = -EFAULT;
3845                if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3846                        goto out;
3847                r = 0;
3848                break;
3849        }
3850        case KVM_SET_MP_STATE: {
3851                struct kvm_mp_state mp_state;
3852
3853                r = -EFAULT;
3854                if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3855                        goto out;
3856                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3857                break;
3858        }
3859        case KVM_TRANSLATE: {
3860                struct kvm_translation tr;
3861
3862                r = -EFAULT;
3863                if (copy_from_user(&tr, argp, sizeof(tr)))
3864                        goto out;
3865                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3866                if (r)
3867                        goto out;
3868                r = -EFAULT;
3869                if (copy_to_user(argp, &tr, sizeof(tr)))
3870                        goto out;
3871                r = 0;
3872                break;
3873        }
3874        case KVM_SET_GUEST_DEBUG: {
3875                struct kvm_guest_debug dbg;
3876
3877                r = -EFAULT;
3878                if (copy_from_user(&dbg, argp, sizeof(dbg)))
3879                        goto out;
3880                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3881                break;
3882        }
3883        case KVM_SET_SIGNAL_MASK: {
3884                struct kvm_signal_mask __user *sigmask_arg = argp;
3885                struct kvm_signal_mask kvm_sigmask;
3886                sigset_t sigset, *p;
3887
3888                p = NULL;
3889                if (argp) {
3890                        r = -EFAULT;
3891                        if (copy_from_user(&kvm_sigmask, argp,
3892                                           sizeof(kvm_sigmask)))
3893                                goto out;
3894                        r = -EINVAL;
3895                        if (kvm_sigmask.len != sizeof(sigset))
3896                                goto out;
3897                        r = -EFAULT;
3898                        if (copy_from_user(&sigset, sigmask_arg->sigset,
3899                                           sizeof(sigset)))
3900                                goto out;
3901                        p = &sigset;
3902                }
3903                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3904                break;
3905        }
3906        case KVM_GET_FPU: {
3907                fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3908                r = -ENOMEM;
3909                if (!fpu)
3910                        goto out;
3911                r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3912                if (r)
3913                        goto out;
3914                r = -EFAULT;
3915                if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3916                        goto out;
3917                r = 0;
3918                break;
3919        }
3920        case KVM_SET_FPU: {
3921                fpu = memdup_user(argp, sizeof(*fpu));
3922                if (IS_ERR(fpu)) {
3923                        r = PTR_ERR(fpu);
3924                        fpu = NULL;
3925                        goto out;
3926                }
3927                r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3928                break;
3929        }
3930        case KVM_GET_STATS_FD: {
3931                r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3932                break;
3933        }
3934        default:
3935                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3936        }
3937out:
3938        mutex_unlock(&vcpu->mutex);
3939        kfree(fpu);
3940        kfree(kvm_sregs);
3941        return r;
3942}
3943
3944#ifdef CONFIG_KVM_COMPAT
3945static long kvm_vcpu_compat_ioctl(struct file *filp,
3946                                  unsigned int ioctl, unsigned long arg)
3947{
3948        struct kvm_vcpu *vcpu = filp->private_data;
3949        void __user *argp = compat_ptr(arg);
3950        int r;
3951
3952        if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3953                return -EIO;
3954
3955        switch (ioctl) {
3956        case KVM_SET_SIGNAL_MASK: {
3957                struct kvm_signal_mask __user *sigmask_arg = argp;
3958                struct kvm_signal_mask kvm_sigmask;
3959                sigset_t sigset;
3960
3961                if (argp) {
3962                        r = -EFAULT;
3963                        if (copy_from_user(&kvm_sigmask, argp,
3964                                           sizeof(kvm_sigmask)))
3965                                goto out;
3966                        r = -EINVAL;
3967                        if (kvm_sigmask.len != sizeof(compat_sigset_t))
3968                                goto out;
3969                        r = -EFAULT;
3970                        if (get_compat_sigset(&sigset,
3971                                              (compat_sigset_t __user *)sigmask_arg->sigset))
3972                                goto out;
3973                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3974                } else
3975                        r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3976                break;
3977        }
3978        default:
3979                r = kvm_vcpu_ioctl(filp, ioctl, arg);
3980        }
3981
3982out:
3983        return r;
3984}
3985#endif
3986
3987static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
3988{
3989        struct kvm_device *dev = filp->private_data;
3990
3991        if (dev->ops->mmap)
3992                return dev->ops->mmap(dev, vma);
3993
3994        return -ENODEV;
3995}
3996
3997static int kvm_device_ioctl_attr(struct kvm_device *dev,
3998                                 int (*accessor)(struct kvm_device *dev,
3999                                                 struct kvm_device_attr *attr),
4000                                 unsigned long arg)
4001{
4002        struct kvm_device_attr attr;
4003
4004        if (!accessor)
4005                return -EPERM;
4006
4007        if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4008                return -EFAULT;
4009
4010        return accessor(dev, &attr);
4011}
4012
4013static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4014                             unsigned long arg)
4015{
4016        struct kvm_device *dev = filp->private_data;
4017
4018        if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
4019                return -EIO;
4020
4021        switch (ioctl) {
4022        case KVM_SET_DEVICE_ATTR:
4023                return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4024        case KVM_GET_DEVICE_ATTR:
4025                return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4026        case KVM_HAS_DEVICE_ATTR:
4027                return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4028        default:
4029                if (dev->ops->ioctl)
4030                        return dev->ops->ioctl(dev, ioctl, arg);
4031
4032                return -ENOTTY;
4033        }
4034}
4035
4036static int kvm_device_release(struct inode *inode, struct file *filp)
4037{
4038        struct kvm_device *dev = filp->private_data;
4039        struct kvm *kvm = dev->kvm;
4040
4041        if (dev->ops->release) {
4042                mutex_lock(&kvm->lock);
4043                list_del(&dev->vm_node);
4044                dev->ops->release(dev);
4045                mutex_unlock(&kvm->lock);
4046        }
4047
4048        kvm_put_kvm(kvm);
4049        return 0;
4050}
4051
4052static const struct file_operations kvm_device_fops = {
4053        .unlocked_ioctl = kvm_device_ioctl,
4054        .release = kvm_device_release,
4055        KVM_COMPAT(kvm_device_ioctl),
4056        .mmap = kvm_device_mmap,
4057};
4058
4059struct kvm_device *kvm_device_from_filp(struct file *filp)
4060{
4061        if (filp->f_op != &kvm_device_fops)
4062                return NULL;
4063
4064        return filp->private_data;
4065}
4066
4067static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4068#ifdef CONFIG_KVM_MPIC
4069        [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
4070        [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
4071#endif
4072};
4073
4074int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4075{
4076        if (type >= ARRAY_SIZE(kvm_device_ops_table))
4077                return -ENOSPC;
4078
4079        if (kvm_device_ops_table[type] != NULL)
4080                return -EEXIST;
4081
4082        kvm_device_ops_table[type] = ops;
4083        return 0;
4084}
4085
4086void kvm_unregister_device_ops(u32 type)
4087{
4088        if (kvm_device_ops_table[type] != NULL)
4089                kvm_device_ops_table[type] = NULL;
4090}
4091
4092static int kvm_ioctl_create_device(struct kvm *kvm,
4093                                   struct kvm_create_device *cd)
4094{
4095        const struct kvm_device_ops *ops = NULL;
4096        struct kvm_device *dev;
4097        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4098        int type;
4099        int ret;
4100
4101        if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4102                return -ENODEV;
4103
4104        type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4105        ops = kvm_device_ops_table[type];
4106        if (ops == NULL)
4107                return -ENODEV;
4108
4109        if (test)
4110                return 0;
4111
4112        dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4113        if (!dev)
4114                return -ENOMEM;
4115
4116        dev->ops = ops;
4117        dev->kvm = kvm;
4118
4119        mutex_lock(&kvm->lock);
4120        ret = ops->create(dev, type);
4121        if (ret < 0) {
4122                mutex_unlock(&kvm->lock);
4123                kfree(dev);
4124                return ret;
4125        }
4126        list_add(&dev->vm_node, &kvm->devices);
4127        mutex_unlock(&kvm->lock);
4128
4129        if (ops->init)
4130                ops->init(dev);
4131
4132        kvm_get_kvm(kvm);
4133        ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4134        if (ret < 0) {
4135                kvm_put_kvm_no_destroy(kvm);
4136                mutex_lock(&kvm->lock);
4137                list_del(&dev->vm_node);
4138                mutex_unlock(&kvm->lock);
4139                ops->destroy(dev);
4140                return ret;
4141        }
4142
4143        cd->fd = ret;
4144        return 0;
4145}
4146
4147static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4148{
4149        switch (arg) {
4150        case KVM_CAP_USER_MEMORY:
4151        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4152        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4153        case KVM_CAP_INTERNAL_ERROR_DATA:
4154#ifdef CONFIG_HAVE_KVM_MSI
4155        case KVM_CAP_SIGNAL_MSI:
4156#endif
4157#ifdef CONFIG_HAVE_KVM_IRQFD
4158        case KVM_CAP_IRQFD:
4159        case KVM_CAP_IRQFD_RESAMPLE:
4160#endif
4161        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4162        case KVM_CAP_CHECK_EXTENSION_VM:
4163        case KVM_CAP_ENABLE_CAP_VM:
4164        case KVM_CAP_HALT_POLL:
4165                return 1;
4166#ifdef CONFIG_KVM_MMIO
4167        case KVM_CAP_COALESCED_MMIO:
4168                return KVM_COALESCED_MMIO_PAGE_OFFSET;
4169        case KVM_CAP_COALESCED_PIO:
4170                return 1;
4171#endif
4172#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4173        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4174                return KVM_DIRTY_LOG_MANUAL_CAPS;
4175#endif
4176#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4177        case KVM_CAP_IRQ_ROUTING:
4178                return KVM_MAX_IRQ_ROUTES;
4179#endif
4180#if KVM_ADDRESS_SPACE_NUM > 1
4181        case KVM_CAP_MULTI_ADDRESS_SPACE:
4182                return KVM_ADDRESS_SPACE_NUM;
4183#endif
4184        case KVM_CAP_NR_MEMSLOTS:
4185                return KVM_USER_MEM_SLOTS;
4186        case KVM_CAP_DIRTY_LOG_RING:
4187#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4188                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4189#else
4190                return 0;
4191#endif
4192        case KVM_CAP_BINARY_STATS_FD:
4193                return 1;
4194        default:
4195                break;
4196        }
4197        return kvm_vm_ioctl_check_extension(kvm, arg);
4198}
4199
4200static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4201{
4202        int r;
4203
4204        if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4205                return -EINVAL;
4206
4207        /* the size should be power of 2 */
4208        if (!size || (size & (size - 1)))
4209                return -EINVAL;
4210
4211        /* Should be bigger to keep the reserved entries, or a page */
4212        if (size < kvm_dirty_ring_get_rsvd_entries() *
4213            sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4214                return -EINVAL;
4215
4216        if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4217            sizeof(struct kvm_dirty_gfn))
4218                return -E2BIG;
4219
4220        /* We only allow it to set once */
4221        if (kvm->dirty_ring_size)
4222                return -EINVAL;
4223
4224        mutex_lock(&kvm->lock);
4225
4226        if (kvm->created_vcpus) {
4227                /* We don't allow to change this value after vcpu created */
4228                r = -EINVAL;
4229        } else {
4230                kvm->dirty_ring_size = size;
4231                r = 0;
4232        }
4233
4234        mutex_unlock(&kvm->lock);
4235        return r;
4236}
4237
4238static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4239{
4240        int i;
4241        struct kvm_vcpu *vcpu;
4242        int cleared = 0;
4243
4244        if (!kvm->dirty_ring_size)
4245                return -EINVAL;
4246
4247        mutex_lock(&kvm->slots_lock);
4248
4249        kvm_for_each_vcpu(i, vcpu, kvm)
4250                cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4251
4252        mutex_unlock(&kvm->slots_lock);
4253
4254        if (cleared)
4255                kvm_flush_remote_tlbs(kvm);
4256
4257        return cleared;
4258}
4259
4260int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4261                                                  struct kvm_enable_cap *cap)
4262{
4263        return -EINVAL;
4264}
4265
4266static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4267                                           struct kvm_enable_cap *cap)
4268{
4269        switch (cap->cap) {
4270#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4271        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4272                u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4273
4274                if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4275                        allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4276
4277                if (cap->flags || (cap->args[0] & ~allowed_options))
4278                        return -EINVAL;
4279                kvm->manual_dirty_log_protect = cap->args[0];
4280                return 0;
4281        }
4282#endif
4283        case KVM_CAP_HALT_POLL: {
4284                if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4285                        return -EINVAL;
4286
4287                kvm->max_halt_poll_ns = cap->args[0];
4288                return 0;
4289        }
4290        case KVM_CAP_DIRTY_LOG_RING:
4291                return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4292        default:
4293                return kvm_vm_ioctl_enable_cap(kvm, cap);
4294        }
4295}
4296
4297static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4298                              size_t size, loff_t *offset)
4299{
4300        struct kvm *kvm = file->private_data;
4301
4302        return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4303                                &kvm_vm_stats_desc[0], &kvm->stat,
4304                                sizeof(kvm->stat), user_buffer, size, offset);
4305}
4306
4307static const struct file_operations kvm_vm_stats_fops = {
4308        .read = kvm_vm_stats_read,
4309        .llseek = noop_llseek,
4310};
4311
4312static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4313{
4314        int fd;
4315        struct file *file;
4316
4317        fd = get_unused_fd_flags(O_CLOEXEC);
4318        if (fd < 0)
4319                return fd;
4320
4321        file = anon_inode_getfile("kvm-vm-stats",
4322                        &kvm_vm_stats_fops, kvm, O_RDONLY);
4323        if (IS_ERR(file)) {
4324                put_unused_fd(fd);
4325                return PTR_ERR(file);
4326        }
4327        file->f_mode |= FMODE_PREAD;
4328        fd_install(fd, file);
4329
4330        return fd;
4331}
4332
4333static long kvm_vm_ioctl(struct file *filp,
4334                           unsigned int ioctl, unsigned long arg)
4335{
4336        struct kvm *kvm = filp->private_data;
4337        void __user *argp = (void __user *)arg;
4338        int r;
4339
4340        if (kvm->mm != current->mm || kvm->vm_bugged)
4341                return -EIO;
4342        switch (ioctl) {
4343        case KVM_CREATE_VCPU:
4344                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4345                break;
4346        case KVM_ENABLE_CAP: {
4347                struct kvm_enable_cap cap;
4348
4349                r = -EFAULT;
4350                if (copy_from_user(&cap, argp, sizeof(cap)))
4351                        goto out;
4352                r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4353                break;
4354        }
4355        case KVM_SET_USER_MEMORY_REGION: {
4356                struct kvm_userspace_memory_region kvm_userspace_mem;
4357
4358                r = -EFAULT;
4359                if (copy_from_user(&kvm_userspace_mem, argp,
4360                                                sizeof(kvm_userspace_mem)))
4361                        goto out;
4362
4363                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4364                break;
4365        }
4366        case KVM_GET_DIRTY_LOG: {
4367                struct kvm_dirty_log log;
4368
4369                r = -EFAULT;
4370                if (copy_from_user(&log, argp, sizeof(log)))
4371                        goto out;
4372                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4373                break;
4374        }
4375#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4376        case KVM_CLEAR_DIRTY_LOG: {
4377                struct kvm_clear_dirty_log log;
4378
4379                r = -EFAULT;
4380                if (copy_from_user(&log, argp, sizeof(log)))
4381                        goto out;
4382                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4383                break;
4384        }
4385#endif
4386#ifdef CONFIG_KVM_MMIO
4387        case KVM_REGISTER_COALESCED_MMIO: {
4388                struct kvm_coalesced_mmio_zone zone;
4389
4390                r = -EFAULT;
4391                if (copy_from_user(&zone, argp, sizeof(zone)))
4392                        goto out;
4393                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4394                break;
4395        }
4396        case KVM_UNREGISTER_COALESCED_MMIO: {
4397                struct kvm_coalesced_mmio_zone zone;
4398
4399                r = -EFAULT;
4400                if (copy_from_user(&zone, argp, sizeof(zone)))
4401                        goto out;
4402                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4403                break;
4404        }
4405#endif
4406        case KVM_IRQFD: {
4407                struct kvm_irqfd data;
4408
4409                r = -EFAULT;
4410                if (copy_from_user(&data, argp, sizeof(data)))
4411                        goto out;
4412                r = kvm_irqfd(kvm, &data);
4413                break;
4414        }
4415        case KVM_IOEVENTFD: {
4416                struct kvm_ioeventfd data;
4417
4418                r = -EFAULT;
4419                if (copy_from_user(&data, argp, sizeof(data)))
4420                        goto out;
4421                r = kvm_ioeventfd(kvm, &data);
4422                break;
4423        }
4424#ifdef CONFIG_HAVE_KVM_MSI
4425        case KVM_SIGNAL_MSI: {
4426                struct kvm_msi msi;
4427
4428                r = -EFAULT;
4429                if (copy_from_user(&msi, argp, sizeof(msi)))
4430                        goto out;
4431                r = kvm_send_userspace_msi(kvm, &msi);
4432                break;
4433        }
4434#endif
4435#ifdef __KVM_HAVE_IRQ_LINE
4436        case KVM_IRQ_LINE_STATUS:
4437        case KVM_IRQ_LINE: {
4438                struct kvm_irq_level irq_event;
4439
4440                r = -EFAULT;
4441                if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4442                        goto out;
4443
4444                r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4445                                        ioctl == KVM_IRQ_LINE_STATUS);
4446                if (r)
4447                        goto out;
4448
4449                r = -EFAULT;
4450                if (ioctl == KVM_IRQ_LINE_STATUS) {
4451                        if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4452                                goto out;
4453                }
4454
4455                r = 0;
4456                break;
4457        }
4458#endif
4459#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4460        case KVM_SET_GSI_ROUTING: {
4461                struct kvm_irq_routing routing;
4462                struct kvm_irq_routing __user *urouting;
4463                struct kvm_irq_routing_entry *entries = NULL;
4464
4465                r = -EFAULT;
4466                if (copy_from_user(&routing, argp, sizeof(routing)))
4467                        goto out;
4468                r = -EINVAL;
4469                if (!kvm_arch_can_set_irq_routing(kvm))
4470                        goto out;
4471                if (routing.nr > KVM_MAX_IRQ_ROUTES)
4472                        goto out;
4473                if (routing.flags)
4474                        goto out;
4475                if (routing.nr) {
4476                        urouting = argp;
4477                        entries = vmemdup_user(urouting->entries,
4478                                               array_size(sizeof(*entries),
4479                                                          routing.nr));
4480                        if (IS_ERR(entries)) {
4481                                r = PTR_ERR(entries);
4482                                goto out;
4483                        }
4484                }
4485                r = kvm_set_irq_routing(kvm, entries, routing.nr,
4486                                        routing.flags);
4487                kvfree(entries);
4488                break;
4489        }
4490#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4491        case KVM_CREATE_DEVICE: {
4492                struct kvm_create_device cd;
4493
4494                r = -EFAULT;
4495                if (copy_from_user(&cd, argp, sizeof(cd)))
4496                        goto out;
4497
4498                r = kvm_ioctl_create_device(kvm, &cd);
4499                if (r)
4500                        goto out;
4501
4502                r = -EFAULT;
4503                if (copy_to_user(argp, &cd, sizeof(cd)))
4504                        goto out;
4505
4506                r = 0;
4507                break;
4508        }
4509        case KVM_CHECK_EXTENSION:
4510                r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4511                break;
4512        case KVM_RESET_DIRTY_RINGS:
4513                r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4514                break;
4515        case KVM_GET_STATS_FD:
4516                r = kvm_vm_ioctl_get_stats_fd(kvm);
4517                break;
4518        default:
4519                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4520        }
4521out:
4522        return r;
4523}
4524
4525#ifdef CONFIG_KVM_COMPAT
4526struct compat_kvm_dirty_log {
4527        __u32 slot;
4528        __u32 padding1;
4529        union {
4530                compat_uptr_t dirty_bitmap; /* one bit per page */
4531                __u64 padding2;
4532        };
4533};
4534
4535struct compat_kvm_clear_dirty_log {
4536        __u32 slot;
4537        __u32 num_pages;
4538        __u64 first_page;
4539        union {
4540                compat_uptr_t dirty_bitmap; /* one bit per page */
4541                __u64 padding2;
4542        };
4543};
4544
4545static long kvm_vm_compat_ioctl(struct file *filp,
4546                           unsigned int ioctl, unsigned long arg)
4547{
4548        struct kvm *kvm = filp->private_data;
4549        int r;
4550
4551        if (kvm->mm != current->mm || kvm->vm_bugged)
4552                return -EIO;
4553        switch (ioctl) {
4554#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4555        case KVM_CLEAR_DIRTY_LOG: {
4556                struct compat_kvm_clear_dirty_log compat_log;
4557                struct kvm_clear_dirty_log log;
4558
4559                if (copy_from_user(&compat_log, (void __user *)arg,
4560                                   sizeof(compat_log)))
4561                        return -EFAULT;
4562                log.slot         = compat_log.slot;
4563                log.num_pages    = compat_log.num_pages;
4564                log.first_page   = compat_log.first_page;
4565                log.padding2     = compat_log.padding2;
4566                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4567
4568                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4569                break;
4570        }
4571#endif
4572        case KVM_GET_DIRTY_LOG: {
4573                struct compat_kvm_dirty_log compat_log;
4574                struct kvm_dirty_log log;
4575
4576                if (copy_from_user(&compat_log, (void __user *)arg,
4577                                   sizeof(compat_log)))
4578                        return -EFAULT;
4579                log.slot         = compat_log.slot;
4580                log.padding1     = compat_log.padding1;
4581                log.padding2     = compat_log.padding2;
4582                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4583
4584                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4585                break;
4586        }
4587        default:
4588                r = kvm_vm_ioctl(filp, ioctl, arg);
4589        }
4590        return r;
4591}
4592#endif
4593
4594static struct file_operations kvm_vm_fops = {
4595        .release        = kvm_vm_release,
4596        .unlocked_ioctl = kvm_vm_ioctl,
4597        .llseek         = noop_llseek,
4598        KVM_COMPAT(kvm_vm_compat_ioctl),
4599};
4600
4601bool file_is_kvm(struct file *file)
4602{
4603        return file && file->f_op == &kvm_vm_fops;
4604}
4605EXPORT_SYMBOL_GPL(file_is_kvm);
4606
4607static int kvm_dev_ioctl_create_vm(unsigned long type)
4608{
4609        int r;
4610        struct kvm *kvm;
4611        struct file *file;
4612
4613        kvm = kvm_create_vm(type);
4614        if (IS_ERR(kvm))
4615                return PTR_ERR(kvm);
4616#ifdef CONFIG_KVM_MMIO
4617        r = kvm_coalesced_mmio_init(kvm);
4618        if (r < 0)
4619                goto put_kvm;
4620#endif
4621        r = get_unused_fd_flags(O_CLOEXEC);
4622        if (r < 0)
4623                goto put_kvm;
4624
4625        snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4626                        "kvm-%d", task_pid_nr(current));
4627
4628        file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4629        if (IS_ERR(file)) {
4630                put_unused_fd(r);
4631                r = PTR_ERR(file);
4632                goto put_kvm;
4633        }
4634
4635        /*
4636         * Don't call kvm_put_kvm anymore at this point; file->f_op is
4637         * already set, with ->release() being kvm_vm_release().  In error
4638         * cases it will be called by the final fput(file) and will take
4639         * care of doing kvm_put_kvm(kvm).
4640         */
4641        if (kvm_create_vm_debugfs(kvm, r) < 0) {
4642                put_unused_fd(r);
4643                fput(file);
4644                return -ENOMEM;
4645        }
4646        kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4647
4648        fd_install(r, file);
4649        return r;
4650
4651put_kvm:
4652        kvm_put_kvm(kvm);
4653        return r;
4654}
4655
4656static long kvm_dev_ioctl(struct file *filp,
4657                          unsigned int ioctl, unsigned long arg)
4658{
4659        long r = -EINVAL;
4660
4661        switch (ioctl) {
4662        case KVM_GET_API_VERSION:
4663                if (arg)
4664                        goto out;
4665                r = KVM_API_VERSION;
4666                break;
4667        case KVM_CREATE_VM:
4668                r = kvm_dev_ioctl_create_vm(arg);
4669                break;
4670        case KVM_CHECK_EXTENSION:
4671                r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4672                break;
4673        case KVM_GET_VCPU_MMAP_SIZE:
4674                if (arg)
4675                        goto out;
4676                r = PAGE_SIZE;     /* struct kvm_run */
4677#ifdef CONFIG_X86
4678                r += PAGE_SIZE;    /* pio data page */
4679#endif
4680#ifdef CONFIG_KVM_MMIO
4681                r += PAGE_SIZE;    /* coalesced mmio ring page */
4682#endif
4683                break;
4684        case KVM_TRACE_ENABLE:
4685        case KVM_TRACE_PAUSE:
4686        case KVM_TRACE_DISABLE:
4687                r = -EOPNOTSUPP;
4688                break;
4689        default:
4690                return kvm_arch_dev_ioctl(filp, ioctl, arg);
4691        }
4692out:
4693        return r;
4694}
4695
4696static struct file_operations kvm_chardev_ops = {
4697        .unlocked_ioctl = kvm_dev_ioctl,
4698        .llseek         = noop_llseek,
4699        KVM_COMPAT(kvm_dev_ioctl),
4700};
4701
4702static struct miscdevice kvm_dev = {
4703        KVM_MINOR,
4704        "kvm",
4705        &kvm_chardev_ops,
4706};
4707
4708static void hardware_enable_nolock(void *junk)
4709{
4710        int cpu = raw_smp_processor_id();
4711        int r;
4712
4713        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4714                return;
4715
4716        cpumask_set_cpu(cpu, cpus_hardware_enabled);
4717
4718        r = kvm_arch_hardware_enable();
4719
4720        if (r) {
4721                cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4722                atomic_inc(&hardware_enable_failed);
4723                pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4724        }
4725}
4726
4727static int kvm_starting_cpu(unsigned int cpu)
4728{
4729        raw_spin_lock(&kvm_count_lock);
4730        if (kvm_usage_count)
4731                hardware_enable_nolock(NULL);
4732        raw_spin_unlock(&kvm_count_lock);
4733        return 0;
4734}
4735
4736static void hardware_disable_nolock(void *junk)
4737{
4738        int cpu = raw_smp_processor_id();
4739
4740        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4741                return;
4742        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4743        kvm_arch_hardware_disable();
4744}
4745
4746static int kvm_dying_cpu(unsigned int cpu)
4747{
4748        raw_spin_lock(&kvm_count_lock);
4749        if (kvm_usage_count)
4750                hardware_disable_nolock(NULL);
4751        raw_spin_unlock(&kvm_count_lock);
4752        return 0;
4753}
4754
4755static void hardware_disable_all_nolock(void)
4756{
4757        BUG_ON(!kvm_usage_count);
4758
4759        kvm_usage_count--;
4760        if (!kvm_usage_count)
4761                on_each_cpu(hardware_disable_nolock, NULL, 1);
4762}
4763
4764static void hardware_disable_all(void)
4765{
4766        raw_spin_lock(&kvm_count_lock);
4767        hardware_disable_all_nolock();
4768        raw_spin_unlock(&kvm_count_lock);
4769}
4770
4771static int hardware_enable_all(void)
4772{
4773        int r = 0;
4774
4775        raw_spin_lock(&kvm_count_lock);
4776
4777        kvm_usage_count++;
4778        if (kvm_usage_count == 1) {
4779                atomic_set(&hardware_enable_failed, 0);
4780                on_each_cpu(hardware_enable_nolock, NULL, 1);
4781
4782                if (atomic_read(&hardware_enable_failed)) {
4783                        hardware_disable_all_nolock();
4784                        r = -EBUSY;
4785                }
4786        }
4787
4788        raw_spin_unlock(&kvm_count_lock);
4789
4790        return r;
4791}
4792
4793static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4794                      void *v)
4795{
4796        /*
4797         * Some (well, at least mine) BIOSes hang on reboot if
4798         * in vmx root mode.
4799         *
4800         * And Intel TXT required VMX off for all cpu when system shutdown.
4801         */
4802        pr_info("kvm: exiting hardware virtualization\n");
4803        kvm_rebooting = true;
4804        on_each_cpu(hardware_disable_nolock, NULL, 1);
4805        return NOTIFY_OK;
4806}
4807
4808static struct notifier_block kvm_reboot_notifier = {
4809        .notifier_call = kvm_reboot,
4810        .priority = 0,
4811};
4812
4813static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4814{
4815        int i;
4816
4817        for (i = 0; i < bus->dev_count; i++) {
4818                struct kvm_io_device *pos = bus->range[i].dev;
4819
4820                kvm_iodevice_destructor(pos);
4821        }
4822        kfree(bus);
4823}
4824
4825static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4826                                 const struct kvm_io_range *r2)
4827{
4828        gpa_t addr1 = r1->addr;
4829        gpa_t addr2 = r2->addr;
4830
4831        if (addr1 < addr2)
4832                return -1;
4833
4834        /* If r2->len == 0, match the exact address.  If r2->len != 0,
4835         * accept any overlapping write.  Any order is acceptable for
4836         * overlapping ranges, because kvm_io_bus_get_first_dev ensures
4837         * we process all of them.
4838         */
4839        if (r2->len) {
4840                addr1 += r1->len;
4841                addr2 += r2->len;
4842        }
4843
4844        if (addr1 > addr2)
4845                return 1;
4846
4847        return 0;
4848}
4849
4850static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
4851{
4852        return kvm_io_bus_cmp(p1, p2);
4853}
4854
4855static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4856                             gpa_t addr, int len)
4857{
4858        struct kvm_io_range *range, key;
4859        int off;
4860
4861        key = (struct kvm_io_range) {
4862                .addr = addr,
4863                .len = len,
4864        };
4865
4866        range = bsearch(&key, bus->range, bus->dev_count,
4867                        sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4868        if (range == NULL)
4869                return -ENOENT;
4870
4871        off = range - bus->range;
4872
4873        while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4874                off--;
4875
4876        return off;
4877}
4878
4879static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4880                              struct kvm_io_range *range, const void *val)
4881{
4882        int idx;
4883
4884        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4885        if (idx < 0)
4886                return -EOPNOTSUPP;
4887
4888        while (idx < bus->dev_count &&
4889                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4890                if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4891                                        range->len, val))
4892                        return idx;
4893                idx++;
4894        }
4895
4896        return -EOPNOTSUPP;
4897}
4898
4899/* kvm_io_bus_write - called under kvm->slots_lock */
4900int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4901                     int len, const void *val)
4902{
4903        struct kvm_io_bus *bus;
4904        struct kvm_io_range range;
4905        int r;
4906
4907        range = (struct kvm_io_range) {
4908                .addr = addr,
4909                .len = len,
4910        };
4911
4912        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4913        if (!bus)
4914                return -ENOMEM;
4915        r = __kvm_io_bus_write(vcpu, bus, &range, val);
4916        return r < 0 ? r : 0;
4917}
4918EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4919
4920/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
4921int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4922                            gpa_t addr, int len, const void *val, long cookie)
4923{
4924        struct kvm_io_bus *bus;
4925        struct kvm_io_range range;
4926
4927        range = (struct kvm_io_range) {
4928                .addr = addr,
4929                .len = len,
4930        };
4931
4932        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4933        if (!bus)
4934                return -ENOMEM;
4935
4936        /* First try the device referenced by cookie. */
4937        if ((cookie >= 0) && (cookie < bus->dev_count) &&
4938            (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4939                if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4940                                        val))
4941                        return cookie;
4942
4943        /*
4944         * cookie contained garbage; fall back to search and return the
4945         * correct cookie value.
4946         */
4947        return __kvm_io_bus_write(vcpu, bus, &range, val);
4948}
4949
4950static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4951                             struct kvm_io_range *range, void *val)
4952{
4953        int idx;
4954
4955        idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4956        if (idx < 0)
4957                return -EOPNOTSUPP;
4958
4959        while (idx < bus->dev_count &&
4960                kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4961                if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4962                                       range->len, val))
4963                        return idx;
4964                idx++;
4965        }
4966
4967        return -EOPNOTSUPP;
4968}
4969
4970/* kvm_io_bus_read - called under kvm->slots_lock */
4971int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4972                    int len, void *val)
4973{
4974        struct kvm_io_bus *bus;
4975        struct kvm_io_range range;
4976        int r;
4977
4978        range = (struct kvm_io_range) {
4979                .addr = addr,
4980                .len = len,
4981        };
4982
4983        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4984        if (!bus)
4985                return -ENOMEM;
4986        r = __kvm_io_bus_read(vcpu, bus, &range, val);
4987        return r < 0 ? r : 0;
4988}
4989
4990/* Caller must hold slots_lock. */
4991int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
4992                            int len, struct kvm_io_device *dev)
4993{
4994        int i;
4995        struct kvm_io_bus *new_bus, *bus;
4996        struct kvm_io_range range;
4997
4998        bus = kvm_get_bus(kvm, bus_idx);
4999        if (!bus)
5000                return -ENOMEM;
5001
5002        /* exclude ioeventfd which is limited by maximum fd */
5003        if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5004                return -ENOSPC;
5005
5006        new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5007                          GFP_KERNEL_ACCOUNT);
5008        if (!new_bus)
5009                return -ENOMEM;
5010
5011        range = (struct kvm_io_range) {
5012                .addr = addr,
5013                .len = len,
5014                .dev = dev,
5015        };
5016
5017        for (i = 0; i < bus->dev_count; i++)
5018                if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5019                        break;
5020
5021        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5022        new_bus->dev_count++;
5023        new_bus->range[i] = range;
5024        memcpy(new_bus->range + i + 1, bus->range + i,
5025                (bus->dev_count - i) * sizeof(struct kvm_io_range));
5026        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5027        synchronize_srcu_expedited(&kvm->srcu);
5028        kfree(bus);
5029
5030        return 0;
5031}
5032
5033int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5034                              struct kvm_io_device *dev)
5035{
5036        int i, j;
5037        struct kvm_io_bus *new_bus, *bus;
5038
5039        lockdep_assert_held(&kvm->slots_lock);
5040
5041        bus = kvm_get_bus(kvm, bus_idx);
5042        if (!bus)
5043                return 0;
5044
5045        for (i = 0; i < bus->dev_count; i++) {
5046                if (bus->range[i].dev == dev) {
5047                        break;
5048                }
5049        }
5050
5051        if (i == bus->dev_count)
5052                return 0;
5053
5054        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5055                          GFP_KERNEL_ACCOUNT);
5056        if (new_bus) {
5057                memcpy(new_bus, bus, struct_size(bus, range, i));
5058                new_bus->dev_count--;
5059                memcpy(new_bus->range + i, bus->range + i + 1,
5060                                flex_array_size(new_bus, range, new_bus->dev_count - i));
5061        }
5062
5063        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5064        synchronize_srcu_expedited(&kvm->srcu);
5065
5066        /* Destroy the old bus _after_ installing the (null) bus. */
5067        if (!new_bus) {
5068                pr_err("kvm: failed to shrink bus, removing it completely\n");
5069                for (j = 0; j < bus->dev_count; j++) {
5070                        if (j == i)
5071                                continue;
5072                        kvm_iodevice_destructor(bus->range[j].dev);
5073                }
5074        }
5075
5076        kfree(bus);
5077        return new_bus ? 0 : -ENOMEM;
5078}
5079
5080struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5081                                         gpa_t addr)
5082{
5083        struct kvm_io_bus *bus;
5084        int dev_idx, srcu_idx;
5085        struct kvm_io_device *iodev = NULL;
5086
5087        srcu_idx = srcu_read_lock(&kvm->srcu);
5088
5089        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5090        if (!bus)
5091                goto out_unlock;
5092
5093        dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5094        if (dev_idx < 0)
5095                goto out_unlock;
5096
5097        iodev = bus->range[dev_idx].dev;
5098
5099out_unlock:
5100        srcu_read_unlock(&kvm->srcu, srcu_idx);
5101
5102        return iodev;
5103}
5104EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5105
5106static int kvm_debugfs_open(struct inode *inode, struct file *file,
5107                           int (*get)(void *, u64 *), int (*set)(void *, u64),
5108                           const char *fmt)
5109{
5110        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5111                                          inode->i_private;
5112
5113        /*
5114         * The debugfs files are a reference to the kvm struct which
5115        * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5116        * avoids the race between open and the removal of the debugfs directory.
5117         */
5118        if (!kvm_get_kvm_safe(stat_data->kvm))
5119                return -ENOENT;
5120
5121        if (simple_attr_open(inode, file, get,
5122                    kvm_stats_debugfs_mode(stat_data->desc) & 0222
5123                    ? set : NULL,
5124                    fmt)) {
5125                kvm_put_kvm(stat_data->kvm);
5126                return -ENOMEM;
5127        }
5128
5129        return 0;
5130}
5131
5132static int kvm_debugfs_release(struct inode *inode, struct file *file)
5133{
5134        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5135                                          inode->i_private;
5136
5137        simple_attr_release(inode, file);
5138        kvm_put_kvm(stat_data->kvm);
5139
5140        return 0;
5141}
5142
5143static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5144{
5145        *val = *(u64 *)((void *)(&kvm->stat) + offset);
5146
5147        return 0;
5148}
5149
5150static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5151{
5152        *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5153
5154        return 0;
5155}
5156
5157static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5158{
5159        int i;
5160        struct kvm_vcpu *vcpu;
5161
5162        *val = 0;
5163
5164        kvm_for_each_vcpu(i, vcpu, kvm)
5165                *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5166
5167        return 0;
5168}
5169
5170static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5171{
5172        int i;
5173        struct kvm_vcpu *vcpu;
5174
5175        kvm_for_each_vcpu(i, vcpu, kvm)
5176                *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5177
5178        return 0;
5179}
5180
5181static int kvm_stat_data_get(void *data, u64 *val)
5182{
5183        int r = -EFAULT;
5184        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5185
5186        switch (stat_data->kind) {
5187        case KVM_STAT_VM:
5188                r = kvm_get_stat_per_vm(stat_data->kvm,
5189                                        stat_data->desc->desc.offset, val);
5190                break;
5191        case KVM_STAT_VCPU:
5192                r = kvm_get_stat_per_vcpu(stat_data->kvm,
5193                                          stat_data->desc->desc.offset, val);
5194                break;
5195        }
5196
5197        return r;
5198}
5199
5200static int kvm_stat_data_clear(void *data, u64 val)
5201{
5202        int r = -EFAULT;
5203        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5204
5205        if (val)
5206                return -EINVAL;
5207
5208        switch (stat_data->kind) {
5209        case KVM_STAT_VM:
5210                r = kvm_clear_stat_per_vm(stat_data->kvm,
5211                                          stat_data->desc->desc.offset);
5212                break;
5213        case KVM_STAT_VCPU:
5214                r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5215                                            stat_data->desc->desc.offset);
5216                break;
5217        }
5218
5219        return r;
5220}
5221
5222static int kvm_stat_data_open(struct inode *inode, struct file *file)
5223{
5224        __simple_attr_check_format("%llu\n", 0ull);
5225        return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5226                                kvm_stat_data_clear, "%llu\n");
5227}
5228
5229static const struct file_operations stat_fops_per_vm = {
5230        .owner = THIS_MODULE,
5231        .open = kvm_stat_data_open,
5232        .release = kvm_debugfs_release,
5233        .read = simple_attr_read,
5234        .write = simple_attr_write,
5235        .llseek = no_llseek,
5236};
5237
5238static int vm_stat_get(void *_offset, u64 *val)
5239{
5240        unsigned offset = (long)_offset;
5241        struct kvm *kvm;
5242        u64 tmp_val;
5243
5244        *val = 0;
5245        mutex_lock(&kvm_lock);
5246        list_for_each_entry(kvm, &vm_list, vm_list) {
5247                kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5248                *val += tmp_val;
5249        }
5250        mutex_unlock(&kvm_lock);
5251        return 0;
5252}
5253
5254static int vm_stat_clear(void *_offset, u64 val)
5255{
5256        unsigned offset = (long)_offset;
5257        struct kvm *kvm;
5258
5259        if (val)
5260                return -EINVAL;
5261
5262        mutex_lock(&kvm_lock);
5263        list_for_each_entry(kvm, &vm_list, vm_list) {
5264                kvm_clear_stat_per_vm(kvm, offset);
5265        }
5266        mutex_unlock(&kvm_lock);
5267
5268        return 0;
5269}
5270
5271DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5272DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5273
5274static int vcpu_stat_get(void *_offset, u64 *val)
5275{
5276        unsigned offset = (long)_offset;
5277        struct kvm *kvm;
5278        u64 tmp_val;
5279
5280        *val = 0;
5281        mutex_lock(&kvm_lock);
5282        list_for_each_entry(kvm, &vm_list, vm_list) {
5283                kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5284                *val += tmp_val;
5285        }
5286        mutex_unlock(&kvm_lock);
5287        return 0;
5288}
5289
5290static int vcpu_stat_clear(void *_offset, u64 val)
5291{
5292        unsigned offset = (long)_offset;
5293        struct kvm *kvm;
5294
5295        if (val)
5296                return -EINVAL;
5297
5298        mutex_lock(&kvm_lock);
5299        list_for_each_entry(kvm, &vm_list, vm_list) {
5300                kvm_clear_stat_per_vcpu(kvm, offset);
5301        }
5302        mutex_unlock(&kvm_lock);
5303
5304        return 0;
5305}
5306
5307DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5308                        "%llu\n");
5309DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5310
5311static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5312{
5313        struct kobj_uevent_env *env;
5314        unsigned long long created, active;
5315
5316        if (!kvm_dev.this_device || !kvm)
5317                return;
5318
5319        mutex_lock(&kvm_lock);
5320        if (type == KVM_EVENT_CREATE_VM) {
5321                kvm_createvm_count++;
5322                kvm_active_vms++;
5323        } else if (type == KVM_EVENT_DESTROY_VM) {
5324                kvm_active_vms--;
5325        }
5326        created = kvm_createvm_count;
5327        active = kvm_active_vms;
5328        mutex_unlock(&kvm_lock);
5329
5330        env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5331        if (!env)
5332                return;
5333
5334        add_uevent_var(env, "CREATED=%llu", created);
5335        add_uevent_var(env, "COUNT=%llu", active);
5336
5337        if (type == KVM_EVENT_CREATE_VM) {
5338                add_uevent_var(env, "EVENT=create");
5339                kvm->userspace_pid = task_pid_nr(current);
5340        } else if (type == KVM_EVENT_DESTROY_VM) {
5341                add_uevent_var(env, "EVENT=destroy");
5342        }
5343        add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5344
5345        if (kvm->debugfs_dentry) {
5346                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5347
5348                if (p) {
5349                        tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5350                        if (!IS_ERR(tmp))
5351                                add_uevent_var(env, "STATS_PATH=%s", tmp);
5352                        kfree(p);
5353                }
5354        }
5355        /* no need for checks, since we are adding at most only 5 keys */
5356        env->envp[env->envp_idx++] = NULL;
5357        kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5358        kfree(env);
5359}
5360
5361static void kvm_init_debug(void)
5362{
5363        const struct file_operations *fops;
5364        const struct _kvm_stats_desc *pdesc;
5365        int i;
5366
5367        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5368
5369        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5370                pdesc = &kvm_vm_stats_desc[i];
5371                if (kvm_stats_debugfs_mode(pdesc) & 0222)
5372                        fops = &vm_stat_fops;
5373                else
5374                        fops = &vm_stat_readonly_fops;
5375                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5376                                kvm_debugfs_dir,
5377                                (void *)(long)pdesc->desc.offset, fops);
5378        }
5379
5380        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5381                pdesc = &kvm_vcpu_stats_desc[i];
5382                if (kvm_stats_debugfs_mode(pdesc) & 0222)
5383                        fops = &vcpu_stat_fops;
5384                else
5385                        fops = &vcpu_stat_readonly_fops;
5386                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5387                                kvm_debugfs_dir,
5388                                (void *)(long)pdesc->desc.offset, fops);
5389        }
5390}
5391
5392static int kvm_suspend(void)
5393{
5394        if (kvm_usage_count)
5395                hardware_disable_nolock(NULL);
5396        return 0;
5397}
5398
5399static void kvm_resume(void)
5400{
5401        if (kvm_usage_count) {
5402#ifdef CONFIG_LOCKDEP
5403                WARN_ON(lockdep_is_held(&kvm_count_lock));
5404#endif
5405                hardware_enable_nolock(NULL);
5406        }
5407}
5408
5409static struct syscore_ops kvm_syscore_ops = {
5410        .suspend = kvm_suspend,
5411        .resume = kvm_resume,
5412};
5413
5414static inline
5415struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5416{
5417        return container_of(pn, struct kvm_vcpu, preempt_notifier);
5418}
5419
5420static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5421{
5422        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5423
5424        WRITE_ONCE(vcpu->preempted, false);
5425        WRITE_ONCE(vcpu->ready, false);
5426
5427        __this_cpu_write(kvm_running_vcpu, vcpu);
5428        kvm_arch_sched_in(vcpu, cpu);
5429        kvm_arch_vcpu_load(vcpu, cpu);
5430}
5431
5432static void kvm_sched_out(struct preempt_notifier *pn,
5433                          struct task_struct *next)
5434{
5435        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5436
5437        if (current->on_rq) {
5438                WRITE_ONCE(vcpu->preempted, true);
5439                WRITE_ONCE(vcpu->ready, true);
5440        }
5441        kvm_arch_vcpu_put(vcpu);
5442        __this_cpu_write(kvm_running_vcpu, NULL);
5443}
5444
5445/**
5446 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5447 *
5448 * We can disable preemption locally around accessing the per-CPU variable,
5449 * and use the resolved vcpu pointer after enabling preemption again,
5450 * because even if the current thread is migrated to another CPU, reading
5451 * the per-CPU value later will give us the same value as we update the
5452 * per-CPU variable in the preempt notifier handlers.
5453 */
5454struct kvm_vcpu *kvm_get_running_vcpu(void)
5455{
5456        struct kvm_vcpu *vcpu;
5457
5458        preempt_disable();
5459        vcpu = __this_cpu_read(kvm_running_vcpu);
5460        preempt_enable();
5461
5462        return vcpu;
5463}
5464EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5465
5466/**
5467 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5468 */
5469struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5470{
5471        return &kvm_running_vcpu;
5472}
5473
5474struct kvm_cpu_compat_check {
5475        void *opaque;
5476        int *ret;
5477};
5478
5479static void check_processor_compat(void *data)
5480{
5481        struct kvm_cpu_compat_check *c = data;
5482
5483        *c->ret = kvm_arch_check_processor_compat(c->opaque);
5484}
5485
5486int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5487                  struct module *module)
5488{
5489        struct kvm_cpu_compat_check c;
5490        int r;
5491        int cpu;
5492
5493        r = kvm_arch_init(opaque);
5494        if (r)
5495                goto out_fail;
5496
5497        /*
5498         * kvm_arch_init makes sure there's at most one caller
5499         * for architectures that support multiple implementations,
5500         * like intel and amd on x86.
5501         * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5502         * conflicts in case kvm is already setup for another implementation.
5503         */
5504        r = kvm_irqfd_init();
5505        if (r)
5506                goto out_irqfd;
5507
5508        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5509                r = -ENOMEM;
5510                goto out_free_0;
5511        }
5512
5513        r = kvm_arch_hardware_setup(opaque);
5514        if (r < 0)
5515                goto out_free_1;
5516
5517        c.ret = &r;
5518        c.opaque = opaque;
5519        for_each_online_cpu(cpu) {
5520                smp_call_function_single(cpu, check_processor_compat, &c, 1);
5521                if (r < 0)
5522                        goto out_free_2;
5523        }
5524
5525        r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5526                                      kvm_starting_cpu, kvm_dying_cpu);
5527        if (r)
5528                goto out_free_2;
5529        register_reboot_notifier(&kvm_reboot_notifier);
5530
5531        /* A kmem cache lets us meet the alignment requirements of fx_save. */
5532        if (!vcpu_align)
5533                vcpu_align = __alignof__(struct kvm_vcpu);
5534        kvm_vcpu_cache =
5535                kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5536                                           SLAB_ACCOUNT,
5537                                           offsetof(struct kvm_vcpu, arch),
5538                                           offsetofend(struct kvm_vcpu, stats_id)
5539                                           - offsetof(struct kvm_vcpu, arch),
5540                                           NULL);
5541        if (!kvm_vcpu_cache) {
5542                r = -ENOMEM;
5543                goto out_free_3;
5544        }
5545
5546        r = kvm_async_pf_init();
5547        if (r)
5548                goto out_free;
5549
5550        kvm_chardev_ops.owner = module;
5551        kvm_vm_fops.owner = module;
5552        kvm_vcpu_fops.owner = module;
5553
5554        r = misc_register(&kvm_dev);
5555        if (r) {
5556                pr_err("kvm: misc device register failed\n");
5557                goto out_unreg;
5558        }
5559
5560        register_syscore_ops(&kvm_syscore_ops);
5561
5562        kvm_preempt_ops.sched_in = kvm_sched_in;
5563        kvm_preempt_ops.sched_out = kvm_sched_out;
5564
5565        kvm_init_debug();
5566
5567        r = kvm_vfio_ops_init();
5568        WARN_ON(r);
5569
5570        return 0;
5571
5572out_unreg:
5573        kvm_async_pf_deinit();
5574out_free:
5575        kmem_cache_destroy(kvm_vcpu_cache);
5576out_free_3:
5577        unregister_reboot_notifier(&kvm_reboot_notifier);
5578        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5579out_free_2:
5580        kvm_arch_hardware_unsetup();
5581out_free_1:
5582        free_cpumask_var(cpus_hardware_enabled);
5583out_free_0:
5584        kvm_irqfd_exit();
5585out_irqfd:
5586        kvm_arch_exit();
5587out_fail:
5588        return r;
5589}
5590EXPORT_SYMBOL_GPL(kvm_init);
5591
5592void kvm_exit(void)
5593{
5594        debugfs_remove_recursive(kvm_debugfs_dir);
5595        misc_deregister(&kvm_dev);
5596        kmem_cache_destroy(kvm_vcpu_cache);
5597        kvm_async_pf_deinit();
5598        unregister_syscore_ops(&kvm_syscore_ops);
5599        unregister_reboot_notifier(&kvm_reboot_notifier);
5600        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5601        on_each_cpu(hardware_disable_nolock, NULL, 1);
5602        kvm_arch_hardware_unsetup();
5603        kvm_arch_exit();
5604        kvm_irqfd_exit();
5605        free_cpumask_var(cpus_hardware_enabled);
5606        kvm_vfio_ops_exit();
5607}
5608EXPORT_SYMBOL_GPL(kvm_exit);
5609
5610struct kvm_vm_worker_thread_context {
5611        struct kvm *kvm;
5612        struct task_struct *parent;
5613        struct completion init_done;
5614        kvm_vm_thread_fn_t thread_fn;
5615        uintptr_t data;
5616        int err;
5617};
5618
5619static int kvm_vm_worker_thread(void *context)
5620{
5621        /*
5622         * The init_context is allocated on the stack of the parent thread, so
5623         * we have to locally copy anything that is needed beyond initialization
5624         */
5625        struct kvm_vm_worker_thread_context *init_context = context;
5626        struct kvm *kvm = init_context->kvm;
5627        kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5628        uintptr_t data = init_context->data;
5629        int err;
5630
5631        err = kthread_park(current);
5632        /* kthread_park(current) is never supposed to return an error */
5633        WARN_ON(err != 0);
5634        if (err)
5635                goto init_complete;
5636
5637        err = cgroup_attach_task_all(init_context->parent, current);
5638        if (err) {
5639                kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5640                        __func__, err);
5641                goto init_complete;
5642        }
5643
5644        set_user_nice(current, task_nice(init_context->parent));
5645
5646init_complete:
5647        init_context->err = err;
5648        complete(&init_context->init_done);
5649        init_context = NULL;
5650
5651        if (err)
5652                return err;
5653
5654        /* Wait to be woken up by the spawner before proceeding. */
5655        kthread_parkme();
5656
5657        if (!kthread_should_stop())
5658                err = thread_fn(kvm, data);
5659
5660        return err;
5661}
5662
5663int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5664                                uintptr_t data, const char *name,
5665                                struct task_struct **thread_ptr)
5666{
5667        struct kvm_vm_worker_thread_context init_context = {};
5668        struct task_struct *thread;
5669
5670        *thread_ptr = NULL;
5671        init_context.kvm = kvm;
5672        init_context.parent = current;
5673        init_context.thread_fn = thread_fn;
5674        init_context.data = data;
5675        init_completion(&init_context.init_done);
5676
5677        thread = kthread_run(kvm_vm_worker_thread, &init_context,
5678                             "%s-%d", name, task_pid_nr(current));
5679        if (IS_ERR(thread))
5680                return PTR_ERR(thread);
5681
5682        /* kthread_run is never supposed to return NULL */
5683        WARN_ON(thread == NULL);
5684
5685        wait_for_completion(&init_context.init_done);
5686
5687        if (!init_context.err)
5688                *thread_ptr = thread;
5689
5690        return init_context.err;
5691}
5692