linux/drivers/kvm/kvm_main.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * This module enables machines with Intel VT-x extensions to run virtual
   5 * machines without emulation or binary translation.
   6 *
   7 * Copyright (C) 2006 Qumranet, Inc.
   8 *
   9 * Authors:
  10 *   Avi Kivity   <avi@qumranet.com>
  11 *   Yaniv Kamay  <yaniv@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17
  18#include "kvm.h"
  19#include "x86_emulate.h"
  20#include "segment_descriptor.h"
  21#include "irq.h"
  22
  23#include <linux/kvm.h>
  24#include <linux/module.h>
  25#include <linux/errno.h>
  26#include <linux/percpu.h>
  27#include <linux/gfp.h>
  28#include <linux/mm.h>
  29#include <linux/miscdevice.h>
  30#include <linux/vmalloc.h>
  31#include <linux/reboot.h>
  32#include <linux/debugfs.h>
  33#include <linux/highmem.h>
  34#include <linux/file.h>
  35#include <linux/sysdev.h>
  36#include <linux/cpu.h>
  37#include <linux/sched.h>
  38#include <linux/cpumask.h>
  39#include <linux/smp.h>
  40#include <linux/anon_inodes.h>
  41#include <linux/profile.h>
  42
  43#include <asm/processor.h>
  44#include <asm/msr.h>
  45#include <asm/io.h>
  46#include <asm/uaccess.h>
  47#include <asm/desc.h>
  48
  49MODULE_AUTHOR("Qumranet");
  50MODULE_LICENSE("GPL");
  51
  52static DEFINE_SPINLOCK(kvm_lock);
  53static LIST_HEAD(vm_list);
  54
  55static cpumask_t cpus_hardware_enabled;
  56
  57struct kvm_x86_ops *kvm_x86_ops;
  58struct kmem_cache *kvm_vcpu_cache;
  59EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
  60
  61static __read_mostly struct preempt_ops kvm_preempt_ops;
  62
  63#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
  64
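     /*
      * Per-vcpu counters exported through debugfs; 'offset' locates each
      * counter inside struct kvm_vcpu (see STAT_OFFSET above).
      */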
  65static struct kvm_stats_debugfs_item {
  66        const char *name;
  67        int offset;
  68        struct dentry *dentry;
  69} debugfs_entries[] = {
  70        { "pf_fixed", STAT_OFFSET(pf_fixed) },
  71        { "pf_guest", STAT_OFFSET(pf_guest) },
  72        { "tlb_flush", STAT_OFFSET(tlb_flush) },
  73        { "invlpg", STAT_OFFSET(invlpg) },
  74        { "exits", STAT_OFFSET(exits) },
  75        { "io_exits", STAT_OFFSET(io_exits) },
  76        { "mmio_exits", STAT_OFFSET(mmio_exits) },
  77        { "signal_exits", STAT_OFFSET(signal_exits) },
  78        { "irq_window", STAT_OFFSET(irq_window_exits) },
  79        { "halt_exits", STAT_OFFSET(halt_exits) },
  80        { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
  81        { "request_irq", STAT_OFFSET(request_irq_exits) },
  82        { "irq_exits", STAT_OFFSET(irq_exits) },
  83        { "light_exits", STAT_OFFSET(light_exits) },
  84        { "efer_reload", STAT_OFFSET(efer_reload) },
  85        { NULL }
  86};
  87
  88static struct dentry *debugfs_dir;
  89
  90#define MAX_IO_MSRS 256
  91
  92#define CR0_RESERVED_BITS                                               \
  93        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
  94                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
  95                          | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
  96#define CR4_RESERVED_BITS                                               \
  97        (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
  98                          | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
  99                          | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
 100                          | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 101
 102#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 103#define EFER_RESERVED_BITS 0xfffffffffffff2fe
 104
 105#ifdef CONFIG_X86_64
  106/* LDT or TSS descriptor in the GDT.  16 bytes. */
 107struct segment_descriptor_64 {
 108        struct segment_descriptor s;
 109        u32 base_higher;
 110        u32 pad_zero;
 111};
 112
 113#endif
 114
 115static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 116                           unsigned long arg);
 117
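     /*
      * Return the linear base address of the segment described by @selector,
      * read from the host GDT (or from the LDT when selector bit 2 is set).
      */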
 118unsigned long segment_base(u16 selector)
 119{
 120        struct descriptor_table gdt;
 121        struct segment_descriptor *d;
 122        unsigned long table_base;
 123        typedef unsigned long ul;
 124        unsigned long v;
 125
 126        if (selector == 0)
 127                return 0;
 128
 129        asm ("sgdt %0" : "=m"(gdt));
 130        table_base = gdt.base;
 131
 132        if (selector & 4) {           /* from ldt */
 133                u16 ldt_selector;
 134
 135                asm ("sldt %0" : "=g"(ldt_selector));
 136                table_base = segment_base(ldt_selector);
 137        }
 138        d = (struct segment_descriptor *)(table_base + (selector & ~7));
 139        v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
 140#ifdef CONFIG_X86_64
 141        if (d->system == 0
 142            && (d->type == 2 || d->type == 9 || d->type == 11))
 143                v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
 144#endif
 145        return v;
 146}
 147EXPORT_SYMBOL_GPL(segment_base);
 148
 149static inline int valid_vcpu(int n)
 150{
 151        return likely(n >= 0 && n < KVM_MAX_VCPUS);
 152}
 153
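     /*
      * Lazy FPU switching: the guest FXSAVE image is loaded onto the CPU
      * only while the vcpu is actually using the FPU, and the host image
      * is restored again in kvm_put_guest_fpu().
      */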
 154void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 155{
 156        if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
 157                return;
 158
 159        vcpu->guest_fpu_loaded = 1;
 160        fx_save(&vcpu->host_fx_image);
 161        fx_restore(&vcpu->guest_fx_image);
 162}
 163EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
 164
 165void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 166{
 167        if (!vcpu->guest_fpu_loaded)
 168                return;
 169
 170        vcpu->guest_fpu_loaded = 0;
 171        fx_save(&vcpu->guest_fx_image);
 172        fx_restore(&vcpu->host_fx_image);
 173}
 174EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
 175
 176/*
  177 * Switches to the specified vcpu, until a matching vcpu_put().
 178 */
 179static void vcpu_load(struct kvm_vcpu *vcpu)
 180{
 181        int cpu;
 182
 183        mutex_lock(&vcpu->mutex);
 184        cpu = get_cpu();
 185        preempt_notifier_register(&vcpu->preempt_notifier);
 186        kvm_x86_ops->vcpu_load(vcpu, cpu);
 187        put_cpu();
 188}
 189
 190static void vcpu_put(struct kvm_vcpu *vcpu)
 191{
 192        preempt_disable();
 193        kvm_x86_ops->vcpu_put(vcpu);
 194        preempt_notifier_unregister(&vcpu->preempt_notifier);
 195        preempt_enable();
 196        mutex_unlock(&vcpu->mutex);
 197}
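
     /*
      * Typical usage (sketch):
      *
      *         vcpu_load(vcpu);
      *         ... access vcpu state; vcpu->mutex is held ...
      *         vcpu_put(vcpu);
      */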
 198
 199static void ack_flush(void *_completed)
 200{
 201}
 202
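     /*
      * Mark a TLB flush request on every vcpu of @kvm and IPI the physical
      * CPUs currently running a vcpu so the request is noticed right away.
      */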
 203void kvm_flush_remote_tlbs(struct kvm *kvm)
 204{
 205        int i, cpu;
 206        cpumask_t cpus;
 207        struct kvm_vcpu *vcpu;
 208
 209        cpus_clear(cpus);
 210        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 211                vcpu = kvm->vcpus[i];
 212                if (!vcpu)
 213                        continue;
 214                if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
 215                        continue;
 216                cpu = vcpu->cpu;
 217                if (cpu != -1 && cpu != raw_smp_processor_id())
 218                        cpu_set(cpu, cpus);
 219        }
 220        smp_call_function_mask(cpus, ack_flush, NULL, 1);
 221}
 222
 223int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 224{
 225        struct page *page;
 226        int r;
 227
 228        mutex_init(&vcpu->mutex);
 229        vcpu->cpu = -1;
 230        vcpu->mmu.root_hpa = INVALID_PAGE;
 231        vcpu->kvm = kvm;
 232        vcpu->vcpu_id = id;
 233        if (!irqchip_in_kernel(kvm) || id == 0)
 234                vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
 235        else
 236                vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
 237        init_waitqueue_head(&vcpu->wq);
 238
 239        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 240        if (!page) {
 241                r = -ENOMEM;
 242                goto fail;
 243        }
 244        vcpu->run = page_address(page);
 245
 246        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 247        if (!page) {
 248                r = -ENOMEM;
 249                goto fail_free_run;
 250        }
 251        vcpu->pio_data = page_address(page);
 252
 253        r = kvm_mmu_create(vcpu);
 254        if (r < 0)
 255                goto fail_free_pio_data;
 256
 257        return 0;
 258
 259fail_free_pio_data:
 260        free_page((unsigned long)vcpu->pio_data);
 261fail_free_run:
 262        free_page((unsigned long)vcpu->run);
 263fail:
  264        return r;
 265}
 266EXPORT_SYMBOL_GPL(kvm_vcpu_init);
 267
 268void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 269{
 270        kvm_mmu_destroy(vcpu);
 271        if (vcpu->apic)
 272                hrtimer_cancel(&vcpu->apic->timer.dev);
 273        kvm_free_apic(vcpu->apic);
 274        free_page((unsigned long)vcpu->pio_data);
 275        free_page((unsigned long)vcpu->run);
 276}
 277EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 278
 279static struct kvm *kvm_create_vm(void)
 280{
 281        struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
 282
 283        if (!kvm)
 284                return ERR_PTR(-ENOMEM);
 285
 286        kvm_io_bus_init(&kvm->pio_bus);
 287        mutex_init(&kvm->lock);
 288        INIT_LIST_HEAD(&kvm->active_mmu_pages);
 289        kvm_io_bus_init(&kvm->mmio_bus);
 290        spin_lock(&kvm_lock);
 291        list_add(&kvm->vm_list, &vm_list);
 292        spin_unlock(&kvm_lock);
 293        return kvm;
 294}
 295
 296/*
 297 * Free any memory in @free but not in @dont.
 298 */
 299static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 300                                  struct kvm_memory_slot *dont)
 301{
 302        int i;
 303
 304        if (!dont || free->phys_mem != dont->phys_mem)
 305                if (free->phys_mem) {
 306                        for (i = 0; i < free->npages; ++i)
 307                                if (free->phys_mem[i])
 308                                        __free_page(free->phys_mem[i]);
 309                        vfree(free->phys_mem);
 310                }
 311
 312        if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 313                vfree(free->dirty_bitmap);
 314
 315        free->phys_mem = NULL;
 316        free->npages = 0;
 317        free->dirty_bitmap = NULL;
 318}
 319
 320static void kvm_free_physmem(struct kvm *kvm)
 321{
 322        int i;
 323
 324        for (i = 0; i < kvm->nmemslots; ++i)
 325                kvm_free_physmem_slot(&kvm->memslots[i], NULL);
 326}
 327
 328static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
 329{
 330        int i;
 331
 332        for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
 333                if (vcpu->pio.guest_pages[i]) {
 334                        __free_page(vcpu->pio.guest_pages[i]);
 335                        vcpu->pio.guest_pages[i] = NULL;
 336                }
 337}
 338
 339static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 340{
 341        vcpu_load(vcpu);
 342        kvm_mmu_unload(vcpu);
 343        vcpu_put(vcpu);
 344}
 345
 346static void kvm_free_vcpus(struct kvm *kvm)
 347{
 348        unsigned int i;
 349
 350        /*
 351         * Unpin any mmu pages first.
 352         */
 353        for (i = 0; i < KVM_MAX_VCPUS; ++i)
 354                if (kvm->vcpus[i])
 355                        kvm_unload_vcpu_mmu(kvm->vcpus[i]);
 356        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 357                if (kvm->vcpus[i]) {
 358                        kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
 359                        kvm->vcpus[i] = NULL;
 360                }
 361        }
 362
 363}
 364
 365static void kvm_destroy_vm(struct kvm *kvm)
 366{
 367        spin_lock(&kvm_lock);
 368        list_del(&kvm->vm_list);
 369        spin_unlock(&kvm_lock);
 370        kvm_io_bus_destroy(&kvm->pio_bus);
 371        kvm_io_bus_destroy(&kvm->mmio_bus);
 372        kfree(kvm->vpic);
 373        kfree(kvm->vioapic);
 374        kvm_free_vcpus(kvm);
 375        kvm_free_physmem(kvm);
 376        kfree(kvm);
 377}
 378
 379static int kvm_vm_release(struct inode *inode, struct file *filp)
 380{
 381        struct kvm *kvm = filp->private_data;
 382
 383        kvm_destroy_vm(kvm);
 384        return 0;
 385}
 386
 387static void inject_gp(struct kvm_vcpu *vcpu)
 388{
 389        kvm_x86_ops->inject_gp(vcpu, 0);
 390}
 391
 392/*
  393 * Load the PAE pdptrs.  Return true if they are all valid.
 394 */
 395static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 396{
 397        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 398        unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 399        int i;
 400        u64 *pdpt;
 401        int ret;
 402        struct page *page;
 403        u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
 404
 405        mutex_lock(&vcpu->kvm->lock);
 406        page = gfn_to_page(vcpu->kvm, pdpt_gfn);
 407        if (!page) {
 408                ret = 0;
 409                goto out;
 410        }
 411
 412        pdpt = kmap_atomic(page, KM_USER0);
 413        memcpy(pdpte, pdpt+offset, sizeof(pdpte));
 414        kunmap_atomic(pdpt, KM_USER0);
 415
 416        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 417                if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
 418                        ret = 0;
 419                        goto out;
 420                }
 421        }
 422        ret = 1;
 423
 424        memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
 425out:
 426        mutex_unlock(&vcpu->kvm->lock);
 427
 428        return ret;
 429}
 430
 431void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 432{
 433        if (cr0 & CR0_RESERVED_BITS) {
 434                printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
 435                       cr0, vcpu->cr0);
 436                inject_gp(vcpu);
 437                return;
 438        }
 439
 440        if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
 441                printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
 442                inject_gp(vcpu);
 443                return;
 444        }
 445
 446        if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
 447                printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
 448                       "and a clear PE flag\n");
 449                inject_gp(vcpu);
 450                return;
 451        }
 452
 453        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 454#ifdef CONFIG_X86_64
 455                if ((vcpu->shadow_efer & EFER_LME)) {
 456                        int cs_db, cs_l;
 457
 458                        if (!is_pae(vcpu)) {
 459                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
 460                                       "in long mode while PAE is disabled\n");
 461                                inject_gp(vcpu);
 462                                return;
 463                        }
 464                        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 465                        if (cs_l) {
 466                                printk(KERN_DEBUG "set_cr0: #GP, start paging "
 467                                       "in long mode while CS.L == 1\n");
 468                                inject_gp(vcpu);
 469                                return;
 470
 471                        }
 472                } else
 473#endif
 474                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
 475                        printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
 476                               "reserved bits\n");
 477                        inject_gp(vcpu);
 478                        return;
 479                }
 480
 481        }
 482
 483        kvm_x86_ops->set_cr0(vcpu, cr0);
 484        vcpu->cr0 = cr0;
 485
 486        mutex_lock(&vcpu->kvm->lock);
 487        kvm_mmu_reset_context(vcpu);
 488        mutex_unlock(&vcpu->kvm->lock);
 489        return;
 490}
 491EXPORT_SYMBOL_GPL(set_cr0);
 492
 493void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 494{
 495        set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
 496}
 497EXPORT_SYMBOL_GPL(lmsw);
 498
 499void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 500{
 501        if (cr4 & CR4_RESERVED_BITS) {
 502                printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
 503                inject_gp(vcpu);
 504                return;
 505        }
 506
 507        if (is_long_mode(vcpu)) {
 508                if (!(cr4 & X86_CR4_PAE)) {
 509                        printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
 510                               "in long mode\n");
 511                        inject_gp(vcpu);
 512                        return;
 513                }
 514        } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
 515                   && !load_pdptrs(vcpu, vcpu->cr3)) {
 516                printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
 517                inject_gp(vcpu);
 518                return;
 519        }
 520
 521        if (cr4 & X86_CR4_VMXE) {
 522                printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
 523                inject_gp(vcpu);
 524                return;
 525        }
 526        kvm_x86_ops->set_cr4(vcpu, cr4);
 527        vcpu->cr4 = cr4;
 528        mutex_lock(&vcpu->kvm->lock);
 529        kvm_mmu_reset_context(vcpu);
 530        mutex_unlock(&vcpu->kvm->lock);
 531}
 532EXPORT_SYMBOL_GPL(set_cr4);
 533
 534void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 535{
 536        if (is_long_mode(vcpu)) {
 537                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
 538                        printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
 539                        inject_gp(vcpu);
 540                        return;
 541                }
 542        } else {
 543                if (is_pae(vcpu)) {
 544                        if (cr3 & CR3_PAE_RESERVED_BITS) {
 545                                printk(KERN_DEBUG
 546                                       "set_cr3: #GP, reserved bits\n");
 547                                inject_gp(vcpu);
 548                                return;
 549                        }
 550                        if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
 551                                printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
 552                                       "reserved bits\n");
 553                                inject_gp(vcpu);
 554                                return;
 555                        }
 556                } else {
 557                        if (cr3 & CR3_NONPAE_RESERVED_BITS) {
 558                                printk(KERN_DEBUG
 559                                       "set_cr3: #GP, reserved bits\n");
 560                                inject_gp(vcpu);
 561                                return;
 562                        }
 563                }
 564        }
 565
 566        mutex_lock(&vcpu->kvm->lock);
 567        /*
 568         * Does the new cr3 value map to physical memory? (Note, we
 569         * catch an invalid cr3 even in real-mode, because it would
 570         * cause trouble later on when we turn on paging anyway.)
 571         *
 572         * A real CPU would silently accept an invalid cr3 and would
 573         * attempt to use it - with largely undefined (and often hard
 574         * to debug) behavior on the guest side.
 575         */
 576        if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 577                inject_gp(vcpu);
 578        else {
 579                vcpu->cr3 = cr3;
 580                vcpu->mmu.new_cr3(vcpu);
 581        }
 582        mutex_unlock(&vcpu->kvm->lock);
 583}
 584EXPORT_SYMBOL_GPL(set_cr3);
 585
 586void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 587{
 588        if (cr8 & CR8_RESERVED_BITS) {
 589                printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
 590                inject_gp(vcpu);
 591                return;
 592        }
 593        if (irqchip_in_kernel(vcpu->kvm))
 594                kvm_lapic_set_tpr(vcpu, cr8);
 595        else
 596                vcpu->cr8 = cr8;
 597}
 598EXPORT_SYMBOL_GPL(set_cr8);
 599
 600unsigned long get_cr8(struct kvm_vcpu *vcpu)
 601{
 602        if (irqchip_in_kernel(vcpu->kvm))
 603                return kvm_lapic_get_cr8(vcpu);
 604        else
 605                return vcpu->cr8;
 606}
 607EXPORT_SYMBOL_GPL(get_cr8);
 608
 609u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 610{
  611        /* The base is cached in apic_base for both irqchip modes. */
  612        return vcpu->apic_base;
 615}
 616EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 617
 618void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 619{
  620        /* TODO: check reserved bits */
 621        if (irqchip_in_kernel(vcpu->kvm))
 622                kvm_lapic_set_base(vcpu, data);
 623        else
 624                vcpu->apic_base = data;
 625}
 626EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 627
 628void fx_init(struct kvm_vcpu *vcpu)
 629{
 630        unsigned after_mxcsr_mask;
 631
 632        /* Initialize guest FPU by resetting ours and saving into guest's */
 633        preempt_disable();
 634        fx_save(&vcpu->host_fx_image);
 635        fpu_init();
 636        fx_save(&vcpu->guest_fx_image);
 637        fx_restore(&vcpu->host_fx_image);
 638        preempt_enable();
 639
 640        vcpu->cr0 |= X86_CR0_ET;
 641        after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
 642        vcpu->guest_fx_image.mxcsr = 0x1f80;
 643        memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
 644               0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
 645}
 646EXPORT_SYMBOL_GPL(fx_init);
 647
 648/*
 649 * Allocate some memory and give it an address in the guest physical address
 650 * space.
 651 *
 652 * Discontiguous memory is allowed, mostly for framebuffers.
 653 */
 654static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 655                                          struct kvm_memory_region *mem)
 656{
 657        int r;
 658        gfn_t base_gfn;
 659        unsigned long npages;
 660        unsigned long i;
 661        struct kvm_memory_slot *memslot;
 662        struct kvm_memory_slot old, new;
 663
 664        r = -EINVAL;
 665        /* General sanity checks */
 666        if (mem->memory_size & (PAGE_SIZE - 1))
 667                goto out;
 668        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 669                goto out;
 670        if (mem->slot >= KVM_MEMORY_SLOTS)
 671                goto out;
 672        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
 673                goto out;
 674
 675        memslot = &kvm->memslots[mem->slot];
 676        base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
 677        npages = mem->memory_size >> PAGE_SHIFT;
 678
 679        if (!npages)
 680                mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
 681
 682        mutex_lock(&kvm->lock);
 683
 684        new = old = *memslot;
 685
 686        new.base_gfn = base_gfn;
 687        new.npages = npages;
 688        new.flags = mem->flags;
 689
 690        /* Disallow changing a memory slot's size. */
 691        r = -EINVAL;
 692        if (npages && old.npages && npages != old.npages)
 693                goto out_unlock;
 694
 695        /* Check for overlaps */
 696        r = -EEXIST;
 697        for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 698                struct kvm_memory_slot *s = &kvm->memslots[i];
 699
 700                if (s == memslot)
 701                        continue;
 702                if (!((base_gfn + npages <= s->base_gfn) ||
 703                      (base_gfn >= s->base_gfn + s->npages)))
 704                        goto out_unlock;
 705        }
 706
 707        /* Deallocate if slot is being removed */
 708        if (!npages)
 709                new.phys_mem = NULL;
 710
 711        /* Free page dirty bitmap if unneeded */
 712        if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
 713                new.dirty_bitmap = NULL;
 714
 715        r = -ENOMEM;
 716
 717        /* Allocate if a slot is being created */
 718        if (npages && !new.phys_mem) {
 719                new.phys_mem = vmalloc(npages * sizeof(struct page *));
 720
 721                if (!new.phys_mem)
 722                        goto out_unlock;
 723
 724                memset(new.phys_mem, 0, npages * sizeof(struct page *));
 725                for (i = 0; i < npages; ++i) {
 726                        new.phys_mem[i] = alloc_page(GFP_HIGHUSER
 727                                                     | __GFP_ZERO);
 728                        if (!new.phys_mem[i])
 729                                goto out_unlock;
  730                        set_page_private(new.phys_mem[i], 0);
 731                }
 732        }
 733
 734        /* Allocate page dirty bitmap if needed */
 735        if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
 736                unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
 737
 738                new.dirty_bitmap = vmalloc(dirty_bytes);
 739                if (!new.dirty_bitmap)
 740                        goto out_unlock;
 741                memset(new.dirty_bitmap, 0, dirty_bytes);
 742        }
 743
 744        if (mem->slot >= kvm->nmemslots)
 745                kvm->nmemslots = mem->slot + 1;
 746
 747        *memslot = new;
 748
 749        kvm_mmu_slot_remove_write_access(kvm, mem->slot);
 750        kvm_flush_remote_tlbs(kvm);
 751
 752        mutex_unlock(&kvm->lock);
 753
 754        kvm_free_physmem_slot(&old, &new);
 755        return 0;
 756
 757out_unlock:
 758        mutex_unlock(&kvm->lock);
 759        kvm_free_physmem_slot(&new, &old);
 760out:
 761        return r;
 762}
 763
 764/*
 765 * Get (and clear) the dirty memory log for a memory slot.
 766 */
 767static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 768                                      struct kvm_dirty_log *log)
 769{
 770        struct kvm_memory_slot *memslot;
 771        int r, i;
 772        int n;
 773        unsigned long any = 0;
 774
 775        mutex_lock(&kvm->lock);
 776
 777        r = -EINVAL;
 778        if (log->slot >= KVM_MEMORY_SLOTS)
 779                goto out;
 780
 781        memslot = &kvm->memslots[log->slot];
 782        r = -ENOENT;
 783        if (!memslot->dirty_bitmap)
 784                goto out;
 785
 786        n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 787
 788        for (i = 0; !any && i < n/sizeof(long); ++i)
 789                any = memslot->dirty_bitmap[i];
 790
 791        r = -EFAULT;
 792        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
 793                goto out;
 794
 795        /* If nothing is dirty, don't bother messing with page tables. */
 796        if (any) {
 797                kvm_mmu_slot_remove_write_access(kvm, log->slot);
 798                kvm_flush_remote_tlbs(kvm);
 799                memset(memslot->dirty_bitmap, 0, n);
 800        }
 801
 802        r = 0;
 803
 804out:
 805        mutex_unlock(&kvm->lock);
 806        return r;
 807}
 808
 809/*
 810 * Set a new alias region.  Aliases map a portion of physical memory into
 811 * another portion.  This is useful for memory windows, for example the PC
 812 * VGA region.
 813 */
 814static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 815                                         struct kvm_memory_alias *alias)
 816{
 817        int r, n;
 818        struct kvm_mem_alias *p;
 819
 820        r = -EINVAL;
 821        /* General sanity checks */
 822        if (alias->memory_size & (PAGE_SIZE - 1))
 823                goto out;
 824        if (alias->guest_phys_addr & (PAGE_SIZE - 1))
 825                goto out;
 826        if (alias->slot >= KVM_ALIAS_SLOTS)
 827                goto out;
 828        if (alias->guest_phys_addr + alias->memory_size
 829            < alias->guest_phys_addr)
 830                goto out;
 831        if (alias->target_phys_addr + alias->memory_size
 832            < alias->target_phys_addr)
 833                goto out;
 834
 835        mutex_lock(&kvm->lock);
 836
 837        p = &kvm->aliases[alias->slot];
 838        p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
 839        p->npages = alias->memory_size >> PAGE_SHIFT;
 840        p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
 841
 842        for (n = KVM_ALIAS_SLOTS; n > 0; --n)
 843                if (kvm->aliases[n - 1].npages)
 844                        break;
 845        kvm->naliases = n;
 846
 847        kvm_mmu_zap_all(kvm);
 848
 849        mutex_unlock(&kvm->lock);
 850
 851        return 0;
 852
 853out:
 854        return r;
 855}
 856
 857static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 858{
 859        int r;
 860
 861        r = 0;
 862        switch (chip->chip_id) {
 863        case KVM_IRQCHIP_PIC_MASTER:
  864                memcpy(&chip->chip.pic,
 865                        &pic_irqchip(kvm)->pics[0],
 866                        sizeof(struct kvm_pic_state));
 867                break;
 868        case KVM_IRQCHIP_PIC_SLAVE:
  869                memcpy(&chip->chip.pic,
 870                        &pic_irqchip(kvm)->pics[1],
 871                        sizeof(struct kvm_pic_state));
 872                break;
 873        case KVM_IRQCHIP_IOAPIC:
  874                memcpy(&chip->chip.ioapic,
 875                        ioapic_irqchip(kvm),
 876                        sizeof(struct kvm_ioapic_state));
 877                break;
 878        default:
 879                r = -EINVAL;
 880                break;
 881        }
 882        return r;
 883}
 884
 885static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 886{
 887        int r;
 888
 889        r = 0;
 890        switch (chip->chip_id) {
 891        case KVM_IRQCHIP_PIC_MASTER:
  892                memcpy(&pic_irqchip(kvm)->pics[0],
 893                        &chip->chip.pic,
 894                        sizeof(struct kvm_pic_state));
 895                break;
 896        case KVM_IRQCHIP_PIC_SLAVE:
  897                memcpy(&pic_irqchip(kvm)->pics[1],
 898                        &chip->chip.pic,
 899                        sizeof(struct kvm_pic_state));
 900                break;
 901        case KVM_IRQCHIP_IOAPIC:
  902                memcpy(ioapic_irqchip(kvm),
 903                        &chip->chip.ioapic,
 904                        sizeof(struct kvm_ioapic_state));
 905                break;
 906        default:
 907                r = -EINVAL;
 908                break;
 909        }
 910        kvm_pic_update_irq(pic_irqchip(kvm));
 911        return r;
 912}
 913
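     /*
      * Translate @gfn through the VM's alias table; frames outside every
      * alias region map to themselves.
      */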
 914static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 915{
 916        int i;
 917        struct kvm_mem_alias *alias;
 918
 919        for (i = 0; i < kvm->naliases; ++i) {
 920                alias = &kvm->aliases[i];
 921                if (gfn >= alias->base_gfn
 922                    && gfn < alias->base_gfn + alias->npages)
 923                        return alias->target_gfn + gfn - alias->base_gfn;
 924        }
 925        return gfn;
 926}
 927
 928static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 929{
 930        int i;
 931
 932        for (i = 0; i < kvm->nmemslots; ++i) {
 933                struct kvm_memory_slot *memslot = &kvm->memslots[i];
 934
 935                if (gfn >= memslot->base_gfn
 936                    && gfn < memslot->base_gfn + memslot->npages)
 937                        return memslot;
 938        }
 939        return NULL;
 940}
 941
 942struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 943{
 944        gfn = unalias_gfn(kvm, gfn);
 945        return __gfn_to_memslot(kvm, gfn);
 946}
 947
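     /* Return the host page backing guest frame @gfn, or NULL if unmapped. */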
 948struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 949{
 950        struct kvm_memory_slot *slot;
 951
 952        gfn = unalias_gfn(kvm, gfn);
 953        slot = __gfn_to_memslot(kvm, gfn);
 954        if (!slot)
 955                return NULL;
 956        return slot->phys_mem[gfn - slot->base_gfn];
 957}
 958EXPORT_SYMBOL_GPL(gfn_to_page);
 959
 960/* WARNING: Does not work on aliased pages. */
 961void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 962{
 963        struct kvm_memory_slot *memslot;
 964
 965        memslot = __gfn_to_memslot(kvm, gfn);
 966        if (memslot && memslot->dirty_bitmap) {
 967                unsigned long rel_gfn = gfn - memslot->base_gfn;
 968
 969                /* avoid RMW */
 970                if (!test_bit(rel_gfn, memslot->dirty_bitmap))
 971                        set_bit(rel_gfn, memslot->dirty_bitmap);
 972        }
 973}
 974
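     /*
      * Copy @bytes from guest-virtual @addr into @val, translating through
      * the guest page tables and crossing page boundaries as needed.
      */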
 975int emulator_read_std(unsigned long addr,
 976                             void *val,
 977                             unsigned int bytes,
 978                             struct kvm_vcpu *vcpu)
 979{
 980        void *data = val;
 981
 982        while (bytes) {
 983                gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
 984                unsigned offset = addr & (PAGE_SIZE-1);
 985                unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
 986                unsigned long pfn;
 987                struct page *page;
 988                void *page_virt;
 989
 990                if (gpa == UNMAPPED_GVA)
 991                        return X86EMUL_PROPAGATE_FAULT;
 992                pfn = gpa >> PAGE_SHIFT;
 993                page = gfn_to_page(vcpu->kvm, pfn);
 994                if (!page)
 995                        return X86EMUL_UNHANDLEABLE;
 996                page_virt = kmap_atomic(page, KM_USER0);
 997
 998                memcpy(data, page_virt + offset, tocopy);
 999
1000                kunmap_atomic(page_virt, KM_USER0);
1001
1002                bytes -= tocopy;
1003                data += tocopy;
1004                addr += tocopy;
1005        }
1006
1007        return X86EMUL_CONTINUE;
1008}
1009EXPORT_SYMBOL_GPL(emulator_read_std);
1010
1011static int emulator_write_std(unsigned long addr,
1012                              const void *val,
1013                              unsigned int bytes,
1014                              struct kvm_vcpu *vcpu)
1015{
1016        pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1017        return X86EMUL_UNHANDLEABLE;
1018}
1019
1020/*
 1021 * Only the APIC needs an MMIO device hook, so take a shortcut for now.
1022 */
1023static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1024                                                gpa_t addr)
1025{
1026        struct kvm_io_device *dev;
1027
1028        if (vcpu->apic) {
1029                dev = &vcpu->apic->dev;
1030                if (dev->in_range(dev, addr))
1031                        return dev;
1032        }
1033        return NULL;
1034}
1035
1036static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1037                                                gpa_t addr)
1038{
1039        struct kvm_io_device *dev;
1040
1041        dev = vcpu_find_pervcpu_dev(vcpu, addr);
1042        if (dev == NULL)
1043                dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1044        return dev;
1045}
1046
1047static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1048                                               gpa_t addr)
1049{
1050        return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1051}
1052
1053static int emulator_read_emulated(unsigned long addr,
1054                                  void *val,
1055                                  unsigned int bytes,
1056                                  struct kvm_vcpu *vcpu)
1057{
1058        struct kvm_io_device *mmio_dev;
1059        gpa_t                 gpa;
1060
1061        if (vcpu->mmio_read_completed) {
1062                memcpy(val, vcpu->mmio_data, bytes);
1063                vcpu->mmio_read_completed = 0;
1064                return X86EMUL_CONTINUE;
1065        } else if (emulator_read_std(addr, val, bytes, vcpu)
1066                   == X86EMUL_CONTINUE)
1067                return X86EMUL_CONTINUE;
1068
1069        gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1070        if (gpa == UNMAPPED_GVA)
1071                return X86EMUL_PROPAGATE_FAULT;
1072
1073        /*
1074         * Is this MMIO handled locally?
1075         */
1076        mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1077        if (mmio_dev) {
1078                kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1079                return X86EMUL_CONTINUE;
1080        }
1081
1082        vcpu->mmio_needed = 1;
1083        vcpu->mmio_phys_addr = gpa;
1084        vcpu->mmio_size = bytes;
1085        vcpu->mmio_is_write = 0;
1086
1087        return X86EMUL_UNHANDLEABLE;
1088}
1089
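     /*
      * Write @bytes directly into guest RAM if the range fits in one page,
      * keeping the shadow page tables in sync via kvm_mmu_pte_write().
      */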
1090static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1091                               const void *val, int bytes)
1092{
1093        struct page *page;
1094        void *virt;
1095
1096        if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1097                return 0;
1098        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1099        if (!page)
1100                return 0;
1101        mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1102        virt = kmap_atomic(page, KM_USER0);
1103        kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1104        memcpy(virt + offset_in_page(gpa), val, bytes);
1105        kunmap_atomic(virt, KM_USER0);
1106        return 1;
1107}
1108
1109static int emulator_write_emulated_onepage(unsigned long addr,
1110                                           const void *val,
1111                                           unsigned int bytes,
1112                                           struct kvm_vcpu *vcpu)
1113{
1114        struct kvm_io_device *mmio_dev;
1115        gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1116
1117        if (gpa == UNMAPPED_GVA) {
1118                kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1119                return X86EMUL_PROPAGATE_FAULT;
1120        }
1121
1122        if (emulator_write_phys(vcpu, gpa, val, bytes))
1123                return X86EMUL_CONTINUE;
1124
1125        /*
1126         * Is this MMIO handled locally?
1127         */
1128        mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1129        if (mmio_dev) {
1130                kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1131                return X86EMUL_CONTINUE;
1132        }
1133
1134        vcpu->mmio_needed = 1;
1135        vcpu->mmio_phys_addr = gpa;
1136        vcpu->mmio_size = bytes;
1137        vcpu->mmio_is_write = 1;
1138        memcpy(vcpu->mmio_data, val, bytes);
1139
1140        return X86EMUL_CONTINUE;
1141}
1142
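     /*
      * Split a write that crosses a page boundary into two single-page
      * writes, since each page may translate to a different physical page.
      */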
1143int emulator_write_emulated(unsigned long addr,
1144                                   const void *val,
1145                                   unsigned int bytes,
1146                                   struct kvm_vcpu *vcpu)
1147{
1148        /* Crossing a page boundary? */
1149        if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1150                int rc, now;
1151
1152                now = -addr & ~PAGE_MASK;
1153                rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1154                if (rc != X86EMUL_CONTINUE)
1155                        return rc;
1156                addr += now;
1157                val += now;
1158                bytes -= now;
1159        }
1160        return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1161}
1162EXPORT_SYMBOL_GPL(emulator_write_emulated);
1163
1164static int emulator_cmpxchg_emulated(unsigned long addr,
1165                                     const void *old,
1166                                     const void *new,
1167                                     unsigned int bytes,
1168                                     struct kvm_vcpu *vcpu)
1169{
1170        static int reported;
1171
1172        if (!reported) {
1173                reported = 1;
1174                printk(KERN_WARNING "kvm: emulating exchange as write\n");
1175        }
1176        return emulator_write_emulated(addr, new, bytes, vcpu);
1177}
1178
1179static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1180{
1181        return kvm_x86_ops->get_segment_base(vcpu, seg);
1182}
1183
1184int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1185{
1186        return X86EMUL_CONTINUE;
1187}
1188
1189int emulate_clts(struct kvm_vcpu *vcpu)
1190{
1191        kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
1192        return X86EMUL_CONTINUE;
1193}
1194
 1195int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1196{
1197        struct kvm_vcpu *vcpu = ctxt->vcpu;
1198
1199        switch (dr) {
1200        case 0 ... 3:
1201                *dest = kvm_x86_ops->get_dr(vcpu, dr);
1202                return X86EMUL_CONTINUE;
1203        default:
1204                pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1205                return X86EMUL_UNHANDLEABLE;
1206        }
1207}
1208
1209int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1210{
1211        unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1212        int exception;
1213
1214        kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1215        if (exception) {
1216                /* FIXME: better handling */
1217                return X86EMUL_UNHANDLEABLE;
1218        }
1219        return X86EMUL_CONTINUE;
1220}
1221
1222void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1223{
1224        static int reported;
1225        u8 opcodes[4];
1226        unsigned long rip = vcpu->rip;
1227        unsigned long rip_linear;
1228
1229        rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1230
1231        if (reported)
1232                return;
1233
1234        emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1235
1236        printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1237               context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1238        reported = 1;
1239}
1240EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1241
1242struct x86_emulate_ops emulate_ops = {
1243        .read_std            = emulator_read_std,
1244        .write_std           = emulator_write_std,
1245        .read_emulated       = emulator_read_emulated,
1246        .write_emulated      = emulator_write_emulated,
1247        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1248};
1249
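     /*
      * Emulate one guest instruction.  Returns EMULATE_DONE on success,
      * EMULATE_DO_MMIO when userspace must complete an MMIO/PIO access,
      * or EMULATE_FAIL if the instruction cannot be handled.
      */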
1250int emulate_instruction(struct kvm_vcpu *vcpu,
1251                        struct kvm_run *run,
1252                        unsigned long cr2,
1253                        u16 error_code)
1254{
1255        struct x86_emulate_ctxt emulate_ctxt;
1256        int r;
1257        int cs_db, cs_l;
1258
1259        vcpu->mmio_fault_cr2 = cr2;
1260        kvm_x86_ops->cache_regs(vcpu);
1261
1262        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1263
1264        emulate_ctxt.vcpu = vcpu;
1265        emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1266        emulate_ctxt.cr2 = cr2;
1267        emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1268                ? X86EMUL_MODE_REAL : cs_l
1269                ? X86EMUL_MODE_PROT64 : cs_db
1270                ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1271
1272        if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1273                emulate_ctxt.cs_base = 0;
1274                emulate_ctxt.ds_base = 0;
1275                emulate_ctxt.es_base = 0;
1276                emulate_ctxt.ss_base = 0;
1277        } else {
1278                emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1279                emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1280                emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1281                emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1282        }
1283
1284        emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1285        emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1286
1287        vcpu->mmio_is_write = 0;
1288        vcpu->pio.string = 0;
1289        r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1290        if (vcpu->pio.string)
1291                return EMULATE_DO_MMIO;
1292
1293        if ((r || vcpu->mmio_is_write) && run) {
1294                run->exit_reason = KVM_EXIT_MMIO;
1295                run->mmio.phys_addr = vcpu->mmio_phys_addr;
1296                memcpy(run->mmio.data, vcpu->mmio_data, 8);
1297                run->mmio.len = vcpu->mmio_size;
1298                run->mmio.is_write = vcpu->mmio_is_write;
1299        }
1300
1301        if (r) {
1302                if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1303                        return EMULATE_DONE;
1304                if (!vcpu->mmio_needed) {
1305                        kvm_report_emulation_failure(vcpu, "mmio");
1306                        return EMULATE_FAIL;
1307                }
1308                return EMULATE_DO_MMIO;
1309        }
1310
1311        kvm_x86_ops->decache_regs(vcpu);
1312        kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1313
1314        if (vcpu->mmio_is_write) {
1315                vcpu->mmio_needed = 0;
1316                return EMULATE_DO_MMIO;
1317        }
1318
1319        return EMULATE_DONE;
1320}
1321EXPORT_SYMBOL_GPL(emulate_instruction);
1322
1323/*
 1324 * The vcpu has executed an HLT instruction with the in-kernel irqchip enabled.
1325 */
1326static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1327{
1328        DECLARE_WAITQUEUE(wait, current);
1329
1330        add_wait_queue(&vcpu->wq, &wait);
1331
1332        /*
1333         * We will block until either an interrupt or a signal wakes us up
1334         */
1335        while (!kvm_cpu_has_interrupt(vcpu)
1336               && !signal_pending(current)
1337               && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1338               && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1339                set_current_state(TASK_INTERRUPTIBLE);
1340                vcpu_put(vcpu);
1341                schedule();
1342                vcpu_load(vcpu);
1343        }
1344
1345        __set_current_state(TASK_RUNNING);
1346        remove_wait_queue(&vcpu->wq, &wait);
1347}
1348
1349int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1350{
1351        ++vcpu->stat.halt_exits;
1352        if (irqchip_in_kernel(vcpu->kvm)) {
1353                vcpu->mp_state = VCPU_MP_STATE_HALTED;
1354                kvm_vcpu_block(vcpu);
1355                if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1356                        return -EINTR;
1357                return 1;
1358        } else {
1359                vcpu->run->exit_reason = KVM_EXIT_HLT;
1360                return 0;
1361        }
1362}
1363EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1364
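     /*
      * Dispatch a guest hypercall.  Arguments are taken from the registers
      * of the hypercall ABI; unrecognized numbers are handed to userspace
      * through run->hypercall.
      */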
1365int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1366{
1367        unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1368
1369        kvm_x86_ops->cache_regs(vcpu);
1370        ret = -KVM_EINVAL;
1371#ifdef CONFIG_X86_64
1372        if (is_long_mode(vcpu)) {
1373                nr = vcpu->regs[VCPU_REGS_RAX];
1374                a0 = vcpu->regs[VCPU_REGS_RDI];
1375                a1 = vcpu->regs[VCPU_REGS_RSI];
1376                a2 = vcpu->regs[VCPU_REGS_RDX];
1377                a3 = vcpu->regs[VCPU_REGS_RCX];
1378                a4 = vcpu->regs[VCPU_REGS_R8];
1379                a5 = vcpu->regs[VCPU_REGS_R9];
1380        } else
1381#endif
1382        {
1383                nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1384                a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1385                a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1386                a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1387                a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1388                a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1389                a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1390        }
1391        switch (nr) {
1392        default:
1393                run->hypercall.nr = nr;
1394                run->hypercall.args[0] = a0;
1395                run->hypercall.args[1] = a1;
1396                run->hypercall.args[2] = a2;
1397                run->hypercall.args[3] = a3;
1398                run->hypercall.args[4] = a4;
1399                run->hypercall.args[5] = a5;
1400                run->hypercall.ret = ret;
1401                run->hypercall.longmode = is_long_mode(vcpu);
1402                kvm_x86_ops->decache_regs(vcpu);
1403                return 0;
1404        }
1405        vcpu->regs[VCPU_REGS_RAX] = ret;
1406        kvm_x86_ops->decache_regs(vcpu);
1407        return 1;
1408}
1409EXPORT_SYMBOL_GPL(kvm_hypercall);
1410
1411static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1412{
1413        return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1414}
1415
1416void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1417{
1418        struct descriptor_table dt = { limit, base };
1419
1420        kvm_x86_ops->set_gdt(vcpu, &dt);
1421}
1422
1423void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1424{
1425        struct descriptor_table dt = { limit, base };
1426
1427        kvm_x86_ops->set_idt(vcpu, &dt);
1428}
1429
1430void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1431                   unsigned long *rflags)
1432{
1433        lmsw(vcpu, msw);
1434        *rflags = kvm_x86_ops->get_rflags(vcpu);
1435}
1436
1437unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1438{
1439        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1440        switch (cr) {
1441        case 0:
1442                return vcpu->cr0;
1443        case 2:
1444                return vcpu->cr2;
1445        case 3:
1446                return vcpu->cr3;
1447        case 4:
1448                return vcpu->cr4;
1449        default:
1450                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1451                return 0;
1452        }
1453}
1454
1455void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1456                     unsigned long *rflags)
1457{
1458        switch (cr) {
1459        case 0:
1460                set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1461                *rflags = kvm_x86_ops->get_rflags(vcpu);
1462                break;
1463        case 2:
1464                vcpu->cr2 = val;
1465                break;
1466        case 3:
1467                set_cr3(vcpu, val);
1468                break;
1469        case 4:
1470                set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1471                break;
1472        default:
1473                vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1474        }
1475}
1476
1477/*
1478 * Register the para guest with the host:
1479 */
1480static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1481{
1482        struct kvm_vcpu_para_state *para_state;
1483        hpa_t para_state_hpa, hypercall_hpa;
1484        struct page *para_state_page;
1485        unsigned char *hypercall;
1486        gpa_t hypercall_gpa;
1487
1488        printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1489        printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1490
1491        /*
1492         * Needs to be page aligned:
1493         */
1494        if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1495                goto err_gp;
1496
1497        para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1498        printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1499        if (is_error_hpa(para_state_hpa))
1500                goto err_gp;
1501
1502        mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1503        para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1504        para_state = kmap(para_state_page);
1505
1506        printk(KERN_DEBUG "....  guest version: %d\n", para_state->guest_version);
1507        printk(KERN_DEBUG "....           size: %d\n", para_state->size);
1508
1509        para_state->host_version = KVM_PARA_API_VERSION;
1510        /*
1511         * We cannot support guests that try to register themselves
1512         * with a newer API version than the host supports:
1513         */
1514        if (para_state->guest_version > KVM_PARA_API_VERSION) {
1515                para_state->ret = -KVM_EINVAL;
1516                goto err_kunmap_skip;
1517        }
1518
1519        hypercall_gpa = para_state->hypercall_gpa;
1520        hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1521        printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1522        if (is_error_hpa(hypercall_hpa)) {
1523                para_state->ret = -KVM_EINVAL;
1524                goto err_kunmap_skip;
1525        }
1526
1527        printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1528        vcpu->para_state_page = para_state_page;
1529        vcpu->para_state_gpa = para_state_gpa;
1530        vcpu->hypercall_gpa = hypercall_gpa;
1531
1532        mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1533        hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1534                                KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1535        kvm_x86_ops->patch_hypercall(vcpu, hypercall);
1536        kunmap_atomic(hypercall, KM_USER1);
1537
1538        para_state->ret = 0;
1539err_kunmap_skip:
1540        kunmap(para_state_page);
1541        return 0;
1542err_gp:
1543        return 1;
1544}
1545
1546int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1547{
1548        u64 data;
1549
1550        switch (msr) {
1551        case 0xc0010010: /* SYSCFG */
1552        case 0xc0010015: /* HWCR */
1553        case MSR_IA32_PLATFORM_ID:
1554        case MSR_IA32_P5_MC_ADDR:
1555        case MSR_IA32_P5_MC_TYPE:
1556        case MSR_IA32_MC0_CTL:
1557        case MSR_IA32_MCG_STATUS:
1558        case MSR_IA32_MCG_CAP:
1559        case MSR_IA32_MC0_MISC:
1560        case MSR_IA32_MC0_MISC+4:
1561        case MSR_IA32_MC0_MISC+8:
1562        case MSR_IA32_MC0_MISC+12:
1563        case MSR_IA32_MC0_MISC+16:
1564        case MSR_IA32_UCODE_REV:
1565        case MSR_IA32_PERF_STATUS:
1566        case MSR_IA32_EBL_CR_POWERON:
1567                /* MTRR registers */
1568        case 0xfe:
1569        case 0x200 ... 0x2ff:
1570                data = 0;
1571                break;
1572        case 0xcd: /* fsb frequency */
1573                data = 3;
1574                break;
1575        case MSR_IA32_APICBASE:
1576                data = kvm_get_apic_base(vcpu);
1577                break;
1578        case MSR_IA32_MISC_ENABLE:
1579                data = vcpu->ia32_misc_enable_msr;
1580                break;
1581#ifdef CONFIG_X86_64
1582        case MSR_EFER:
1583                data = vcpu->shadow_efer;
1584                break;
1585#endif
1586        default:
1587                pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1588                return 1;
1589        }
1590        *pdata = data;
1591        return 0;
1592}
1593EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1594
1595/*
1596 * Reads an msr value (of 'msr_index') into 'pdata'.
1597 * Returns 0 on success, non-0 otherwise.
1598 * Assumes vcpu_load() was already called.
1599 */
1600int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1601{
1602        return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1603}
1604
1605#ifdef CONFIG_X86_64
1606
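     /*
      * Validate and install a new EFER value; the LMA bit always reflects
      * the current shadow_efer, never the value written by the guest.
      */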
1607static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1608{
1609        if (efer & EFER_RESERVED_BITS) {
1610                printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1611                       efer);
1612                inject_gp(vcpu);
1613                return;
1614        }
1615
1616        if (is_paging(vcpu)
1617            && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1618                printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1619                inject_gp(vcpu);
1620                return;
1621        }
1622
1623        kvm_x86_ops->set_efer(vcpu, efer);
1624
1625        efer &= ~EFER_LMA;
1626        efer |= vcpu->shadow_efer & EFER_LMA;
1627
1628        vcpu->shadow_efer = efer;
1629}
1630
1631#endif
1632
1633int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1634{
1635        switch (msr) {
1636#ifdef CONFIG_X86_64
1637        case MSR_EFER:
1638                set_efer(vcpu, data);
1639                break;
1640#endif
1641        case MSR_IA32_MC0_STATUS:
1642                pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1643                       __FUNCTION__, data);
1644                break;
1645        case MSR_IA32_MCG_STATUS:
1646                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1647                        __FUNCTION__, data);
1648                break;
1649        case MSR_IA32_UCODE_REV:
1650        case MSR_IA32_UCODE_WRITE:
1651        case 0x200 ... 0x2ff: /* MTRRs */
1652                break;
1653        case MSR_IA32_APICBASE:
1654                kvm_set_apic_base(vcpu, data);
1655                break;
1656        case MSR_IA32_MISC_ENABLE:
1657                vcpu->ia32_misc_enable_msr = data;
1658                break;
1659        /*
1660         * This is the 'probe whether the host is KVM' logic:
1661         */
1662        case MSR_KVM_API_MAGIC:
1663                return vcpu_register_para(vcpu, data);
1664
1665        default:
1666                pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1667                return 1;
1668        }
1669        return 0;
1670}
1671EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1672
1673/*
1674 * Writes an msr value into the appropriate "register".
1675 * Returns 0 on success, non-0 otherwise.
1676 * Assumes vcpu_load() was already called.
1677 */
1678int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1679{
1680        return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1681}
1682
1683void kvm_resched(struct kvm_vcpu *vcpu)
1684{
1685        if (!need_resched())
1686                return;
1687        cond_resched();
1688}
1689EXPORT_SYMBOL_GPL(kvm_resched);
1690
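/*
 * CPUID emulation: prefer an exact match on the requested leaf; failing
 * that, fall back to the highest-numbered entry in the same class (basic
 * vs. extended, distinguished by bit 31), which approximates how real
 * hardware answers out-of-range leaves.
 */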
1691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1692{
1693        int i;
1694        u32 function;
1695        struct kvm_cpuid_entry *e, *best;
1696
1697        kvm_x86_ops->cache_regs(vcpu);
1698        function = vcpu->regs[VCPU_REGS_RAX];
1699        vcpu->regs[VCPU_REGS_RAX] = 0;
1700        vcpu->regs[VCPU_REGS_RBX] = 0;
1701        vcpu->regs[VCPU_REGS_RCX] = 0;
1702        vcpu->regs[VCPU_REGS_RDX] = 0;
1703        best = NULL;
1704        for (i = 0; i < vcpu->cpuid_nent; ++i) {
1705                e = &vcpu->cpuid_entries[i];
1706                if (e->function == function) {
1707                        best = e;
1708                        break;
1709                }
1710                /*
1711                 * Both basic or both extended?
1712                 */
1713                if (((e->function ^ function) & 0x80000000) == 0)
1714                        if (!best || e->function > best->function)
1715                                best = e;
1716        }
1717        if (best) {
1718                vcpu->regs[VCPU_REGS_RAX] = best->eax;
1719                vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1720                vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1721                vcpu->regs[VCPU_REGS_RDX] = best->edx;
1722        }
1723        kvm_x86_ops->decache_regs(vcpu);
1724        kvm_x86_ops->skip_emulated_instruction(vcpu);
1725}
1726EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1727
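/*
 * Copy string I/O data between the vcpu's pio_data page (shared with
 * userspace) and the pinned guest pages: for IN the data is copied into
 * guest memory, for OUT it is gathered from guest memory.  The guest
 * pages are vmap()ed for the copy and unpinned before returning.
 */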
1728static int pio_copy_data(struct kvm_vcpu *vcpu)
1729{
1730        void *p = vcpu->pio_data;
1731        void *q;
1732        unsigned bytes;
1733        int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1734
1735        q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1736                 PAGE_KERNEL);
1737        if (!q) {
1738                free_pio_guest_pages(vcpu);
1739                return -ENOMEM;
1740        }
1741        q += vcpu->pio.guest_page_offset;
1742        bytes = vcpu->pio.size * vcpu->pio.cur_count;
1743        if (vcpu->pio.in)
1744                memcpy(q, p, bytes);
1745        else
1746                memcpy(p, q, bytes);
1747        q -= vcpu->pio.guest_page_offset;
1748        vunmap(q);
1749        free_pio_guest_pages(vcpu);
1750        return 0;
1751}
1752
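/*
 * Finish a (possibly partial) PIO operation: for a non-string IN, latch
 * the result into RAX; for string I/O, copy any IN data to the guest,
 * then advance RDI (INS) or RSI (OUTS), and RCX for REP, by the number
 * of bytes transferred, honoring the direction flag.  Finally reduce the
 * outstanding count.
 */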
1753static int complete_pio(struct kvm_vcpu *vcpu)
1754{
1755        struct kvm_pio_request *io = &vcpu->pio;
1756        long delta;
1757        int r;
1758
1759        kvm_x86_ops->cache_regs(vcpu);
1760
1761        if (!io->string) {
1762                if (io->in)
1763                        memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1764                               io->size);
1765        } else {
1766                if (io->in) {
1767                        r = pio_copy_data(vcpu);
1768                        if (r) {
1769                                kvm_x86_ops->cache_regs(vcpu);
1770                                return r;
1771                        }
1772                }
1773
1774                delta = 1;
1775                if (io->rep) {
1776                        delta *= io->cur_count;
1777                        /*
1778                         * The size of the register should really depend on
1779                         * current address size.
1780                         */
1781                        vcpu->regs[VCPU_REGS_RCX] -= delta;
1782                }
1783                if (io->down)
1784                        delta = -delta;
1785                delta *= io->size;
1786                if (io->in)
1787                        vcpu->regs[VCPU_REGS_RDI] += delta;
1788                else
1789                        vcpu->regs[VCPU_REGS_RSI] += delta;
1790        }
1791
1792        kvm_x86_ops->decache_regs(vcpu);
1793
1794        io->count -= io->cur_count;
1795        io->cur_count = 0;
1796
1797        return 0;
1798}
1799
1800static void kernel_pio(struct kvm_io_device *pio_dev,
1801                       struct kvm_vcpu *vcpu,
1802                       void *pd)
1803{
1804        /* TODO: String I/O for in kernel device */
1805
1806        mutex_lock(&vcpu->kvm->lock);
1807        if (vcpu->pio.in)
1808                kvm_iodevice_read(pio_dev, vcpu->pio.port,
1809                                  vcpu->pio.size,
1810                                  pd);
1811        else
1812                kvm_iodevice_write(pio_dev, vcpu->pio.port,
1813                                   vcpu->pio.size,
1814                                   pd);
1815        mutex_unlock(&vcpu->kvm->lock);
1816}
1817
1818static void pio_string_write(struct kvm_io_device *pio_dev,
1819                             struct kvm_vcpu *vcpu)
1820{
1821        struct kvm_pio_request *io = &vcpu->pio;
1822        void *pd = vcpu->pio_data;
1823        int i;
1824
1825        mutex_lock(&vcpu->kvm->lock);
1826        for (i = 0; i < io->cur_count; i++) {
1827                kvm_iodevice_write(pio_dev, io->port,
1828                                   io->size,
1829                                   pd);
1830                pd += io->size;
1831        }
1832        mutex_unlock(&vcpu->kvm->lock);
1833}
1834
1835int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1836                  int size, unsigned port)
1837{
1838        struct kvm_io_device *pio_dev;
1839
1840        vcpu->run->exit_reason = KVM_EXIT_IO;
1841        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1842        vcpu->run->io.size = vcpu->pio.size = size;
1843        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1844        vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1845        vcpu->run->io.port = vcpu->pio.port = port;
1846        vcpu->pio.in = in;
1847        vcpu->pio.string = 0;
1848        vcpu->pio.down = 0;
1849        vcpu->pio.guest_page_offset = 0;
1850        vcpu->pio.rep = 0;
1851
1852        kvm_x86_ops->cache_regs(vcpu);
1853        memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1854        kvm_x86_ops->decache_regs(vcpu);
1855
1856        kvm_x86_ops->skip_emulated_instruction(vcpu);
1857
1858        pio_dev = vcpu_find_pio_dev(vcpu, port);
1859        if (pio_dev) {
1860                kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1861                complete_pio(vcpu);
1862                return 1;
1863        }
1864        return 0;
1865}
1866EXPORT_SYMBOL_GPL(kvm_emulate_pio);
1867
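/*
 * String PIO: record the request in vcpu->pio, pin one guest page (or two
 * when the buffer straddles a page boundary) and, for writes to an
 * in-kernel device, perform the transfer immediately.  A return value of
 * 1 means the instruction was fully handled (or a fault was injected);
 * 0 means userspace must complete the KVM_EXIT_IO exit.
 */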
1868int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1869                  int size, unsigned long count, int down,
1870                  gva_t address, int rep, unsigned port)
1871{
1872        unsigned now, in_page;
1873        int i, ret = 0;
1874        int nr_pages = 1;
1875        struct page *page;
1876        struct kvm_io_device *pio_dev;
1877
1878        vcpu->run->exit_reason = KVM_EXIT_IO;
1879        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1880        vcpu->run->io.size = vcpu->pio.size = size;
1881        vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1882        vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
1883        vcpu->run->io.port = vcpu->pio.port = port;
1884        vcpu->pio.in = in;
1885        vcpu->pio.string = 1;
1886        vcpu->pio.down = down;
1887        vcpu->pio.guest_page_offset = offset_in_page(address);
1888        vcpu->pio.rep = rep;
1889
1890        if (!count) {
1891                kvm_x86_ops->skip_emulated_instruction(vcpu);
1892                return 1;
1893        }
1894
1895        if (!down)
1896                in_page = PAGE_SIZE - offset_in_page(address);
1897        else
1898                in_page = offset_in_page(address) + size;
1899        now = min(count, (unsigned long)in_page / size);
1900        if (!now) {
1901                /*
1902                 * String I/O straddles page boundary.  Pin two guest pages
1903                 * so that we satisfy atomicity constraints.  Do just one
1904                 * transaction to avoid complexity.
1905                 */
1906                nr_pages = 2;
1907                now = 1;
1908        }
1909        if (down) {
1910                /*
1911                 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1912                 */
1913                pr_unimpl(vcpu, "guest string pio down\n");
1914                inject_gp(vcpu);
1915                return 1;
1916        }
1917        vcpu->run->io.count = now;
1918        vcpu->pio.cur_count = now;
1919
1920        if (vcpu->pio.cur_count == vcpu->pio.count)
1921                kvm_x86_ops->skip_emulated_instruction(vcpu);
1922
1923        for (i = 0; i < nr_pages; ++i) {
1924                mutex_lock(&vcpu->kvm->lock);
1925                page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1926                if (page)
1927                        get_page(page);
1928                vcpu->pio.guest_pages[i] = page;
1929                mutex_unlock(&vcpu->kvm->lock);
1930                if (!page) {
1931                        inject_gp(vcpu);
1932                        free_pio_guest_pages(vcpu);
1933                        return 1;
1934                }
1935        }
1936
1937        pio_dev = vcpu_find_pio_dev(vcpu, port);
1938        if (!vcpu->pio.in) {
1939                /* string PIO write */
1940                ret = pio_copy_data(vcpu);
1941                if (ret >= 0 && pio_dev) {
1942                        pio_string_write(pio_dev, vcpu);
1943                        complete_pio(vcpu);
1944                        if (vcpu->pio.count == 0)
1945                                ret = 1;
1946                }
1947        } else if (pio_dev)
1948                pr_unimpl(vcpu, "no string pio read support yet, "
1949                       "port %x size %d count %ld\n",
1950                        port, size, count);
1951
1952        return ret;
1953}
1954EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1955
1956/*
1957 * Check if userspace requested an interrupt window, and that the
1958 * interrupt window is open.
1959 *
1960 * No need to exit to userspace if we already have an interrupt queued.
1961 */
1962static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1963                                          struct kvm_run *kvm_run)
1964{
1965        return (!vcpu->irq_summary &&
1966                kvm_run->request_interrupt_window &&
1967                vcpu->interrupt_window_open &&
1968                (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1969}
1970
1971static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1972                              struct kvm_run *kvm_run)
1973{
1974        kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
1975        kvm_run->cr8 = get_cr8(vcpu);
1976        kvm_run->apic_base = kvm_get_apic_base(vcpu);
1977        if (irqchip_in_kernel(vcpu->kvm))
1978                kvm_run->ready_for_interrupt_injection = 1;
1979        else
1980                kvm_run->ready_for_interrupt_injection =
1981                                        (vcpu->interrupt_window_open &&
1982                                         vcpu->irq_summary == 0);
1983}
1984
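/*
 * The main execution loop behind KVM_RUN: reload the shadow MMU, disable
 * interrupts and preemption, inject any pending interrupts, enter the
 * guest and handle the resulting exit.  Lightweight exits loop straight
 * back into the guest, a pending reschedule goes through kvm_resched()
 * and re-enters, and signals or exits that need userspace attention
 * return after saving run state via post_kvm_run_save().
 */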
1985static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1986{
1987        int r;
1988
1989        if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1990                printk(KERN_DEBUG "vcpu %d received sipi with vector # %x\n",
1991                       vcpu->vcpu_id, vcpu->sipi_vector);
1992                kvm_lapic_reset(vcpu);
1993                kvm_x86_ops->vcpu_reset(vcpu);
1994                vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
1995        }
1996
1997preempted:
1998        if (vcpu->guest_debug.enabled)
1999                kvm_x86_ops->guest_debug_pre(vcpu);
2000
2001again:
2002        r = kvm_mmu_reload(vcpu);
2003        if (unlikely(r))
2004                goto out;
2005
2006        preempt_disable();
2007
2008        kvm_x86_ops->prepare_guest_switch(vcpu);
2009        kvm_load_guest_fpu(vcpu);
2010
2011        local_irq_disable();
2012
2013        if (signal_pending(current)) {
2014                local_irq_enable();
2015                preempt_enable();
2016                r = -EINTR;
2017                kvm_run->exit_reason = KVM_EXIT_INTR;
2018                ++vcpu->stat.signal_exits;
2019                goto out;
2020        }
2021
2022        if (irqchip_in_kernel(vcpu->kvm))
2023                kvm_x86_ops->inject_pending_irq(vcpu);
2024        else if (!vcpu->mmio_read_completed)
2025                kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2026
2027        vcpu->guest_mode = 1;
2028        kvm_guest_enter();
2029
2030        if (vcpu->requests)
2031                if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
2032                        kvm_x86_ops->tlb_flush(vcpu);
2033
2034        kvm_x86_ops->run(vcpu, kvm_run);
2035
2036        vcpu->guest_mode = 0;
2037        local_irq_enable();
2038
2039        ++vcpu->stat.exits;
2040
2041        /*
2042         * We must have an instruction between local_irq_enable() and
2043         * kvm_guest_exit(), so the timer interrupt isn't delayed by
2044         * the interrupt shadow.  The stat.exits increment will do nicely.
2045         * But we need to prevent reordering, hence this barrier():
2046         */
2047        barrier();
2048
2049        kvm_guest_exit();
2050
2051        preempt_enable();
2052
2053        /*
2054         * Profile KVM exit RIPs:
2055         */
2056        if (unlikely(prof_on == KVM_PROFILING)) {
2057                kvm_x86_ops->cache_regs(vcpu);
2058                profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2059        }
2060
2061        r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2062
2063        if (r > 0) {
2064                if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2065                        r = -EINTR;
2066                        kvm_run->exit_reason = KVM_EXIT_INTR;
2067                        ++vcpu->stat.request_irq_exits;
2068                        goto out;
2069                }
2070                if (!need_resched()) {
2071                        ++vcpu->stat.light_exits;
2072                        goto again;
2073                }
2074        }
2075
2076out:
2077        if (r > 0) {
2078                kvm_resched(vcpu);
2079                goto preempted;
2080        }
2081
2082        post_kvm_run_save(vcpu, kvm_run);
2083
2084        return r;
2085}
2086
2087
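/*
 * KVM_RUN entry point.  The expected userspace flow (an illustrative
 * sketch, not code from this module) is to mmap() the vcpu fd to reach
 * struct kvm_run and then loop on the ioctl:
 *
 *	size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	run = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		   vcpu_fd, 0);
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO:   ...handle the port access...   break;
 *		case KVM_EXIT_MMIO: ...emulate the access...       break;
 *		case KVM_EXIT_INTR: ...interrupted by a signal...  break;
 *		}
 *	}
 *
 * Here kvm_fd stands for the /dev/kvm descriptor and vcpu_fd for the
 * descriptor returned by KVM_CREATE_VCPU.
 */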
2088static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2089{
2090        int r;
2091        sigset_t sigsaved;
2092
2093        vcpu_load(vcpu);
2094
2095        if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2096                kvm_vcpu_block(vcpu);
2097                vcpu_put(vcpu);
2098                return -EAGAIN;
2099        }
2100
2101        if (vcpu->sigset_active)
2102                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2103
2104        /* re-sync apic's tpr */
2105        if (!irqchip_in_kernel(vcpu->kvm))
2106                set_cr8(vcpu, kvm_run->cr8);
2107
2108        if (vcpu->pio.cur_count) {
2109                r = complete_pio(vcpu);
2110                if (r)
2111                        goto out;
2112        }
2113
2114        if (vcpu->mmio_needed) {
2115                memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2116                vcpu->mmio_read_completed = 1;
2117                vcpu->mmio_needed = 0;
2118                r = emulate_instruction(vcpu, kvm_run,
2119                                        vcpu->mmio_fault_cr2, 0);
2120                if (r == EMULATE_DO_MMIO) {
2121                        /*
2122                         * Read-modify-write.  Back to userspace.
2123                         */
2124                        r = 0;
2125                        goto out;
2126                }
2127        }
2128
2129        if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2130                kvm_x86_ops->cache_regs(vcpu);
2131                vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2132                kvm_x86_ops->decache_regs(vcpu);
2133        }
2134
2135        r = __vcpu_run(vcpu, kvm_run);
2136
2137out:
2138        if (vcpu->sigset_active)
2139                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2140
2141        vcpu_put(vcpu);
2142        return r;
2143}
2144
2145static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2146                                   struct kvm_regs *regs)
2147{
2148        vcpu_load(vcpu);
2149
2150        kvm_x86_ops->cache_regs(vcpu);
2151
2152        regs->rax = vcpu->regs[VCPU_REGS_RAX];
2153        regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2154        regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2155        regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2156        regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2157        regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2158        regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2159        regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2160#ifdef CONFIG_X86_64
2161        regs->r8 = vcpu->regs[VCPU_REGS_R8];
2162        regs->r9 = vcpu->regs[VCPU_REGS_R9];
2163        regs->r10 = vcpu->regs[VCPU_REGS_R10];
2164        regs->r11 = vcpu->regs[VCPU_REGS_R11];
2165        regs->r12 = vcpu->regs[VCPU_REGS_R12];
2166        regs->r13 = vcpu->regs[VCPU_REGS_R13];
2167        regs->r14 = vcpu->regs[VCPU_REGS_R14];
2168        regs->r15 = vcpu->regs[VCPU_REGS_R15];
2169#endif
2170
2171        regs->rip = vcpu->rip;
2172        regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2173
2174        /*
2175         * Don't leak debug flags in case they were set for guest debugging
2176         */
2177        if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2178                regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2179
2180        vcpu_put(vcpu);
2181
2182        return 0;
2183}
2184
2185static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2186                                   struct kvm_regs *regs)
2187{
2188        vcpu_load(vcpu);
2189
2190        vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2191        vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2192        vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2193        vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2194        vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2195        vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2196        vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2197        vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2198#ifdef CONFIG_X86_64
2199        vcpu->regs[VCPU_REGS_R8] = regs->r8;
2200        vcpu->regs[VCPU_REGS_R9] = regs->r9;
2201        vcpu->regs[VCPU_REGS_R10] = regs->r10;
2202        vcpu->regs[VCPU_REGS_R11] = regs->r11;
2203        vcpu->regs[VCPU_REGS_R12] = regs->r12;
2204        vcpu->regs[VCPU_REGS_R13] = regs->r13;
2205        vcpu->regs[VCPU_REGS_R14] = regs->r14;
2206        vcpu->regs[VCPU_REGS_R15] = regs->r15;
2207#endif
2208
2209        vcpu->rip = regs->rip;
2210        kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2211
2212        kvm_x86_ops->decache_regs(vcpu);
2213
2214        vcpu_put(vcpu);
2215
2216        return 0;
2217}
2218
2219static void get_segment(struct kvm_vcpu *vcpu,
2220                        struct kvm_segment *var, int seg)
2221{
2222        return kvm_x86_ops->get_segment(vcpu, var, seg);
2223}
2224
2225static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2226                                    struct kvm_sregs *sregs)
2227{
2228        struct descriptor_table dt;
2229        int pending_vec;
2230
2231        vcpu_load(vcpu);
2232
2233        get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2234        get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2235        get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2236        get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2237        get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2238        get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2239
2240        get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2241        get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2242
2243        kvm_x86_ops->get_idt(vcpu, &dt);
2244        sregs->idt.limit = dt.limit;
2245        sregs->idt.base = dt.base;
2246        kvm_x86_ops->get_gdt(vcpu, &dt);
2247        sregs->gdt.limit = dt.limit;
2248        sregs->gdt.base = dt.base;
2249
2250        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2251        sregs->cr0 = vcpu->cr0;
2252        sregs->cr2 = vcpu->cr2;
2253        sregs->cr3 = vcpu->cr3;
2254        sregs->cr4 = vcpu->cr4;
2255        sregs->cr8 = get_cr8(vcpu);
2256        sregs->efer = vcpu->shadow_efer;
2257        sregs->apic_base = kvm_get_apic_base(vcpu);
2258
2259        if (irqchip_in_kernel(vcpu->kvm)) {
2260                memset(sregs->interrupt_bitmap, 0,
2261                       sizeof sregs->interrupt_bitmap);
2262                pending_vec = kvm_x86_ops->get_irq(vcpu);
2263                if (pending_vec >= 0)
2264                        set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
2265        } else
2266                memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2267                       sizeof sregs->interrupt_bitmap);
2268
2269        vcpu_put(vcpu);
2270
2271        return 0;
2272}
2273
2274static void set_segment(struct kvm_vcpu *vcpu,
2275                        struct kvm_segment *var, int seg)
2276{
2277        return kvm_x86_ops->set_segment(vcpu, var, seg);
2278}
2279
2280static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2281                                    struct kvm_sregs *sregs)
2282{
2283        int mmu_reset_needed = 0;
2284        int i, pending_vec, max_bits;
2285        struct descriptor_table dt;
2286
2287        vcpu_load(vcpu);
2288
2289        dt.limit = sregs->idt.limit;
2290        dt.base = sregs->idt.base;
2291        kvm_x86_ops->set_idt(vcpu, &dt);
2292        dt.limit = sregs->gdt.limit;
2293        dt.base = sregs->gdt.base;
2294        kvm_x86_ops->set_gdt(vcpu, &dt);
2295
2296        vcpu->cr2 = sregs->cr2;
2297        mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2298        vcpu->cr3 = sregs->cr3;
2299
2300        set_cr8(vcpu, sregs->cr8);
2301
2302        mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2303#ifdef CONFIG_X86_64
2304        kvm_x86_ops->set_efer(vcpu, sregs->efer);
2305#endif
2306        kvm_set_apic_base(vcpu, sregs->apic_base);
2307
2308        kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2309
2310        mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2311        vcpu->cr0 = sregs->cr0;
2312        kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2313
2314        mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2315        kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2316        if (!is_long_mode(vcpu) && is_pae(vcpu))
2317                load_pdptrs(vcpu, vcpu->cr3);
2318
2319        if (mmu_reset_needed)
2320                kvm_mmu_reset_context(vcpu);
2321
2322        if (!irqchip_in_kernel(vcpu->kvm)) {
2323                memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2324                       sizeof vcpu->irq_pending);
2325                vcpu->irq_summary = 0;
2326                for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2327                        if (vcpu->irq_pending[i])
2328                                __set_bit(i, &vcpu->irq_summary);
2329        } else {
2330                max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2331                pending_vec = find_first_bit(
2332                        (const unsigned long *)sregs->interrupt_bitmap,
2333                        max_bits);
2334                /* Only pending external irq is handled here */
2335                if (pending_vec < max_bits) {
2336                        kvm_x86_ops->set_irq(vcpu, pending_vec);
2337                        printk(KERN_DEBUG "Set back pending irq %d\n", pending_vec);
2338                }
2339        }
2340
2341        set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2342        set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2343        set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2344        set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2345        set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2346        set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2347
2348        set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2349        set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2350
2351        vcpu_put(vcpu);
2352
2353        return 0;
2354}
2355
2356void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2357{
2358        struct kvm_segment cs;
2359
2360        get_segment(vcpu, &cs, VCPU_SREG_CS);
2361        *db = cs.db;
2362        *l = cs.l;
2363}
2364EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2365
2366/*
2367 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
2368 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2369 *
2370 * This list is modified at module load time to reflect the
2371 * capabilities of the host cpu.
2372 */
2373static u32 msrs_to_save[] = {
2374        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2375        MSR_K6_STAR,
2376#ifdef CONFIG_X86_64
2377        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2378#endif
2379        MSR_IA32_TIME_STAMP_COUNTER,
2380};
2381
2382static unsigned num_msrs_to_save;
2383
2384static u32 emulated_msrs[] = {
2385        MSR_IA32_MISC_ENABLE,
2386};
2387
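/*
 * Trim msrs_to_save down to the MSRs the host actually implements: probe
 * each one with rdmsr_safe() and compact the surviving entries in place,
 * leaving num_msrs_to_save as the count reported to userspace.
 */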
2388static __init void kvm_init_msr_list(void)
2389{
2390        u32 dummy[2];
2391        unsigned i, j;
2392
2393        for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2394                if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2395                        continue;
2396                if (j < i)
2397                        msrs_to_save[j] = msrs_to_save[i];
2398                j++;
2399        }
2400        num_msrs_to_save = j;
2401}
2402
2403/*
2404 * Adapt set_msr() to msr_io()'s calling convention
2405 */
2406static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2407{
2408        return kvm_set_msr(vcpu, index, *data);
2409}
2410
2411/*
2412 * Read or write a bunch of msrs. All parameters are kernel addresses.
2413 *
2414 * @return number of msrs set successfully.
2415 */
2416static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2417                    struct kvm_msr_entry *entries,
2418                    int (*do_msr)(struct kvm_vcpu *vcpu,
2419                                  unsigned index, u64 *data))
2420{
2421        int i;
2422
2423        vcpu_load(vcpu);
2424
2425        for (i = 0; i < msrs->nmsrs; ++i)
2426                if (do_msr(vcpu, entries[i].index, &entries[i].data))
2427                        break;
2428
2429        vcpu_put(vcpu);
2430
2431        return i;
2432}
2433
2434/*
2435 * Read or write a bunch of msrs. Parameters are user addresses.
2436 *
2437 * @return number of msrs set successfully.
2438 */
2439static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2440                  int (*do_msr)(struct kvm_vcpu *vcpu,
2441                                unsigned index, u64 *data),
2442                  int writeback)
2443{
2444        struct kvm_msrs msrs;
2445        struct kvm_msr_entry *entries;
2446        int r, n;
2447        unsigned size;
2448
2449        r = -EFAULT;
2450        if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2451                goto out;
2452
2453        r = -E2BIG;
2454        if (msrs.nmsrs >= MAX_IO_MSRS)
2455                goto out;
2456
2457        r = -ENOMEM;
2458        size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2459        entries = vmalloc(size);
2460        if (!entries)
2461                goto out;
2462
2463        r = -EFAULT;
2464        if (copy_from_user(entries, user_msrs->entries, size))
2465                goto out_free;
2466
2467        r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2468        if (r < 0)
2469                goto out_free;
2470
2471        r = -EFAULT;
2472        if (writeback && copy_to_user(user_msrs->entries, entries, size))
2473                goto out_free;
2474
2475        r = n;
2476
2477out_free:
2478        vfree(entries);
2479out:
2480        return r;
2481}
2482
2483/*
2484 * Translate a guest virtual address to a guest physical address.
2485 */
2486static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2487                                    struct kvm_translation *tr)
2488{
2489        unsigned long vaddr = tr->linear_address;
2490        gpa_t gpa;
2491
2492        vcpu_load(vcpu);
2493        mutex_lock(&vcpu->kvm->lock);
2494        gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2495        tr->physical_address = gpa;
2496        tr->valid = gpa != UNMAPPED_GVA;
2497        tr->writeable = 1;
2498        tr->usermode = 0;
2499        mutex_unlock(&vcpu->kvm->lock);
2500        vcpu_put(vcpu);
2501
2502        return 0;
2503}
2504
2505static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2506                                    struct kvm_interrupt *irq)
2507{
2508        if (irq->irq < 0 || irq->irq >= 256)
2509                return -EINVAL;
2510        if (irqchip_in_kernel(vcpu->kvm))
2511                return -ENXIO;
2512        vcpu_load(vcpu);
2513
2514        set_bit(irq->irq, vcpu->irq_pending);
2515        set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2516
2517        vcpu_put(vcpu);
2518
2519        return 0;
2520}
2521
2522static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2523                                      struct kvm_debug_guest *dbg)
2524{
2525        int r;
2526
2527        vcpu_load(vcpu);
2528
2529        r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2530
2531        vcpu_put(vcpu);
2532
2533        return r;
2534}
2535
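/*
 * Back the vcpu fd's mapping: page offset 0 is the kvm_run structure and
 * KVM_PIO_PAGE_OFFSET is the PIO data page, which is why
 * KVM_GET_VCPU_MMAP_SIZE reports two pages.
 */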
2536static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2537                                    unsigned long address,
2538                                    int *type)
2539{
2540        struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2541        unsigned long pgoff;
2542        struct page *page;
2543
2544        pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2545        if (pgoff == 0)
2546                page = virt_to_page(vcpu->run);
2547        else if (pgoff == KVM_PIO_PAGE_OFFSET)
2548                page = virt_to_page(vcpu->pio_data);
2549        else
2550                return NOPAGE_SIGBUS;
2551        get_page(page);
2552        if (type != NULL)
2553                *type = VM_FAULT_MINOR;
2554
2555        return page;
2556}
2557
2558static struct vm_operations_struct kvm_vcpu_vm_ops = {
2559        .nopage = kvm_vcpu_nopage,
2560};
2561
2562static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2563{
2564        vma->vm_ops = &kvm_vcpu_vm_ops;
2565        return 0;
2566}
2567
2568static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2569{
2570        struct kvm_vcpu *vcpu = filp->private_data;
2571
2572        fput(vcpu->kvm->filp);
2573        return 0;
2574}
2575
2576static struct file_operations kvm_vcpu_fops = {
2577        .release        = kvm_vcpu_release,
2578        .unlocked_ioctl = kvm_vcpu_ioctl,
2579        .compat_ioctl   = kvm_vcpu_ioctl,
2580        .mmap           = kvm_vcpu_mmap,
2581};
2582
2583/*
2584 * Allocates an inode for the vcpu.
2585 */
2586static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2587{
2588        int fd, r;
2589        struct inode *inode;
2590        struct file *file;
2591
2592        r = anon_inode_getfd(&fd, &inode, &file,
2593                             "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2594        if (r)
2595                return r;
2596        atomic_inc(&vcpu->kvm->filp->f_count);
2597        return fd;
2598}
2599
2600/*
2601 * Creates some virtual cpus.  Good luck creating more than one.
2602 */
2603static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2604{
2605        int r;
2606        struct kvm_vcpu *vcpu;
2607
2608        if (!valid_vcpu(n))
2609                return -EINVAL;
2610
2611        vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2612        if (IS_ERR(vcpu))
2613                return PTR_ERR(vcpu);
2614
2615        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2616
2617        /* We do fxsave: this must be aligned. */
2618        BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2619
2620        vcpu_load(vcpu);
2621        r = kvm_mmu_setup(vcpu);
2622        vcpu_put(vcpu);
2623        if (r < 0)
2624                goto free_vcpu;
2625
2626        mutex_lock(&kvm->lock);
2627        if (kvm->vcpus[n]) {
2628                r = -EEXIST;
2629                mutex_unlock(&kvm->lock);
2630                goto mmu_unload;
2631        }
2632        kvm->vcpus[n] = vcpu;
2633        mutex_unlock(&kvm->lock);
2634
2635        /* Now it's all set up; let userspace reach it */
2636        r = create_vcpu_fd(vcpu);
2637        if (r < 0)
2638                goto unlink;
2639        return r;
2640
2641unlink:
2642        mutex_lock(&kvm->lock);
2643        kvm->vcpus[n] = NULL;
2644        mutex_unlock(&kvm->lock);
2645
2646mmu_unload:
2647        vcpu_load(vcpu);
2648        kvm_mmu_unload(vcpu);
2649        vcpu_put(vcpu);
2650
2651free_vcpu:
2652        kvm_x86_ops->vcpu_free(vcpu);
2653        return r;
2654}
2655
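/*
 * If the host is running with EFER.NX clear, hide the NX capability
 * (bit 20 of CPUID leaf 0x80000001 EDX) from the guest's cpuid table so
 * the guest does not try to use no-execute page protection.
 */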
2656static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2657{
2658        u64 efer;
2659        int i;
2660        struct kvm_cpuid_entry *e, *entry;
2661
2662        rdmsrl(MSR_EFER, efer);
2663        entry = NULL;
2664        for (i = 0; i < vcpu->cpuid_nent; ++i) {
2665                e = &vcpu->cpuid_entries[i];
2666                if (e->function == 0x80000001) {
2667                        entry = e;
2668                        break;
2669                }
2670        }
2671        if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2672                entry->edx &= ~(1 << 20);
2673                printk(KERN_INFO "kvm: guest NX capability removed\n");
2674        }
2675}
2676
2677static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2678                                    struct kvm_cpuid *cpuid,
2679                                    struct kvm_cpuid_entry __user *entries)
2680{
2681        int r;
2682
2683        r = -E2BIG;
2684        if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2685                goto out;
2686        r = -EFAULT;
2687        if (copy_from_user(&vcpu->cpuid_entries, entries,
2688                           cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2689                goto out;
2690        vcpu->cpuid_nent = cpuid->nent;
2691        cpuid_fix_nx_cap(vcpu);
2692        return 0;
2693
2694out:
2695        return r;
2696}
2697
2698static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2699{
2700        if (sigset) {
2701                sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2702                vcpu->sigset_active = 1;
2703                vcpu->sigset = *sigset;
2704        } else
2705                vcpu->sigset_active = 0;
2706        return 0;
2707}
2708
2709/*
2710 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2711 * we have asm/x86/processor.h
2712 */
2713struct fxsave {
2714        u16     cwd;
2715        u16     swd;
2716        u16     twd;
2717        u16     fop;
2718        u64     rip;
2719        u64     rdp;
2720        u32     mxcsr;
2721        u32     mxcsr_mask;
2722        u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2723#ifdef CONFIG_X86_64
2724        u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2725#else
2726        u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2727#endif
2728};
2729
2730static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2731{
2732        struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2733
2734        vcpu_load(vcpu);
2735
2736        memcpy(fpu->fpr, fxsave->st_space, 128);
2737        fpu->fcw = fxsave->cwd;
2738        fpu->fsw = fxsave->swd;
2739        fpu->ftwx = fxsave->twd;
2740        fpu->last_opcode = fxsave->fop;
2741        fpu->last_ip = fxsave->rip;
2742        fpu->last_dp = fxsave->rdp;
2743        memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2744
2745        vcpu_put(vcpu);
2746
2747        return 0;
2748}
2749
2750static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2751{
2752        struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2753
2754        vcpu_load(vcpu);
2755
2756        memcpy(fxsave->st_space, fpu->fpr, 128);
2757        fxsave->cwd = fpu->fcw;
2758        fxsave->swd = fpu->fsw;
2759        fxsave->twd = fpu->ftwx;
2760        fxsave->fop = fpu->last_opcode;
2761        fxsave->rip = fpu->last_ip;
2762        fxsave->rdp = fpu->last_dp;
2763        memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2764
2765        vcpu_put(vcpu);
2766
2767        return 0;
2768}
2769
2770static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2771                                    struct kvm_lapic_state *s)
2772{
2773        vcpu_load(vcpu);
2774        memcpy(s->regs, vcpu->apic->regs, sizeof *s);
2775        vcpu_put(vcpu);
2776
2777        return 0;
2778}
2779
2780static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2781                                    struct kvm_lapic_state *s)
2782{
2783        vcpu_load(vcpu);
2784        memcpy(vcpu->apic->regs, s->regs, sizeof *s);
2785        kvm_apic_post_state_restore(vcpu);
2786        vcpu_put(vcpu);
2787
2788        return 0;
2789}
2790
2791static long kvm_vcpu_ioctl(struct file *filp,
2792                           unsigned int ioctl, unsigned long arg)
2793{
2794        struct kvm_vcpu *vcpu = filp->private_data;
2795        void __user *argp = (void __user *)arg;
2796        int r = -EINVAL;
2797
2798        switch (ioctl) {
2799        case KVM_RUN:
2800                r = -EINVAL;
2801                if (arg)
2802                        goto out;
2803                r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2804                break;
2805        case KVM_GET_REGS: {
2806                struct kvm_regs kvm_regs;
2807
2808                memset(&kvm_regs, 0, sizeof kvm_regs);
2809                r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2810                if (r)
2811                        goto out;
2812                r = -EFAULT;
2813                if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2814                        goto out;
2815                r = 0;
2816                break;
2817        }
2818        case KVM_SET_REGS: {
2819                struct kvm_regs kvm_regs;
2820
2821                r = -EFAULT;
2822                if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2823                        goto out;
2824                r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2825                if (r)
2826                        goto out;
2827                r = 0;
2828                break;
2829        }
2830        case KVM_GET_SREGS: {
2831                struct kvm_sregs kvm_sregs;
2832
2833                memset(&kvm_sregs, 0, sizeof kvm_sregs);
2834                r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2835                if (r)
2836                        goto out;
2837                r = -EFAULT;
2838                if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2839                        goto out;
2840                r = 0;
2841                break;
2842        }
2843        case KVM_SET_SREGS: {
2844                struct kvm_sregs kvm_sregs;
2845
2846                r = -EFAULT;
2847                if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2848                        goto out;
2849                r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2850                if (r)
2851                        goto out;
2852                r = 0;
2853                break;
2854        }
2855        case KVM_TRANSLATE: {
2856                struct kvm_translation tr;
2857
2858                r = -EFAULT;
2859                if (copy_from_user(&tr, argp, sizeof tr))
2860                        goto out;
2861                r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2862                if (r)
2863                        goto out;
2864                r = -EFAULT;
2865                if (copy_to_user(argp, &tr, sizeof tr))
2866                        goto out;
2867                r = 0;
2868                break;
2869        }
2870        case KVM_INTERRUPT: {
2871                struct kvm_interrupt irq;
2872
2873                r = -EFAULT;
2874                if (copy_from_user(&irq, argp, sizeof irq))
2875                        goto out;
2876                r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2877                if (r)
2878                        goto out;
2879                r = 0;
2880                break;
2881        }
2882        case KVM_DEBUG_GUEST: {
2883                struct kvm_debug_guest dbg;
2884
2885                r = -EFAULT;
2886                if (copy_from_user(&dbg, argp, sizeof dbg))
2887                        goto out;
2888                r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2889                if (r)
2890                        goto out;
2891                r = 0;
2892                break;
2893        }
2894        case KVM_GET_MSRS:
2895                r = msr_io(vcpu, argp, kvm_get_msr, 1);
2896                break;
2897        case KVM_SET_MSRS:
2898                r = msr_io(vcpu, argp, do_set_msr, 0);
2899                break;
2900        case KVM_SET_CPUID: {
2901                struct kvm_cpuid __user *cpuid_arg = argp;
2902                struct kvm_cpuid cpuid;
2903
2904                r = -EFAULT;
2905                if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2906                        goto out;
2907                r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2908                if (r)
2909                        goto out;
2910                break;
2911        }
2912        case KVM_SET_SIGNAL_MASK: {
2913                struct kvm_signal_mask __user *sigmask_arg = argp;
2914                struct kvm_signal_mask kvm_sigmask;
2915                sigset_t sigset, *p;
2916
2917                p = NULL;
2918                if (argp) {
2919                        r = -EFAULT;
2920                        if (copy_from_user(&kvm_sigmask, argp,
2921                                           sizeof kvm_sigmask))
2922                                goto out;
2923                        r = -EINVAL;
2924                        if (kvm_sigmask.len != sizeof sigset)
2925                                goto out;
2926                        r = -EFAULT;
2927                        if (copy_from_user(&sigset, sigmask_arg->sigset,
2928                                           sizeof sigset))
2929                                goto out;
2930                        p = &sigset;
2931                }
2932                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2933                break;
2934        }
2935        case KVM_GET_FPU: {
2936                struct kvm_fpu fpu;
2937
2938                memset(&fpu, 0, sizeof fpu);
2939                r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2940                if (r)
2941                        goto out;
2942                r = -EFAULT;
2943                if (copy_to_user(argp, &fpu, sizeof fpu))
2944                        goto out;
2945                r = 0;
2946                break;
2947        }
2948        case KVM_SET_FPU: {
2949                struct kvm_fpu fpu;
2950
2951                r = -EFAULT;
2952                if (copy_from_user(&fpu, argp, sizeof fpu))
2953                        goto out;
2954                r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2955                if (r)
2956                        goto out;
2957                r = 0;
2958                break;
2959        }
2960        case KVM_GET_LAPIC: {
2961                struct kvm_lapic_state lapic;
2962
2963                memset(&lapic, 0, sizeof lapic);
2964                r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
2965                if (r)
2966                        goto out;
2967                r = -EFAULT;
2968                if (copy_to_user(argp, &lapic, sizeof lapic))
2969                        goto out;
2970                r = 0;
2971                break;
2972        }
2973        case KVM_SET_LAPIC: {
2974                struct kvm_lapic_state lapic;
2975
2976                r = -EFAULT;
2977                if (copy_from_user(&lapic, argp, sizeof lapic))
2978                        goto out;
2979                r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
2980                if (r)
2981                        goto out;
2982                r = 0;
2983                break;
2984        }
2985        default:
2986                ;
2987        }
2988out:
2989        return r;
2990}
2991
2992static long kvm_vm_ioctl(struct file *filp,
2993                           unsigned int ioctl, unsigned long arg)
2994{
2995        struct kvm *kvm = filp->private_data;
2996        void __user *argp = (void __user *)arg;
2997        int r = -EINVAL;
2998
2999        switch (ioctl) {
3000        case KVM_CREATE_VCPU:
3001                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3002                if (r < 0)
3003                        goto out;
3004                break;
3005        case KVM_SET_MEMORY_REGION: {
3006                struct kvm_memory_region kvm_mem;
3007
3008                r = -EFAULT;
3009                if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
3010                        goto out;
3011                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
3012                if (r)
3013                        goto out;
3014                break;
3015        }
3016        case KVM_GET_DIRTY_LOG: {
3017                struct kvm_dirty_log log;
3018
3019                r = -EFAULT;
3020                if (copy_from_user(&log, argp, sizeof log))
3021                        goto out;
3022                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3023                if (r)
3024                        goto out;
3025                break;
3026        }
3027        case KVM_SET_MEMORY_ALIAS: {
3028                struct kvm_memory_alias alias;
3029
3030                r = -EFAULT;
3031                if (copy_from_user(&alias, argp, sizeof alias))
3032                        goto out;
3033                r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
3034                if (r)
3035                        goto out;
3036                break;
3037        }
3038        case KVM_CREATE_IRQCHIP:
3039                r = -ENOMEM;
3040                kvm->vpic = kvm_create_pic(kvm);
3041                if (kvm->vpic) {
3042                        r = kvm_ioapic_init(kvm);
3043                        if (r) {
3044                                kfree(kvm->vpic);
3045                                kvm->vpic = NULL;
3046                                goto out;
3047                        }
3048                }
3049                else
3050                        goto out;
3051                break;
3052        case KVM_IRQ_LINE: {
3053                struct kvm_irq_level irq_event;
3054
3055                r = -EFAULT;
3056                if (copy_from_user(&irq_event, argp, sizeof irq_event))
3057                        goto out;
3058                if (irqchip_in_kernel(kvm)) {
3059                        mutex_lock(&kvm->lock);
3060                        if (irq_event.irq < 16)
3061                                kvm_pic_set_irq(pic_irqchip(kvm),
3062                                        irq_event.irq,
3063                                        irq_event.level);
3064                        kvm_ioapic_set_irq(kvm->vioapic,
3065                                        irq_event.irq,
3066                                        irq_event.level);
3067                        mutex_unlock(&kvm->lock);
3068                        r = 0;
3069                }
3070                break;
3071        }
3072        case KVM_GET_IRQCHIP: {
3073                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3074                struct kvm_irqchip chip;
3075
3076                r = -EFAULT;
3077                if (copy_from_user(&chip, argp, sizeof chip))
3078                        goto out;
3079                r = -ENXIO;
3080                if (!irqchip_in_kernel(kvm))
3081                        goto out;
3082                r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3083                if (r)
3084                        goto out;
3085                r = -EFAULT;
3086                if (copy_to_user(argp, &chip, sizeof chip))
3087                        goto out;
3088                r = 0;
3089                break;
3090        }
3091        case KVM_SET_IRQCHIP: {
3092                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3093                struct kvm_irqchip chip;
3094
3095                r = -EFAULT;
3096                if (copy_from_user(&chip, argp, sizeof chip))
3097                        goto out;
3098                r = -ENXIO;
3099                if (!irqchip_in_kernel(kvm))
3100                        goto out;
3101                r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3102                if (r)
3103                        goto out;
3104                r = 0;
3105                break;
3106        }
3107        default:
3108                ;
3109        }
3110out:
3111        return r;
3112}
3113
3114static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
3115                                  unsigned long address,
3116                                  int *type)
3117{
3118        struct kvm *kvm = vma->vm_file->private_data;
3119        unsigned long pgoff;
3120        struct page *page;
3121
3122        pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3123        page = gfn_to_page(kvm, pgoff);
3124        if (!page)
3125                return NOPAGE_SIGBUS;
3126        get_page(page);
3127        if (type != NULL)
3128                *type = VM_FAULT_MINOR;
3129
3130        return page;
3131}
3132
3133static struct vm_operations_struct kvm_vm_vm_ops = {
3134        .nopage = kvm_vm_nopage,
3135};
3136
3137static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3138{
3139        vma->vm_ops = &kvm_vm_vm_ops;
3140        return 0;
3141}
3142
3143static struct file_operations kvm_vm_fops = {
3144        .release        = kvm_vm_release,
3145        .unlocked_ioctl = kvm_vm_ioctl,
3146        .compat_ioctl   = kvm_vm_ioctl,
3147        .mmap           = kvm_vm_mmap,
3148};
3149
3150static int kvm_dev_ioctl_create_vm(void)
3151{
3152        int fd, r;
3153        struct inode *inode;
3154        struct file *file;
3155        struct kvm *kvm;
3156
3157        kvm = kvm_create_vm();
3158        if (IS_ERR(kvm))
3159                return PTR_ERR(kvm);
3160        r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3161        if (r) {
3162                kvm_destroy_vm(kvm);
3163                return r;
3164        }
3165
3166        kvm->filp = file;
3167
3168        return fd;
3169}
3170
3171static long kvm_dev_ioctl(struct file *filp,
3172                          unsigned int ioctl, unsigned long arg)
3173{
3174        void __user *argp = (void __user *)arg;
3175        long r = -EINVAL;
3176
3177        switch (ioctl) {
3178        case KVM_GET_API_VERSION:
3179                r = -EINVAL;
3180                if (arg)
3181                        goto out;
3182                r = KVM_API_VERSION;
3183                break;
3184        case KVM_CREATE_VM:
3185                r = -EINVAL;
3186                if (arg)
3187                        goto out;
3188                r = kvm_dev_ioctl_create_vm();
3189                break;
3190        case KVM_GET_MSR_INDEX_LIST: {
3191                struct kvm_msr_list __user *user_msr_list = argp;
3192                struct kvm_msr_list msr_list;
3193                unsigned n;
3194
3195                r = -EFAULT;
3196                if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3197                        goto out;
3198                n = msr_list.nmsrs;
3199                msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
3200                if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3201                        goto out;
3202                r = -E2BIG;
3203                if (n < msr_list.nmsrs)
3204                        goto out;
3205                r = -EFAULT;
3206                if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3207                                 num_msrs_to_save * sizeof(u32)))
3208                        goto out;
3209                if (copy_to_user(user_msr_list->indices
3210                                 + num_msrs_to_save,
3211                                 &emulated_msrs,
3212                                 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
3213                        goto out;
3214                r = 0;
3215                break;
3216        }
3217        case KVM_CHECK_EXTENSION: {
3218                int ext = (long)argp;
3219
3220                switch (ext) {
3221                case KVM_CAP_IRQCHIP:
3222                case KVM_CAP_HLT:
3223                        r = 1;
3224                        break;
3225                default:
3226                        r = 0;
3227                        break;
3228                }
3229                break;
3230        }
3231        case KVM_GET_VCPU_MMAP_SIZE:
3232                r = -EINVAL;
3233                if (arg)
3234                        goto out;
3235                r = 2 * PAGE_SIZE;
3236                break;
3237        default:
3238                ;
3239        }
3240out:
3241        return r;
3242}
3243
3244static struct file_operations kvm_chardev_ops = {
3245        .unlocked_ioctl = kvm_dev_ioctl,
3246        .compat_ioctl   = kvm_dev_ioctl,
3247};
3248
3249static struct miscdevice kvm_dev = {
3250        .minor = KVM_MINOR,
3251        .name  = "kvm",
3252        .fops  = &kvm_chardev_ops,
3253};
3254
3255/*
3256 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3257 * cached on it.
3258 */
3259static void decache_vcpus_on_cpu(int cpu)
3260{
3261        struct kvm *vm;
3262        struct kvm_vcpu *vcpu;
3263        int i;
3264
3265        spin_lock(&kvm_lock);
3266        list_for_each_entry(vm, &vm_list, vm_list)
3267                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3268                        vcpu = vm->vcpus[i];
3269                        if (!vcpu)
3270                                continue;
3271                        /*
3272                         * If the vcpu is locked, then it is running on some
3273                         * other cpu and therefore it is not cached on the
3274                         * cpu in question.
3275                         *
3276                         * If it's not locked, check the last cpu it executed
3277                         * on.
3278                         */
3279                        if (mutex_trylock(&vcpu->mutex)) {
3280                                if (vcpu->cpu == cpu) {
3281                                        kvm_x86_ops->vcpu_decache(vcpu);
3282                                        vcpu->cpu = -1;
3283                                }
3284                                mutex_unlock(&vcpu->mutex);
3285                        }
3286                }
3287        spin_unlock(&kvm_lock);
3288}
3289
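/*
 * Per-cpu enable/disable of hardware virtualization, tracked in
 * cpus_hardware_enabled so that the cpu-hotplug and reboot notifiers
 * below call the vendor hardware_enable/hardware_disable hooks at most
 * once per cpu.
 */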
3290static void hardware_enable(void *junk)
3291{
3292        int cpu = raw_smp_processor_id();
3293
3294        if (cpu_isset(cpu, cpus_hardware_enabled))
3295                return;
3296        cpu_set(cpu, cpus_hardware_enabled);
3297        kvm_x86_ops->hardware_enable(NULL);
3298}
3299
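    /*
     * Counterpart of hardware_enable(): unmark the cpu, drop any vcpu state
     * cached on it and let the arch code turn the virtualization extensions
     * off again.
     */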
3300static void hardware_disable(void *junk)
3301{
3302        int cpu = raw_smp_processor_id();
3303
3304        if (!cpu_isset(cpu, cpus_hardware_enabled))
3305                return;
3306        cpu_clear(cpu, cpus_hardware_enabled);
3307        decache_vcpus_on_cpu(cpu);
3308        kvm_x86_ops->hardware_disable(NULL);
3309}
3310
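    /*
     * CPU hotplug notifier.  CPU_DYING is delivered on the dying cpu
     * itself, so hardware_disable() can be called directly; the other
     * transitions run elsewhere and therefore reach the target cpu via
     * smp_call_function_single().
     */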
3311static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3312                           void *v)
3313{
3314        int cpu = (long)v;
3315
3316        switch (val) {
3317        case CPU_DYING:
3318        case CPU_DYING_FROZEN:
3319                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3320                       cpu);
3321                hardware_disable(NULL);
3322                break;
3323        case CPU_UP_CANCELED:
3324        case CPU_UP_CANCELED_FROZEN:
3325                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3326                       cpu);
3327                smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3328                break;
3329        case CPU_ONLINE:
3330        case CPU_ONLINE_FROZEN:
3331                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3332                       cpu);
3333                smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3334                break;
3335        }
3336        return NOTIFY_OK;
3337}
3338
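    /*
     * Reboot notifier: leave hardware virtualization mode on every cpu
     * before the machine restarts.
     */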
3339static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3340                       void *v)
3341{
3342        if (val == SYS_RESTART) {
3343                /*
3344                 * Some (well, at least mine) BIOSes hang on reboot if
3345                 * in vmx root mode.
3346                 */
3347                printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3348                on_each_cpu(hardware_disable, NULL, 0, 1);
3349        }
3350        return NOTIFY_OK;
3351}
3352
3353static struct notifier_block kvm_reboot_notifier = {
3354        .notifier_call = kvm_reboot,
3355        .priority = 0,
3356};
3357
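    /*
     * kvm_io_bus: a small, fixed-size table of in-kernel I/O devices.  The
     * helpers below initialize and destroy a bus, register a device on it
     * and look a device up by guest physical address.
     */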
3358void kvm_io_bus_init(struct kvm_io_bus *bus)
3359{
3360        memset(bus, 0, sizeof(*bus));
3361}
3362
3363void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3364{
3365        int i;
3366
3367        for (i = 0; i < bus->dev_count; i++) {
3368                struct kvm_io_device *pos = bus->devs[i];
3369
3370                kvm_iodevice_destructor(pos);
3371        }
3372}
3373
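    /* Return the first registered device that claims @addr, or NULL. */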
3374struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3375{
3376        int i;
3377
3378        for (i = 0; i < bus->dev_count; i++) {
3379                struct kvm_io_device *pos = bus->devs[i];
3380
3381                if (pos->in_range(pos, addr))
3382                        return pos;
3383        }
3384
3385        return NULL;
3386}
3387
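    /*
     * Append a device to the bus.  The table is fixed-size (NR_IOBUS_DEVS
     * slots), so overflowing it is a bug in the caller.
     */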
3388void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3389{
3390        BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3391
3392        bus->devs[bus->dev_count++] = dev;
3393}
3394
3395static struct notifier_block kvm_cpu_notifier = {
3396        .notifier_call = kvm_cpu_hotplug,
3397        .priority = 20, /* must be > scheduler priority */
3398};
3399
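    /*
     * Back end for the debugfs files: sum one per-vcpu u32 counter
     * (identified by its byte offset into struct kvm_vcpu) over every vcpu
     * of every VM on the vm_list.
     */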
3400static u64 stat_get(void *_offset)
3401{
3402        unsigned offset = (long)_offset;
3403        u64 total = 0;
3404        struct kvm *kvm;
3405        struct kvm_vcpu *vcpu;
3406        int i;
3407
3408        spin_lock(&kvm_lock);
3409        list_for_each_entry(kvm, &vm_list, vm_list)
3410                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3411                        vcpu = kvm->vcpus[i];
3412                        if (vcpu)
3413                                total += *(u32 *)((void *)vcpu + offset);
3414                }
3415        spin_unlock(&kvm_lock);
3416        return total;
3417}
3418
3419DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3420
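    /*
     * Create a "kvm" directory in debugfs with one read-only file per
     * counter in debugfs_entries.
     */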
3421static __init void kvm_init_debug(void)
3422{
3423        struct kvm_stats_debugfs_item *p;
3424
3425        debugfs_dir = debugfs_create_dir("kvm", NULL);
3426        for (p = debugfs_entries; p->name; ++p)
3427                p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3428                                                (void *)(long)p->offset,
3429                                                &stat_fops);
3430}
3431
3432static void kvm_exit_debug(void)
3433{
3434        struct kvm_stats_debugfs_item *p;
3435
3436        for (p = debugfs_entries; p->name; ++p)
3437                debugfs_remove(p->dentry);
3438        debugfs_remove(debugfs_dir);
3439}
3440
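    /*
     * sysdev suspend/resume hooks.  Virtualization is switched off on the
     * suspending cpu and switched back on at resume; the other cpus are
     * re-enabled through the hotplug notifier when they come back online.
     */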
3441static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3442{
3443        hardware_disable(NULL);
3444        return 0;
3445}
3446
3447static int kvm_resume(struct sys_device *dev)
3448{
3449        hardware_enable(NULL);
3450        return 0;
3451}
3452
3453static struct sysdev_class kvm_sysdev_class = {
3454        set_kset_name("kvm"),
3455        .suspend = kvm_suspend,
3456        .resume = kvm_resume,
3457};
3458
3459static struct sys_device kvm_sysdev = {
3460        .id = 0,
3461        .cls = &kvm_sysdev_class,
3462};
3463
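    /*
     * Host physical address of a single zeroed page, set up in kvm_init()
     * below and released again in kvm_exit().
     */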
3464hpa_t bad_page_address;
3465
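    /*
     * Preempt notifiers make a vcpu's hardware state follow its task: the
     * arch vcpu_load()/vcpu_put() hooks run whenever the vcpu thread is
     * scheduled in or out.
     */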
3466static inline
3467struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3468{
3469        return container_of(pn, struct kvm_vcpu, preempt_notifier);
3470}
3471
3472static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3473{
3474        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3475
3476        kvm_x86_ops->vcpu_load(vcpu, cpu);
3477}
3478
3479static void kvm_sched_out(struct preempt_notifier *pn,
3480                          struct task_struct *next)
3481{
3482        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3483
3484        kvm_x86_ops->vcpu_put(vcpu);
3485}
3486
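    /*
     * Common initialization, called by the architecture-specific backend
     * module (kvm-intel or kvm-amd) from its module init.  A backend passes
     * its kvm_x86_ops, the size of its vcpu container and its module,
     * something like (VMX names, shown only as an illustration):
     *
     *      kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
     *
     * Only one backend may be registered at a time; kvm_x86_ops doubles as
     * the "already loaded" marker.
     */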
3487int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3488                  struct module *module)
3489{
3490        int r;
3491        int cpu;
3492
3493        if (kvm_x86_ops) {
3494                printk(KERN_ERR "kvm: already loaded the other module\n");
3495                return -EEXIST;
3496        }
3497
3498        if (!ops->cpu_has_kvm_support()) {
3499                printk(KERN_ERR "kvm: no hardware support\n");
3500                return -EOPNOTSUPP;
3501        }
3502        if (ops->disabled_by_bios()) {
3503                printk(KERN_ERR "kvm: disabled by bios\n");
3504                return -EOPNOTSUPP;
3505        }
3506
3507        kvm_x86_ops = ops;
3508
3509        r = kvm_x86_ops->hardware_setup();
3510        if (r < 0)
3511                goto out;
3512
3513        for_each_online_cpu(cpu) {
3514                smp_call_function_single(cpu,
3515                                kvm_x86_ops->check_processor_compatibility,
3516                                &r, 0, 1);
3517                if (r < 0)
3518                        goto out_free_0;
3519        }
3520
3521        on_each_cpu(hardware_enable, NULL, 0, 1);
3522        r = register_cpu_notifier(&kvm_cpu_notifier);
3523        if (r)
3524                goto out_free_1;
3525        register_reboot_notifier(&kvm_reboot_notifier);
3526
3527        r = sysdev_class_register(&kvm_sysdev_class);
3528        if (r)
3529                goto out_free_2;
3530
3531        r = sysdev_register(&kvm_sysdev);
3532        if (r)
3533                goto out_free_3;
3534
3535        /* A kmem cache lets us meet the alignment requirements of fx_save. */
3536        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3537                                           __alignof__(struct kvm_vcpu), 0, NULL);
3538        if (!kvm_vcpu_cache) {
3539                r = -ENOMEM;
3540                goto out_free_4;
3541        }
3542
3543        kvm_chardev_ops.owner = module;
3544
3545        r = misc_register(&kvm_dev);
3546        if (r) {
3547                printk(KERN_ERR "kvm: misc device register failed\n");
3548                goto out_free;
3549        }
3550
3551        kvm_preempt_ops.sched_in = kvm_sched_in;
3552        kvm_preempt_ops.sched_out = kvm_sched_out;
3553
3554        return r;
3555
3556out_free:
3557        kmem_cache_destroy(kvm_vcpu_cache);
3558out_free_4:
3559        sysdev_unregister(&kvm_sysdev);
3560out_free_3:
3561        sysdev_class_unregister(&kvm_sysdev_class);
3562out_free_2:
3563        unregister_reboot_notifier(&kvm_reboot_notifier);
3564        unregister_cpu_notifier(&kvm_cpu_notifier);
3565out_free_1:
3566        on_each_cpu(hardware_disable, NULL, 0, 1);
3567out_free_0:
3568        kvm_x86_ops->hardware_unsetup();
3569out:
3570        kvm_x86_ops = NULL;
3571        return r;
3572}
3573
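    /* Tear down everything kvm_init_x86() set up, in reverse order. */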
3574void kvm_exit_x86(void)
3575{
3576        misc_deregister(&kvm_dev);
3577        kmem_cache_destroy(kvm_vcpu_cache);
3578        sysdev_unregister(&kvm_sysdev);
3579        sysdev_class_unregister(&kvm_sysdev_class);
3580        unregister_reboot_notifier(&kvm_reboot_notifier);
3581        unregister_cpu_notifier(&kvm_cpu_notifier);
3582        on_each_cpu(hardware_disable, NULL, 0, 1);
3583        kvm_x86_ops->hardware_unsetup();
3584        kvm_x86_ops = NULL;
3585}
3586
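    /*
     * Module init for the core module: set up the MMU module, the debugfs
     * counters and the MSR save list, then allocate and zero the page
     * behind bad_page_address.
     */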
3587static __init int kvm_init(void)
3588{
3589        static struct page *bad_page;
3590        int r;
3591
3592        r = kvm_mmu_module_init();
3593        if (r)
3594                goto out4;
3595
3596        kvm_init_debug();
3597
3598        kvm_init_msr_list();
3599
3600        if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3601                r = -ENOMEM;
3602                goto out;
3603        }
3604
3605        bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3606        memset(__va(bad_page_address), 0, PAGE_SIZE);
3607
3608        return 0;
3609
3610out:
3611        kvm_exit_debug();
3612        kvm_mmu_module_exit();
3613out4:
3614        return r;
3615}
3616
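    /*
     * Module exit: remove the debugfs entries, free the bad page and let
     * the MMU module clean up.
     */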
3617static __exit void kvm_exit(void)
3618{
3619        kvm_exit_debug();
3620        __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3621        kvm_mmu_module_exit();
3622}
3623
3624module_init(kvm_init);
3625module_exit(kvm_exit);
3626
3627EXPORT_SYMBOL_GPL(kvm_init_x86);
3628EXPORT_SYMBOL_GPL(kvm_exit_x86);
3629