linux/arch/x86/kvm/svm.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * AMD SVM support
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   8 *
   9 * Authors:
  10 *   Yaniv Kamay  <yaniv@qumranet.com>
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17#include <linux/kvm_host.h>
  18
  19#include "irq.h"
  20#include "mmu.h"
  21#include "kvm_cache_regs.h"
  22#include "x86.h"
  23
  24#include <linux/module.h>
  25#include <linux/mod_devicetable.h>
  26#include <linux/kernel.h>
  27#include <linux/vmalloc.h>
  28#include <linux/highmem.h>
  29#include <linux/sched.h>
  30#include <linux/ftrace_event.h>
  31#include <linux/slab.h>
  32
  33#include <asm/perf_event.h>
  34#include <asm/tlbflush.h>
  35#include <asm/desc.h>
  36#include <asm/kvm_para.h>
  37
  38#include <asm/virtext.h>
  39#include "trace.h"
  40
  41#define __ex(x) __kvm_handle_fault_on_reboot(x)
  42
  43MODULE_AUTHOR("Qumranet");
  44MODULE_LICENSE("GPL");
  45
  46static const struct x86_cpu_id svm_cpu_id[] = {
  47        X86_FEATURE_MATCH(X86_FEATURE_SVM),
  48        {}
  49};
  50MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  51
  52#define IOPM_ALLOC_ORDER 2
  53#define MSRPM_ALLOC_ORDER 1
  54
  55#define SEG_TYPE_LDT 2
  56#define SEG_TYPE_BUSY_TSS16 3
  57
  58#define SVM_FEATURE_NPT            (1 <<  0)
  59#define SVM_FEATURE_LBRV           (1 <<  1)
  60#define SVM_FEATURE_SVML           (1 <<  2)
  61#define SVM_FEATURE_NRIP           (1 <<  3)
  62#define SVM_FEATURE_TSC_RATE       (1 <<  4)
  63#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
  64#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
  65#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  66#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  67
  68#define NESTED_EXIT_HOST        0       /* Exit handled on host level */
  69#define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
  70#define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
  71
  72#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
  73
  74#define TSC_RATIO_RSVD          0xffffff0000000000ULL
  75#define TSC_RATIO_MIN           0x0000000000000001ULL
  76#define TSC_RATIO_MAX           0x000000ffffffffffULL
  77
  78static bool erratum_383_found __read_mostly;
  79
  80static const u32 host_save_user_msrs[] = {
  81#ifdef CONFIG_X86_64
  82        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
  83        MSR_FS_BASE,
  84#endif
  85        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
  86};
  87
  88#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
  89
  90struct kvm_vcpu;
  91
  92struct nested_state {
  93        struct vmcb *hsave;
  94        u64 hsave_msr;
  95        u64 vm_cr_msr;
  96        u64 vmcb;
  97
  98        /* These are the merged vectors */
  99        u32 *msrpm;
 100
 101        /* gpa pointers to the real vectors */
 102        u64 vmcb_msrpm;
 103        u64 vmcb_iopm;
 104
 105        /* A VMEXIT is required but not yet emulated */
 106        bool exit_required;
 107
 108        /* cache for intercepts of the guest */
 109        u32 intercept_cr;
 110        u32 intercept_dr;
 111        u32 intercept_exceptions;
 112        u64 intercept;
 113
 114        /* Nested Paging related state */
 115        u64 nested_cr3;
 116};
 117
 118#define MSRPM_OFFSETS   16
 119static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 120
 121/*
  122 * Set osvw_len to a higher value when updated Revision Guides
 123 * are published and we know what the new status bits are
 124 */
 125static uint64_t osvw_len = 4, osvw_status;
 126
 127struct vcpu_svm {
 128        struct kvm_vcpu vcpu;
 129        struct vmcb *vmcb;
 130        unsigned long vmcb_pa;
 131        struct svm_cpu_data *svm_data;
 132        uint64_t asid_generation;
 133        uint64_t sysenter_esp;
 134        uint64_t sysenter_eip;
 135
 136        u64 next_rip;
 137
 138        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 139        struct {
 140                u16 fs;
 141                u16 gs;
 142                u16 ldt;
 143                u64 gs_base;
 144        } host;
 145
 146        u32 *msrpm;
 147
 148        ulong nmi_iret_rip;
 149
 150        struct nested_state nested;
 151
 152        bool nmi_singlestep;
 153
 154        unsigned int3_injected;
 155        unsigned long int3_rip;
 156        u32 apf_reason;
 157
 158        u64  tsc_ratio;
 159};
 160
 161static DEFINE_PER_CPU(u64, current_tsc_ratio);
 162#define TSC_RATIO_DEFAULT       0x0100000000ULL
 163
 164#define MSR_INVALID                     0xffffffffU
 165
 166static struct svm_direct_access_msrs {
 167        u32 index;   /* Index of the MSR */
 168        bool always; /* True if intercept is always on */
 169} direct_access_msrs[] = {
 170        { .index = MSR_STAR,                            .always = true  },
 171        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
 172#ifdef CONFIG_X86_64
 173        { .index = MSR_GS_BASE,                         .always = true  },
 174        { .index = MSR_FS_BASE,                         .always = true  },
 175        { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
 176        { .index = MSR_LSTAR,                           .always = true  },
 177        { .index = MSR_CSTAR,                           .always = true  },
 178        { .index = MSR_SYSCALL_MASK,                    .always = true  },
 179#endif
 180        { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 181        { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 182        { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 183        { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 184        { .index = MSR_INVALID,                         .always = false },
 185};
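
     /*
      * Note: entries with .always = true are opened up in every vCPU's MSR
      * permission bitmap by svm_vcpu_init_msrpm(), so reads and writes of
      * those MSRs never cause a #VMEXIT; the remaining entries are only
      * passed through on demand (e.g. the LBR MSRs in svm_enable_lbrv()).
      */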
 186
 187/* enable NPT for AMD64 and X86 with PAE */
 188#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 189static bool npt_enabled = true;
 190#else
 191static bool npt_enabled;
 192#endif
 193
 194/* allow nested paging (virtualized MMU) for all guests */
 195static int npt = true;
 196module_param(npt, int, S_IRUGO);
 197
 198/* allow nested virtualization in KVM/SVM */
 199static int nested = true;
 200module_param(nested, int, S_IRUGO);
 201
 202static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 203static void svm_complete_interrupts(struct vcpu_svm *svm);
 204
 205static int nested_svm_exit_handled(struct vcpu_svm *svm);
 206static int nested_svm_intercept(struct vcpu_svm *svm);
 207static int nested_svm_vmexit(struct vcpu_svm *svm);
 208static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 209                                      bool has_error_code, u32 error_code);
 210static u64 __scale_tsc(u64 ratio, u64 tsc);
 211
 212enum {
 213        VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 214                            pause filter count */
 215        VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 216        VMCB_ASID,       /* ASID */
 217        VMCB_INTR,       /* int_ctl, int_vector */
 218        VMCB_NPT,        /* npt_en, nCR3, gPAT */
 219        VMCB_CR,         /* CR0, CR3, CR4, EFER */
 220        VMCB_DR,         /* DR6, DR7 */
 221        VMCB_DT,         /* GDT, IDT */
 222        VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 223        VMCB_CR2,        /* CR2 only */
 224        VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
 225        VMCB_DIRTY_MAX,
 226};
 227
 228/* TPR and CR2 are always written before VMRUN */
 229#define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
 230
 231static inline void mark_all_dirty(struct vmcb *vmcb)
 232{
 233        vmcb->control.clean = 0;
 234}
 235
 236static inline void mark_all_clean(struct vmcb *vmcb)
 237{
 238        vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
 239                               & ~VMCB_ALWAYS_DIRTY_MASK;
 240}
 241
 242static inline void mark_dirty(struct vmcb *vmcb, int bit)
 243{
 244        vmcb->control.clean &= ~(1 << bit);
 245}
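
     /*
      * VMCB clean bits, in short: a set bit tells the CPU that the
      * corresponding VMCB area is unchanged since the last VMRUN and may be
      * reused from its internal cache.  mark_dirty() clears a bit to force a
      * reload of that area; mark_all_dirty() forces a full reload, e.g. after
      * the vCPU migrates to another physical CPU.
      */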
 246
 247static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 248{
 249        return container_of(vcpu, struct vcpu_svm, vcpu);
 250}
 251
 252static void recalc_intercepts(struct vcpu_svm *svm)
 253{
 254        struct vmcb_control_area *c, *h;
 255        struct nested_state *g;
 256
 257        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 258
 259        if (!is_guest_mode(&svm->vcpu))
 260                return;
 261
 262        c = &svm->vmcb->control;
 263        h = &svm->nested.hsave->control;
 264        g = &svm->nested;
 265
 266        c->intercept_cr = h->intercept_cr | g->intercept_cr;
 267        c->intercept_dr = h->intercept_dr | g->intercept_dr;
 268        c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 269        c->intercept = h->intercept | g->intercept;
 270}
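
     /*
      * While a nested guest runs, the VMCB that is actually executed must
      * trap everything that either KVM itself or the L1 hypervisor wants to
      * intercept, hence the bitwise OR of the host (hsave) intercepts and the
      * cached nested intercepts above.
      */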
 271
 272static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
 273{
 274        if (is_guest_mode(&svm->vcpu))
 275                return svm->nested.hsave;
 276        else
 277                return svm->vmcb;
 278}
 279
 280static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
 281{
 282        struct vmcb *vmcb = get_host_vmcb(svm);
 283
 284        vmcb->control.intercept_cr |= (1U << bit);
 285
 286        recalc_intercepts(svm);
 287}
 288
 289static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
 290{
 291        struct vmcb *vmcb = get_host_vmcb(svm);
 292
 293        vmcb->control.intercept_cr &= ~(1U << bit);
 294
 295        recalc_intercepts(svm);
 296}
 297
 298static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 299{
 300        struct vmcb *vmcb = get_host_vmcb(svm);
 301
 302        return vmcb->control.intercept_cr & (1U << bit);
 303}
 304
 305static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
 306{
 307        struct vmcb *vmcb = get_host_vmcb(svm);
 308
 309        vmcb->control.intercept_dr |= (1U << bit);
 310
 311        recalc_intercepts(svm);
 312}
 313
 314static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
 315{
 316        struct vmcb *vmcb = get_host_vmcb(svm);
 317
 318        vmcb->control.intercept_dr &= ~(1U << bit);
 319
 320        recalc_intercepts(svm);
 321}
 322
 323static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
 324{
 325        struct vmcb *vmcb = get_host_vmcb(svm);
 326
 327        vmcb->control.intercept_exceptions |= (1U << bit);
 328
 329        recalc_intercepts(svm);
 330}
 331
 332static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 333{
 334        struct vmcb *vmcb = get_host_vmcb(svm);
 335
 336        vmcb->control.intercept_exceptions &= ~(1U << bit);
 337
 338        recalc_intercepts(svm);
 339}
 340
 341static inline void set_intercept(struct vcpu_svm *svm, int bit)
 342{
 343        struct vmcb *vmcb = get_host_vmcb(svm);
 344
 345        vmcb->control.intercept |= (1ULL << bit);
 346
 347        recalc_intercepts(svm);
 348}
 349
 350static inline void clr_intercept(struct vcpu_svm *svm, int bit)
 351{
 352        struct vmcb *vmcb = get_host_vmcb(svm);
 353
 354        vmcb->control.intercept &= ~(1ULL << bit);
 355
 356        recalc_intercepts(svm);
 357}
 358
 359static inline void enable_gif(struct vcpu_svm *svm)
 360{
 361        svm->vcpu.arch.hflags |= HF_GIF_MASK;
 362}
 363
 364static inline void disable_gif(struct vcpu_svm *svm)
 365{
 366        svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
 367}
 368
 369static inline bool gif_set(struct vcpu_svm *svm)
 370{
 371        return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 372}
 373
 374static unsigned long iopm_base;
 375
 376struct kvm_ldttss_desc {
 377        u16 limit0;
 378        u16 base0;
 379        unsigned base1:8, type:5, dpl:2, p:1;
 380        unsigned limit1:4, zero0:3, g:1, base2:8;
 381        u32 base3;
 382        u32 zero1;
 383} __attribute__((packed));
 384
 385struct svm_cpu_data {
 386        int cpu;
 387
 388        u64 asid_generation;
 389        u32 max_asid;
 390        u32 next_asid;
 391        struct kvm_ldttss_desc *tss_desc;
 392
 393        struct page *save_area;
 394};
 395
 396static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 397
 398struct svm_init_data {
 399        int cpu;
 400        int r;
 401};
 402
 403static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 404
 405#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 406#define MSRS_RANGE_SIZE 2048
 407#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 408
 409static u32 svm_msrpm_offset(u32 msr)
 410{
 411        u32 offset;
 412        int i;
 413
 414        for (i = 0; i < NUM_MSR_MAPS; i++) {
 415                if (msr < msrpm_ranges[i] ||
 416                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 417                        continue;
 418
 419                offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 420                offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 421
 422                /* Now we have the u8 offset - but need the u32 offset */
 423                return offset / 4;
 424        }
 425
 426        /* MSR not in any range */
 427        return MSR_INVALID;
 428}
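
     /*
      * Worked example: MSR_LSTAR (0xc0000082) lies in the second range
      * (base 0xc0000000), so the byte offset is (0x82 / 4) + 2048 = 2080
      * (two permission bits per MSR, four MSRs per byte), and the returned
      * u32 offset is 2080 / 4 = 520.
      */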
 429
 430#define MAX_INST_SIZE 15
 431
 432static inline void clgi(void)
 433{
 434        asm volatile (__ex(SVM_CLGI));
 435}
 436
 437static inline void stgi(void)
 438{
 439        asm volatile (__ex(SVM_STGI));
 440}
 441
 442static inline void invlpga(unsigned long addr, u32 asid)
 443{
 444        asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 445}
 446
 447static int get_npt_level(void)
 448{
 449#ifdef CONFIG_X86_64
 450        return PT64_ROOT_LEVEL;
 451#else
 452        return PT32E_ROOT_LEVEL;
 453#endif
 454}
 455
 456static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 457{
 458        vcpu->arch.efer = efer;
 459        if (!npt_enabled && !(efer & EFER_LMA))
 460                efer &= ~EFER_LME;
 461
 462        to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
 463        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 464}
 465
 466static int is_external_interrupt(u32 info)
 467{
 468        info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 469        return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 470}
 471
 472static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 473{
 474        struct vcpu_svm *svm = to_svm(vcpu);
 475        u32 ret = 0;
 476
 477        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 478                ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 479        return ret & mask;
 480}
 481
 482static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 483{
 484        struct vcpu_svm *svm = to_svm(vcpu);
 485
 486        if (mask == 0)
 487                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 488        else
 489                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 490
 491}
 492
 493static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 494{
 495        struct vcpu_svm *svm = to_svm(vcpu);
 496
 497        if (svm->vmcb->control.next_rip != 0)
 498                svm->next_rip = svm->vmcb->control.next_rip;
 499
 500        if (!svm->next_rip) {
 501                if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
 502                                EMULATE_DONE)
 503                        printk(KERN_DEBUG "%s: NOP\n", __func__);
 504                return;
 505        }
 506        if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
 507                printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
 508                       __func__, kvm_rip_read(vcpu), svm->next_rip);
 509
 510        kvm_rip_write(vcpu, svm->next_rip);
 511        svm_set_interrupt_shadow(vcpu, 0);
 512}
 513
 514static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 515                                bool has_error_code, u32 error_code,
 516                                bool reinject)
 517{
 518        struct vcpu_svm *svm = to_svm(vcpu);
 519
 520        /*
 521         * If we are within a nested VM we'd better #VMEXIT and let the guest
 522         * handle the exception
 523         */
 524        if (!reinject &&
 525            nested_svm_check_exception(svm, nr, has_error_code, error_code))
 526                return;
 527
 528        if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 529                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 530
 531                /*
 532                 * For guest debugging where we have to reinject #BP if some
 533                 * INT3 is guest-owned:
 534                 * Emulate nRIP by moving RIP forward. Will fail if injection
 535                 * raises a fault that is not intercepted. Still better than
 536                 * failing in all cases.
 537                 */
 538                skip_emulated_instruction(&svm->vcpu);
 539                rip = kvm_rip_read(&svm->vcpu);
 540                svm->int3_rip = rip + svm->vmcb->save.cs.base;
 541                svm->int3_injected = rip - old_rip;
 542        }
 543
 544        svm->vmcb->control.event_inj = nr
 545                | SVM_EVTINJ_VALID
 546                | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 547                | SVM_EVTINJ_TYPE_EXEPT;
 548        svm->vmcb->control.event_inj_err = error_code;
 549}
 550
 551static void svm_init_erratum_383(void)
 552{
 553        u32 low, high;
 554        int err;
 555        u64 val;
 556
 557        if (!cpu_has_amd_erratum(amd_erratum_383))
 558                return;
 559
 560        /* Use _safe variants to not break nested virtualization */
 561        val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 562        if (err)
 563                return;
 564
 565        val |= (1ULL << 47);
 566
 567        low  = lower_32_bits(val);
 568        high = upper_32_bits(val);
 569
 570        native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 571
 572        erratum_383_found = true;
 573}
 574
 575static void svm_init_osvw(struct kvm_vcpu *vcpu)
 576{
 577        /*
 578         * Guests should see errata 400 and 415 as fixed (assuming that
 579         * HLT and IO instructions are intercepted).
 580         */
 581        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 582        vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 583
 584        /*
 585         * By increasing VCPU's osvw.length to 3 we are telling the guest that
 586         * all osvw.status bits inside that length, including bit 0 (which is
 587         * reserved for erratum 298), are valid. However, if host processor's
 588         * osvw_len is 0 then osvw_status[0] carries no information. We need to
 589         * be conservative here and therefore we tell the guest that erratum 298
 590         * is present (because we really don't know).
 591         */
 592        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 593                vcpu->arch.osvw.status |= 1;
 594}
 595
 596static int has_svm(void)
 597{
 598        const char *msg;
 599
 600        if (!cpu_has_svm(&msg)) {
 601                printk(KERN_INFO "has_svm: %s\n", msg);
 602                return 0;
 603        }
 604
 605        return 1;
 606}
 607
 608static void svm_hardware_disable(void *garbage)
 609{
 610        /* Make sure we clean up behind us */
 611        if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
 612                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 613
 614        cpu_svm_disable();
 615
 616        amd_pmu_disable_virt();
 617}
 618
 619static int svm_hardware_enable(void *garbage)
 620{
 621
 622        struct svm_cpu_data *sd;
 623        uint64_t efer;
 624        struct desc_ptr gdt_descr;
 625        struct desc_struct *gdt;
 626        int me = raw_smp_processor_id();
 627
 628        rdmsrl(MSR_EFER, efer);
 629        if (efer & EFER_SVME)
 630                return -EBUSY;
 631
 632        if (!has_svm()) {
 633                printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
 634                       me);
 635                return -EINVAL;
 636        }
 637        sd = per_cpu(svm_data, me);
 638
 639        if (!sd) {
 640                printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
 641                       me);
 642                return -EINVAL;
 643        }
 644
 645        sd->asid_generation = 1;
 646        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 647        sd->next_asid = sd->max_asid + 1;
 648
 649        native_store_gdt(&gdt_descr);
 650        gdt = (struct desc_struct *)gdt_descr.address;
 651        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 652
 653        wrmsrl(MSR_EFER, efer | EFER_SVME);
 654
 655        wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 656
 657        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 658                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 659                __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
 660        }
 661
 662
 663        /*
 664         * Get OSVW bits.
 665         *
 666         * Note that it is possible to have a system with mixed processor
 667         * revisions and therefore different OSVW bits. If bits are not the same
 668         * on different processors then choose the worst case (i.e. if erratum
 669         * is present on one processor and not on another then assume that the
 670         * erratum is present everywhere).
 671         */
 672        if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 673                uint64_t len, status = 0;
 674                int err;
 675
 676                len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 677                if (!err)
 678                        status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 679                                                      &err);
 680
 681                if (err)
 682                        osvw_status = osvw_len = 0;
 683                else {
 684                        if (len < osvw_len)
 685                                osvw_len = len;
 686                        osvw_status |= status;
 687                        osvw_status &= (1ULL << osvw_len) - 1;
 688                }
 689        } else
 690                osvw_status = osvw_len = 0;
 691
 692        svm_init_erratum_383();
 693
 694        amd_pmu_enable_virt();
 695
 696        return 0;
 697}
 698
 699static void svm_cpu_uninit(int cpu)
 700{
 701        struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
 702
 703        if (!sd)
 704                return;
 705
 706        per_cpu(svm_data, raw_smp_processor_id()) = NULL;
 707        __free_page(sd->save_area);
 708        kfree(sd);
 709}
 710
 711static int svm_cpu_init(int cpu)
 712{
 713        struct svm_cpu_data *sd;
 714        int r;
 715
 716        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 717        if (!sd)
 718                return -ENOMEM;
 719        sd->cpu = cpu;
 720        sd->save_area = alloc_page(GFP_KERNEL);
 721        r = -ENOMEM;
 722        if (!sd->save_area)
 723                goto err_1;
 724
 725        per_cpu(svm_data, cpu) = sd;
 726
 727        return 0;
 728
 729err_1:
 730        kfree(sd);
 731        return r;
 732
 733}
 734
 735static bool valid_msr_intercept(u32 index)
 736{
 737        int i;
 738
 739        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 740                if (direct_access_msrs[i].index == index)
 741                        return true;
 742
 743        return false;
 744}
 745
 746static void set_msr_interception(u32 *msrpm, unsigned msr,
 747                                 int read, int write)
 748{
 749        u8 bit_read, bit_write;
 750        unsigned long tmp;
 751        u32 offset;
 752
 753        /*
  754         * If this warning triggers, extend the direct_access_msrs list at the
 755         * beginning of the file
 756         */
 757        WARN_ON(!valid_msr_intercept(msr));
 758
 759        offset    = svm_msrpm_offset(msr);
 760        bit_read  = 2 * (msr & 0x0f);
 761        bit_write = 2 * (msr & 0x0f) + 1;
 762        tmp       = msrpm[offset];
 763
 764        BUG_ON(offset == MSR_INVALID);
 765
 766        read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 767        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 768
 769        msrpm[offset] = tmp;
 770}
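
     /*
      * Layout reminder: each MSR occupies two adjacent bits in the permission
      * bitmap - an even "read" bit and an odd "write" bit - so one u32 covers
      * 16 MSRs (hence the "msr & 0x0f" above).  A set bit means the access is
      * intercepted; passing read=1/write=1 therefore clears both bits and
      * gives the guest direct access to the MSR.
      */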
 771
 772static void svm_vcpu_init_msrpm(u32 *msrpm)
 773{
 774        int i;
 775
 776        memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 777
 778        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 779                if (!direct_access_msrs[i].always)
 780                        continue;
 781
 782                set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
 783        }
 784}
 785
 786static void add_msr_offset(u32 offset)
 787{
 788        int i;
 789
 790        for (i = 0; i < MSRPM_OFFSETS; ++i) {
 791
 792                /* Offset already in list? */
 793                if (msrpm_offsets[i] == offset)
 794                        return;
 795
 796                /* Slot used by another offset? */
 797                if (msrpm_offsets[i] != MSR_INVALID)
 798                        continue;
 799
 800                /* Add offset to list */
 801                msrpm_offsets[i] = offset;
 802
 803                return;
 804        }
 805
 806        /*
  807         * If this BUG triggers, the msrpm_offsets table has overflowed. Just
 808         * increase MSRPM_OFFSETS in this case.
 809         */
 810        BUG();
 811}
 812
 813static void init_msrpm_offsets(void)
 814{
 815        int i;
 816
 817        memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 818
 819        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 820                u32 offset;
 821
 822                offset = svm_msrpm_offset(direct_access_msrs[i].index);
 823                BUG_ON(offset == MSR_INVALID);
 824
 825                add_msr_offset(offset);
 826        }
 827}
 828
 829static void svm_enable_lbrv(struct vcpu_svm *svm)
 830{
 831        u32 *msrpm = svm->msrpm;
 832
 833        svm->vmcb->control.lbr_ctl = 1;
 834        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 835        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 836        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 837        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 838}
 839
 840static void svm_disable_lbrv(struct vcpu_svm *svm)
 841{
 842        u32 *msrpm = svm->msrpm;
 843
 844        svm->vmcb->control.lbr_ctl = 0;
 845        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 846        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 847        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 848        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 849}
 850
 851static __init int svm_hardware_setup(void)
 852{
 853        int cpu;
 854        struct page *iopm_pages;
 855        void *iopm_va;
 856        int r;
 857
 858        iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
 859
 860        if (!iopm_pages)
 861                return -ENOMEM;
 862
 863        iopm_va = page_address(iopm_pages);
 864        memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
 865        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 866
 867        init_msrpm_offsets();
 868
 869        if (boot_cpu_has(X86_FEATURE_NX))
 870                kvm_enable_efer_bits(EFER_NX);
 871
 872        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
 873                kvm_enable_efer_bits(EFER_FFXSR);
 874
 875        if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 876                u64 max;
 877
 878                kvm_has_tsc_control = true;
 879
 880                /*
 881                 * Make sure the user can only configure tsc_khz values that
 882                 * fit into a signed integer.
  883                 * A min value need not be calculated because it will always
 884                 * be 1 on all machines and a value of 0 is used to disable
 885                 * tsc-scaling for the vcpu.
 886                 */
 887                max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
 888
 889                kvm_max_guest_tsc_khz = max;
 890        }
 891
 892        if (nested) {
 893                printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
 894                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 895        }
 896
 897        for_each_possible_cpu(cpu) {
 898                r = svm_cpu_init(cpu);
 899                if (r)
 900                        goto err;
 901        }
 902
 903        if (!boot_cpu_has(X86_FEATURE_NPT))
 904                npt_enabled = false;
 905
 906        if (npt_enabled && !npt) {
 907                printk(KERN_INFO "kvm: Nested Paging disabled\n");
 908                npt_enabled = false;
 909        }
 910
 911        if (npt_enabled) {
 912                printk(KERN_INFO "kvm: Nested Paging enabled\n");
 913                kvm_enable_tdp();
 914        } else
 915                kvm_disable_tdp();
 916
 917        return 0;
 918
 919err:
 920        __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
 921        iopm_base = 0;
 922        return r;
 923}
 924
 925static __exit void svm_hardware_unsetup(void)
 926{
 927        int cpu;
 928
 929        for_each_possible_cpu(cpu)
 930                svm_cpu_uninit(cpu);
 931
 932        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
 933        iopm_base = 0;
 934}
 935
 936static void init_seg(struct vmcb_seg *seg)
 937{
 938        seg->selector = 0;
 939        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
 940                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
 941        seg->limit = 0xffff;
 942        seg->base = 0;
 943}
 944
 945static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 946{
 947        seg->selector = 0;
 948        seg->attrib = SVM_SELECTOR_P_MASK | type;
 949        seg->limit = 0xffff;
 950        seg->base = 0;
 951}
 952
 953static u64 __scale_tsc(u64 ratio, u64 tsc)
 954{
 955        u64 mult, frac, _tsc;
 956
 957        mult  = ratio >> 32;
 958        frac  = ratio & ((1ULL << 32) - 1);
 959
 960        _tsc  = tsc;
 961        _tsc *= mult;
 962        _tsc += (tsc >> 32) * frac;
 963        _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
 964
 965        return _tsc;
 966}
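
     /*
      * The ratio is a fixed-point number with a 32-bit integer part ("mult")
      * and a 32-bit fractional part ("frac"); the three additions above
      * compute tsc * ratio / 2^32 without needing a 128-bit intermediate.
      * For illustration: ratio = 0x0180000000 (1.5) scales a tsc of
      * 0x200000000 to 0x300000000.
      */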
 967
 968static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 969{
 970        struct vcpu_svm *svm = to_svm(vcpu);
 971        u64 _tsc = tsc;
 972
 973        if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
 974                _tsc = __scale_tsc(svm->tsc_ratio, tsc);
 975
 976        return _tsc;
 977}
 978
 979static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 980{
 981        struct vcpu_svm *svm = to_svm(vcpu);
 982        u64 ratio;
 983        u64 khz;
 984
 985        /* Guest TSC same frequency as host TSC? */
 986        if (!scale) {
 987                svm->tsc_ratio = TSC_RATIO_DEFAULT;
 988                return;
 989        }
 990
 991        /* TSC scaling supported? */
 992        if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 993                if (user_tsc_khz > tsc_khz) {
 994                        vcpu->arch.tsc_catchup = 1;
 995                        vcpu->arch.tsc_always_catchup = 1;
 996                } else
 997                        WARN(1, "user requested TSC rate below hardware speed\n");
 998                return;
 999        }
1000
1001        khz = user_tsc_khz;
1002
 1003        /* TSC scaling required - calculate ratio */
1004        ratio = khz << 32;
1005        do_div(ratio, tsc_khz);
1006
1007        if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
1008                WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
1009                                user_tsc_khz);
1010                return;
1011        }
1012        svm->tsc_ratio             = ratio;
1013}
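
     /*
      * Example of the ratio calculation above: for a guest frequency of
      * 1500000 kHz on a 3000000 kHz host, ratio = (1500000 << 32) / 3000000
      * = 0x80000000, i.e. 0.5 in the 8.32 fixed-point format used by
      * MSR_AMD64_TSC_RATIO.
      */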
1014
1015static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1016{
1017        struct vcpu_svm *svm = to_svm(vcpu);
1018        u64 g_tsc_offset = 0;
1019
1020        if (is_guest_mode(vcpu)) {
1021                g_tsc_offset = svm->vmcb->control.tsc_offset -
1022                               svm->nested.hsave->control.tsc_offset;
1023                svm->nested.hsave->control.tsc_offset = offset;
1024        }
1025
1026        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1027
1028        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1029}
1030
1031static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
1032{
1033        struct vcpu_svm *svm = to_svm(vcpu);
1034
1035        WARN_ON(adjustment < 0);
1036        if (host)
1037                adjustment = svm_scale_tsc(vcpu, adjustment);
1038
1039        svm->vmcb->control.tsc_offset += adjustment;
1040        if (is_guest_mode(vcpu))
1041                svm->nested.hsave->control.tsc_offset += adjustment;
1042        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1043}
1044
1045static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1046{
1047        u64 tsc;
1048
1049        tsc = svm_scale_tsc(vcpu, native_read_tsc());
1050
1051        return target_tsc - tsc;
1052}
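
     /*
      * With TSC ratio scaling the guest effectively observes
      * scale(host_tsc) + tsc_offset, so returning target_tsc minus the
      * scaled host TSC yields the offset that makes the guest TSC read
      * target_tsc at this instant.
      */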
1053
1054static void init_vmcb(struct vcpu_svm *svm)
1055{
1056        struct vmcb_control_area *control = &svm->vmcb->control;
1057        struct vmcb_save_area *save = &svm->vmcb->save;
1058
1059        svm->vcpu.fpu_active = 1;
1060        svm->vcpu.arch.hflags = 0;
1061
1062        set_cr_intercept(svm, INTERCEPT_CR0_READ);
1063        set_cr_intercept(svm, INTERCEPT_CR3_READ);
1064        set_cr_intercept(svm, INTERCEPT_CR4_READ);
1065        set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1066        set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1067        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1068        set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1069
1070        set_dr_intercept(svm, INTERCEPT_DR0_READ);
1071        set_dr_intercept(svm, INTERCEPT_DR1_READ);
1072        set_dr_intercept(svm, INTERCEPT_DR2_READ);
1073        set_dr_intercept(svm, INTERCEPT_DR3_READ);
1074        set_dr_intercept(svm, INTERCEPT_DR4_READ);
1075        set_dr_intercept(svm, INTERCEPT_DR5_READ);
1076        set_dr_intercept(svm, INTERCEPT_DR6_READ);
1077        set_dr_intercept(svm, INTERCEPT_DR7_READ);
1078
1079        set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
1080        set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
1081        set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
1082        set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
1083        set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
1084        set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
1085        set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
1086        set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
1087
1088        set_exception_intercept(svm, PF_VECTOR);
1089        set_exception_intercept(svm, UD_VECTOR);
1090        set_exception_intercept(svm, MC_VECTOR);
1091
1092        set_intercept(svm, INTERCEPT_INTR);
1093        set_intercept(svm, INTERCEPT_NMI);
1094        set_intercept(svm, INTERCEPT_SMI);
1095        set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1096        set_intercept(svm, INTERCEPT_RDPMC);
1097        set_intercept(svm, INTERCEPT_CPUID);
1098        set_intercept(svm, INTERCEPT_INVD);
1099        set_intercept(svm, INTERCEPT_HLT);
1100        set_intercept(svm, INTERCEPT_INVLPG);
1101        set_intercept(svm, INTERCEPT_INVLPGA);
1102        set_intercept(svm, INTERCEPT_IOIO_PROT);
1103        set_intercept(svm, INTERCEPT_MSR_PROT);
1104        set_intercept(svm, INTERCEPT_TASK_SWITCH);
1105        set_intercept(svm, INTERCEPT_SHUTDOWN);
1106        set_intercept(svm, INTERCEPT_VMRUN);
1107        set_intercept(svm, INTERCEPT_VMMCALL);
1108        set_intercept(svm, INTERCEPT_VMLOAD);
1109        set_intercept(svm, INTERCEPT_VMSAVE);
1110        set_intercept(svm, INTERCEPT_STGI);
1111        set_intercept(svm, INTERCEPT_CLGI);
1112        set_intercept(svm, INTERCEPT_SKINIT);
1113        set_intercept(svm, INTERCEPT_WBINVD);
1114        set_intercept(svm, INTERCEPT_MONITOR);
1115        set_intercept(svm, INTERCEPT_MWAIT);
1116        set_intercept(svm, INTERCEPT_XSETBV);
1117
1118        control->iopm_base_pa = iopm_base;
1119        control->msrpm_base_pa = __pa(svm->msrpm);
1120        control->int_ctl = V_INTR_MASKING_MASK;
1121
1122        init_seg(&save->es);
1123        init_seg(&save->ss);
1124        init_seg(&save->ds);
1125        init_seg(&save->fs);
1126        init_seg(&save->gs);
1127
1128        save->cs.selector = 0xf000;
1129        /* Executable/Readable Code Segment */
1130        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1131                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1132        save->cs.limit = 0xffff;
1133        /*
1134         * cs.base should really be 0xffff0000, but vmx can't handle that, so
1135         * be consistent with it.
1136         *
1137         * Replace when we have real mode working for vmx.
1138         */
1139        save->cs.base = 0xf0000;
1140
1141        save->gdtr.limit = 0xffff;
1142        save->idtr.limit = 0xffff;
1143
1144        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1145        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1146
1147        svm_set_efer(&svm->vcpu, 0);
1148        save->dr6 = 0xffff0ff0;
1149        save->dr7 = 0x400;
1150        kvm_set_rflags(&svm->vcpu, 2);
1151        save->rip = 0x0000fff0;
1152        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1153
1154        /*
1155         * This is the guest-visible cr0 value.
1156         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1157         */
1158        svm->vcpu.arch.cr0 = 0;
1159        (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1160
1161        save->cr4 = X86_CR4_PAE;
1162        /* rdx = ?? */
1163
1164        if (npt_enabled) {
1165                /* Setup VMCB for Nested Paging */
1166                control->nested_ctl = 1;
1167                clr_intercept(svm, INTERCEPT_INVLPG);
1168                clr_exception_intercept(svm, PF_VECTOR);
1169                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1170                clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1171                save->g_pat = 0x0007040600070406ULL;
1172                save->cr3 = 0;
1173                save->cr4 = 0;
1174        }
1175        svm->asid_generation = 0;
1176
1177        svm->nested.vmcb = 0;
1178        svm->vcpu.arch.hflags = 0;
1179
1180        if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1181                control->pause_filter_count = 3000;
1182                set_intercept(svm, INTERCEPT_PAUSE);
1183        }
1184
1185        mark_all_dirty(svm->vmcb);
1186
1187        enable_gif(svm);
1188}
1189
1190static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1191{
1192        struct vcpu_svm *svm = to_svm(vcpu);
1193
1194        init_vmcb(svm);
1195
1196        if (!kvm_vcpu_is_bsp(vcpu)) {
1197                kvm_rip_write(vcpu, 0);
1198                svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1199                svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1200        }
1201        vcpu->arch.regs_avail = ~0;
1202        vcpu->arch.regs_dirty = ~0;
1203
1204        return 0;
1205}
1206
1207static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1208{
1209        struct vcpu_svm *svm;
1210        struct page *page;
1211        struct page *msrpm_pages;
1212        struct page *hsave_page;
1213        struct page *nested_msrpm_pages;
1214        int err;
1215
1216        svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1217        if (!svm) {
1218                err = -ENOMEM;
1219                goto out;
1220        }
1221
1222        svm->tsc_ratio = TSC_RATIO_DEFAULT;
1223
1224        err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1225        if (err)
1226                goto free_svm;
1227
1228        err = -ENOMEM;
1229        page = alloc_page(GFP_KERNEL);
1230        if (!page)
1231                goto uninit;
1232
1233        msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1234        if (!msrpm_pages)
1235                goto free_page1;
1236
1237        nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1238        if (!nested_msrpm_pages)
1239                goto free_page2;
1240
1241        hsave_page = alloc_page(GFP_KERNEL);
1242        if (!hsave_page)
1243                goto free_page3;
1244
1245        svm->nested.hsave = page_address(hsave_page);
1246
1247        svm->msrpm = page_address(msrpm_pages);
1248        svm_vcpu_init_msrpm(svm->msrpm);
1249
1250        svm->nested.msrpm = page_address(nested_msrpm_pages);
1251        svm_vcpu_init_msrpm(svm->nested.msrpm);
1252
1253        svm->vmcb = page_address(page);
1254        clear_page(svm->vmcb);
1255        svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1256        svm->asid_generation = 0;
1257        init_vmcb(svm);
1258        kvm_write_tsc(&svm->vcpu, 0);
1259
1260        err = fx_init(&svm->vcpu);
1261        if (err)
1262                goto free_page4;
1263
1264        svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1265        if (kvm_vcpu_is_bsp(&svm->vcpu))
1266                svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1267
1268        svm_init_osvw(&svm->vcpu);
1269
1270        return &svm->vcpu;
1271
1272free_page4:
1273        __free_page(hsave_page);
1274free_page3:
1275        __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1276free_page2:
1277        __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1278free_page1:
1279        __free_page(page);
1280uninit:
1281        kvm_vcpu_uninit(&svm->vcpu);
1282free_svm:
1283        kmem_cache_free(kvm_vcpu_cache, svm);
1284out:
1285        return ERR_PTR(err);
1286}
1287
1288static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1289{
1290        struct vcpu_svm *svm = to_svm(vcpu);
1291
1292        __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1293        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1294        __free_page(virt_to_page(svm->nested.hsave));
1295        __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1296        kvm_vcpu_uninit(vcpu);
1297        kmem_cache_free(kvm_vcpu_cache, svm);
1298}
1299
1300static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1301{
1302        struct vcpu_svm *svm = to_svm(vcpu);
1303        int i;
1304
1305        if (unlikely(cpu != vcpu->cpu)) {
1306                svm->asid_generation = 0;
1307                mark_all_dirty(svm->vmcb);
1308        }
1309
1310#ifdef CONFIG_X86_64
1311        rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1312#endif
1313        savesegment(fs, svm->host.fs);
1314        savesegment(gs, svm->host.gs);
1315        svm->host.ldt = kvm_read_ldt();
1316
1317        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1318                rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1319
1320        if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
1321            svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
1322                __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
1323                wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
1324        }
1325}
1326
1327static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1328{
1329        struct vcpu_svm *svm = to_svm(vcpu);
1330        int i;
1331
1332        ++vcpu->stat.host_state_reload;
1333        kvm_load_ldt(svm->host.ldt);
1334#ifdef CONFIG_X86_64
1335        loadsegment(fs, svm->host.fs);
1336        wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1337        load_gs_index(svm->host.gs);
1338#else
1339#ifdef CONFIG_X86_32_LAZY_GS
1340        loadsegment(gs, svm->host.gs);
1341#endif
1342#endif
1343        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1344                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1345}
1346
1347static void svm_update_cpl(struct kvm_vcpu *vcpu)
1348{
1349        struct vcpu_svm *svm = to_svm(vcpu);
1350        int cpl;
1351
1352        if (!is_protmode(vcpu))
1353                cpl = 0;
1354        else if (svm->vmcb->save.rflags & X86_EFLAGS_VM)
1355                cpl = 3;
1356        else
1357                cpl = svm->vmcb->save.cs.selector & 0x3;
1358
1359        svm->vmcb->save.cpl = cpl;
1360}
1361
1362static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1363{
1364        return to_svm(vcpu)->vmcb->save.rflags;
1365}
1366
1367static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1368{
1369        unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags;
1370
1371        to_svm(vcpu)->vmcb->save.rflags = rflags;
1372        if ((old_rflags ^ rflags) & X86_EFLAGS_VM)
1373                svm_update_cpl(vcpu);
1374}
1375
1376static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1377{
1378        switch (reg) {
1379        case VCPU_EXREG_PDPTR:
1380                BUG_ON(!npt_enabled);
1381                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1382                break;
1383        default:
1384                BUG();
1385        }
1386}
1387
1388static void svm_set_vintr(struct vcpu_svm *svm)
1389{
1390        set_intercept(svm, INTERCEPT_VINTR);
1391}
1392
1393static void svm_clear_vintr(struct vcpu_svm *svm)
1394{
1395        clr_intercept(svm, INTERCEPT_VINTR);
1396}
1397
1398static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1399{
1400        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1401
1402        switch (seg) {
1403        case VCPU_SREG_CS: return &save->cs;
1404        case VCPU_SREG_DS: return &save->ds;
1405        case VCPU_SREG_ES: return &save->es;
1406        case VCPU_SREG_FS: return &save->fs;
1407        case VCPU_SREG_GS: return &save->gs;
1408        case VCPU_SREG_SS: return &save->ss;
1409        case VCPU_SREG_TR: return &save->tr;
1410        case VCPU_SREG_LDTR: return &save->ldtr;
1411        }
1412        BUG();
1413        return NULL;
1414}
1415
1416static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1417{
1418        struct vmcb_seg *s = svm_seg(vcpu, seg);
1419
1420        return s->base;
1421}
1422
1423static void svm_get_segment(struct kvm_vcpu *vcpu,
1424                            struct kvm_segment *var, int seg)
1425{
1426        struct vmcb_seg *s = svm_seg(vcpu, seg);
1427
1428        var->base = s->base;
1429        var->limit = s->limit;
1430        var->selector = s->selector;
1431        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1432        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1433        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1434        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1435        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1436        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1437        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1438        var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
1439
1440        /*
1441         * AMD's VMCB does not have an explicit unusable field, so emulate it
 1442         * for cross-vendor migration purposes by deriving it from "not present"
1443         */
1444        var->unusable = !var->present || (var->type == 0);
1445
1446        switch (seg) {
1447        case VCPU_SREG_CS:
1448                /*
1449                 * SVM always stores 0 for the 'G' bit in the CS selector in
1450                 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
1451                 * Intel's VMENTRY has a check on the 'G' bit.
1452                 */
1453                var->g = s->limit > 0xfffff;
1454                break;
1455        case VCPU_SREG_TR:
1456                /*
1457                 * Work around a bug where the busy flag in the tr selector
1458                 * isn't exposed
1459                 */
1460                var->type |= 0x2;
1461                break;
1462        case VCPU_SREG_DS:
1463        case VCPU_SREG_ES:
1464        case VCPU_SREG_FS:
1465        case VCPU_SREG_GS:
1466                /*
1467                 * The accessed bit must always be set in the segment
 1468                 * descriptor cache; although it can be cleared in the
 1469                 * in-memory descriptor, the cached bit remains 1. Since
1470                 * Intel has a check on this, set it here to support
1471                 * cross-vendor migration.
1472                 */
1473                if (!var->unusable)
1474                        var->type |= 0x1;
1475                break;
1476        case VCPU_SREG_SS:
1477                /*
1478                 * On AMD CPUs sometimes the DB bit in the segment
1479                 * descriptor is left as 1, although the whole segment has
1480                 * been made unusable. Clear it here to pass an Intel VMX
1481                 * entry check when cross vendor migrating.
1482                 */
1483                if (var->unusable)
1484                        var->db = 0;
1485                break;
1486        }
1487}
1488
1489static int svm_get_cpl(struct kvm_vcpu *vcpu)
1490{
1491        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1492
1493        return save->cpl;
1494}
1495
1496static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1497{
1498        struct vcpu_svm *svm = to_svm(vcpu);
1499
1500        dt->size = svm->vmcb->save.idtr.limit;
1501        dt->address = svm->vmcb->save.idtr.base;
1502}
1503
1504static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1505{
1506        struct vcpu_svm *svm = to_svm(vcpu);
1507
1508        svm->vmcb->save.idtr.limit = dt->size;
 1509        svm->vmcb->save.idtr.base = dt->address;
1510        mark_dirty(svm->vmcb, VMCB_DT);
1511}
1512
1513static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1514{
1515        struct vcpu_svm *svm = to_svm(vcpu);
1516
1517        dt->size = svm->vmcb->save.gdtr.limit;
1518        dt->address = svm->vmcb->save.gdtr.base;
1519}
1520
1521static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1522{
1523        struct vcpu_svm *svm = to_svm(vcpu);
1524
1525        svm->vmcb->save.gdtr.limit = dt->size;
 1526        svm->vmcb->save.gdtr.base = dt->address;
1527        mark_dirty(svm->vmcb, VMCB_DT);
1528}
1529
1530static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1531{
1532}
1533
1534static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1535{
1536}
1537
1538static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1539{
1540}
1541
1542static void update_cr0_intercept(struct vcpu_svm *svm)
1543{
1544        ulong gcr0 = svm->vcpu.arch.cr0;
1545        u64 *hcr0 = &svm->vmcb->save.cr0;
1546
1547        if (!svm->vcpu.fpu_active)
1548                *hcr0 |= SVM_CR0_SELECTIVE_MASK;
1549        else
1550                *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1551                        | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1552
1553        mark_dirty(svm->vmcb, VMCB_CR);
1554
1555        if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1556                clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1557                clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1558        } else {
1559                set_cr_intercept(svm, INTERCEPT_CR0_READ);
1560                set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1561        }
1562}
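
     /*
      * If the guest-visible CR0 matches what the hardware will actually use
      * and the guest owns the FPU, nothing needs to be hidden, so the CR0
      * read/write intercepts can be dropped; otherwise they stay enabled so
      * KVM can emulate the differing bits (e.g. a virtual CR0.TS).
      */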
1563
1564static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1565{
1566        struct vcpu_svm *svm = to_svm(vcpu);
1567
1568#ifdef CONFIG_X86_64
1569        if (vcpu->arch.efer & EFER_LME) {
1570                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1571                        vcpu->arch.efer |= EFER_LMA;
1572                        svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1573                }
1574
1575                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1576                        vcpu->arch.efer &= ~EFER_LMA;
1577                        svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1578                }
1579        }
1580#endif
1581        vcpu->arch.cr0 = cr0;
1582
1583        if (!npt_enabled)
1584                cr0 |= X86_CR0_PG | X86_CR0_WP;
1585
1586        if (!vcpu->fpu_active)
1587                cr0 |= X86_CR0_TS;
1588        /*
 1589         * re-enable caching here because the QEMU BIOS
 1590         * does not do it - leaving caching disabled results
 1591         * in some delay at reboot
1592         */
1593        cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1594        svm->vmcb->save.cr0 = cr0;
1595        mark_dirty(svm->vmcb, VMCB_CR);
1596        update_cr0_intercept(svm);
1597}
1598
1599static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1600{
1601        unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
1602        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1603
1604        if (cr4 & X86_CR4_VMXE)
1605                return 1;
1606
1607        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1608                svm_flush_tlb(vcpu);
1609
1610        vcpu->arch.cr4 = cr4;
1611        if (!npt_enabled)
1612                cr4 |= X86_CR4_PAE;
1613        cr4 |= host_cr4_mce;
1614        to_svm(vcpu)->vmcb->save.cr4 = cr4;
1615        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1616        return 0;
1617}
1618
1619static void svm_set_segment(struct kvm_vcpu *vcpu,
1620                            struct kvm_segment *var, int seg)
1621{
1622        struct vcpu_svm *svm = to_svm(vcpu);
1623        struct vmcb_seg *s = svm_seg(vcpu, seg);
1624
1625        s->base = var->base;
1626        s->limit = var->limit;
1627        s->selector = var->selector;
1628        if (var->unusable)
1629                s->attrib = 0;
1630        else {
1631                s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1632                s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1633                s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1634                s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
1635                s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1636                s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1637                s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1638                s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1639        }
1640        if (seg == VCPU_SREG_CS)
1641                svm_update_cpl(vcpu);
1642
1643        mark_dirty(svm->vmcb, VMCB_SEG);
1644}
1645
1646static void update_db_intercept(struct kvm_vcpu *vcpu)
1647{
1648        struct vcpu_svm *svm = to_svm(vcpu);
1649
1650        clr_exception_intercept(svm, DB_VECTOR);
1651        clr_exception_intercept(svm, BP_VECTOR);
1652
1653        if (svm->nmi_singlestep)
1654                set_exception_intercept(svm, DB_VECTOR);
1655
1656        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1657                if (vcpu->guest_debug &
1658                    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1659                        set_exception_intercept(svm, DB_VECTOR);
1660                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1661                        set_exception_intercept(svm, BP_VECTOR);
1662        } else
1663                vcpu->guest_debug = 0;
1664}
1665
1666static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1667{
1668        struct vcpu_svm *svm = to_svm(vcpu);
1669
1670        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1671                svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1672        else
1673                svm->vmcb->save.dr7 = vcpu->arch.dr7;
1674
1675        mark_dirty(svm->vmcb, VMCB_DR);
1676
1677        update_db_intercept(vcpu);
1678}
1679
1680static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1681{
1682        if (sd->next_asid > sd->max_asid) {
1683                ++sd->asid_generation;
1684                sd->next_asid = 1;
1685                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1686        }
1687
1688        svm->asid_generation = sd->asid_generation;
1689        svm->vmcb->control.asid = sd->next_asid++;
1690
1691        mark_dirty(svm->vmcb, VMCB_ASID);
1692}
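
     /*
      * ASIDs are handed out per physical CPU: when the pool is exhausted the
      * generation counter is bumped, the TLB is flushed for all ASIDs, and
      * numbering restarts at 1.  svm_vcpu_load() resets asid_generation when
      * a vCPU moves to another CPU, so it will pick up a fresh ASID there.
      */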
1693
1694static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1695{
1696        struct vcpu_svm *svm = to_svm(vcpu);
1697
1698        svm->vmcb->save.dr7 = value;
1699        mark_dirty(svm->vmcb, VMCB_DR);
1700}
1701
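    /*
     * #PF intercept. svm->apf_reason distinguishes a regular guest page
     * fault (default case) from an async page fault notification, where
     * exit_info_2 carries the async-PF token rather than a fault address.
     */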
1702static int pf_interception(struct vcpu_svm *svm)
1703{
1704        u64 fault_address = svm->vmcb->control.exit_info_2;
1705        u32 error_code;
1706        int r = 1;
1707
1708        switch (svm->apf_reason) {
1709        default:
1710                error_code = svm->vmcb->control.exit_info_1;
1711
1712                trace_kvm_page_fault(fault_address, error_code);
1713                if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1714                        kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1715                r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1716                        svm->vmcb->control.insn_bytes,
1717                        svm->vmcb->control.insn_len);
1718                break;
1719        case KVM_PV_REASON_PAGE_NOT_PRESENT:
1720                svm->apf_reason = 0;
1721                local_irq_disable();
1722                kvm_async_pf_task_wait(fault_address);
1723                local_irq_enable();
1724                break;
1725        case KVM_PV_REASON_PAGE_READY:
1726                svm->apf_reason = 0;
1727                local_irq_disable();
1728                kvm_async_pf_task_wake(fault_address);
1729                local_irq_enable();
1730                break;
1731        }
1732        return r;
1733}
1734
1735static int db_interception(struct vcpu_svm *svm)
1736{
1737        struct kvm_run *kvm_run = svm->vcpu.run;
1738
1739        if (!(svm->vcpu.guest_debug &
1740              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1741                !svm->nmi_singlestep) {
1742                kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1743                return 1;
1744        }
1745
1746        if (svm->nmi_singlestep) {
1747                svm->nmi_singlestep = false;
1748                if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1749                        svm->vmcb->save.rflags &=
1750                                ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1751                update_db_intercept(&svm->vcpu);
1752        }
1753
1754        if (svm->vcpu.guest_debug &
1755            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1756                kvm_run->exit_reason = KVM_EXIT_DEBUG;
1757                kvm_run->debug.arch.pc =
1758                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1759                kvm_run->debug.arch.exception = DB_VECTOR;
1760                return 0;
1761        }
1762
1763        return 1;
1764}
1765
1766static int bp_interception(struct vcpu_svm *svm)
1767{
1768        struct kvm_run *kvm_run = svm->vcpu.run;
1769
1770        kvm_run->exit_reason = KVM_EXIT_DEBUG;
1771        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1772        kvm_run->debug.arch.exception = BP_VECTOR;
1773        return 0;
1774}
1775
1776static int ud_interception(struct vcpu_svm *svm)
1777{
1778        int er;
1779
1780        er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1781        if (er != EMULATE_DONE)
1782                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1783        return 1;
1784}
1785
1786static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1787{
1788        struct vcpu_svm *svm = to_svm(vcpu);
1789
1790        clr_exception_intercept(svm, NM_VECTOR);
1791
1792        svm->vcpu.fpu_active = 1;
1793        update_cr0_intercept(svm);
1794}
1795
1796static int nm_interception(struct vcpu_svm *svm)
1797{
1798        svm_fpu_activate(&svm->vcpu);
1799        return 1;
1800}
1801
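    /*
     * Check whether the machine check was caused by AMD erratum 383: look
     * for the known MC0_STATUS signature and, if it matches, clear the MC
     * status registers and flush the TLB to evict multi-match entries.
     */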
1802static bool is_erratum_383(void)
1803{
1804        int err, i;
1805        u64 value;
1806
1807        if (!erratum_383_found)
1808                return false;
1809
1810        value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1811        if (err)
1812                return false;
1813
1814        /* Bit 62 may or may not be set for this mce */
1815        value &= ~(1ULL << 62);
1816
1817        if (value != 0xb600000000010015ULL)
1818                return false;
1819
1820        /* Clear MCi_STATUS registers */
1821        for (i = 0; i < 6; ++i)
1822                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1823
1824        value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1825        if (!err) {
1826                u32 low, high;
1827
1828                value &= ~(1ULL << 2);
1829                low    = lower_32_bits(value);
1830                high   = upper_32_bits(value);
1831
1832                native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1833        }
1834
1835        /* Flush tlb to evict multi-match entries */
1836        __flush_tlb_all();
1837
1838        return true;
1839}
1840
1841static void svm_handle_mce(struct vcpu_svm *svm)
1842{
1843        if (is_erratum_383()) {
1844                /*
1845                 * Erratum 383 triggered. Guest state is corrupt so kill the
1846                 * guest.
1847                 */
1848                pr_err("KVM: Guest triggered AMD Erratum 383\n");
1849
1850                kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1851
1852                return;
1853        }
1854
1855        /*
1856         * On an #MC intercept the MCE handler is not called automatically in
1857         * the host. So do it by hand here.
1858         */
1859        asm volatile (
1860                "int $0x12\n");
1861        /* not sure if we ever come back to this point */
1862
1863        return;
1864}
1865
1866static int mc_interception(struct vcpu_svm *svm)
1867{
1868        return 1;
1869}
1870
1871static int shutdown_interception(struct vcpu_svm *svm)
1872{
1873        struct kvm_run *kvm_run = svm->vcpu.run;
1874
1875        /*
1876         * VMCB is undefined after a SHUTDOWN intercept
1877         * so reinitialize it.
1878         */
1879        clear_page(svm->vmcb);
1880        init_vmcb(svm);
1881
1882        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1883        return 0;
1884}
1885
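    /*
     * IOIO intercept. exit_info_1 encodes the port, access size, direction
     * and string/rep flags; string and IN accesses go through the emulator,
     * while plain OUTs take the fast pio path.
     */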
1886static int io_interception(struct vcpu_svm *svm)
1887{
1888        struct kvm_vcpu *vcpu = &svm->vcpu;
1889        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1890        int size, in, string;
1891        unsigned port;
1892
1893        ++svm->vcpu.stat.io_exits;
1894        string = (io_info & SVM_IOIO_STR_MASK) != 0;
1895        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1896        if (string || in)
1897                return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1898
1899        port = io_info >> 16;
1900        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1901        svm->next_rip = svm->vmcb->control.exit_info_2;
1902        skip_emulated_instruction(&svm->vcpu);
1903
1904        return kvm_fast_pio_out(vcpu, size, port);
1905}
1906
1907static int nmi_interception(struct vcpu_svm *svm)
1908{
1909        return 1;
1910}
1911
1912static int intr_interception(struct vcpu_svm *svm)
1913{
1914        ++svm->vcpu.stat.irq_exits;
1915        return 1;
1916}
1917
1918static int nop_on_interception(struct vcpu_svm *svm)
1919{
1920        return 1;
1921}
1922
1923static int halt_interception(struct vcpu_svm *svm)
1924{
1925        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1926        skip_emulated_instruction(&svm->vcpu);
1927        return kvm_emulate_halt(&svm->vcpu);
1928}
1929
1930static int vmmcall_interception(struct vcpu_svm *svm)
1931{
1932        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1933        skip_emulated_instruction(&svm->vcpu);
1934        kvm_emulate_hypercall(&svm->vcpu);
1935        return 1;
1936}
1937
1938static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1939{
1940        struct vcpu_svm *svm = to_svm(vcpu);
1941
1942        return svm->nested.nested_cr3;
1943}
1944
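    /*
     * Fetch a PDPTE from the L1-provided nested page table (nested_cr3);
     * used by the nested-NPT MMU when that table is in PAE format.
     */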
1945static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
1946{
1947        struct vcpu_svm *svm = to_svm(vcpu);
1948        u64 cr3 = svm->nested.nested_cr3;
1949        u64 pdpte;
1950        int ret;
1951
1952        ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
1953                                  offset_in_page(cr3) + index * 8, 8);
1954        if (ret)
1955                return 0;
1956        return pdpte;
1957}
1958
1959static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1960                                   unsigned long root)
1961{
1962        struct vcpu_svm *svm = to_svm(vcpu);
1963
1964        svm->vmcb->control.nested_cr3 = root;
1965        mark_dirty(svm->vmcb, VMCB_NPT);
1966        svm_flush_tlb(vcpu);
1967}
1968
1969static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1970                                       struct x86_exception *fault)
1971{
1972        struct vcpu_svm *svm = to_svm(vcpu);
1973
1974        svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1975        svm->vmcb->control.exit_code_hi = 0;
1976        svm->vmcb->control.exit_info_1 = fault->error_code;
1977        svm->vmcb->control.exit_info_2 = fault->address;
1978
1979        nested_svm_vmexit(svm);
1980}
1981
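    /*
     * Set up the MMU for nested NPT: the shadow MMU walks the nested page
     * table provided by L1 (nested_cr3), and faults hit during those walks
     * are reflected to L1 as SVM_EXIT_NPF.
     */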
1982static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1983{
1984        int r;
1985
1986        r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1987
1988        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
1989        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
1990        vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
1991        vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1992        vcpu->arch.mmu.shadow_root_level = get_npt_level();
1993        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
1994
1995        return r;
1996}
1997
1998static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1999{
2000        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
2001}
2002
2003static int nested_svm_check_permissions(struct vcpu_svm *svm)
2004{
2005        if (!(svm->vcpu.arch.efer & EFER_SVME)
2006            || !is_paging(&svm->vcpu)) {
2007                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2008                return 1;
2009        }
2010
2011        if (svm->vmcb->save.cpl) {
2012                kvm_inject_gp(&svm->vcpu, 0);
2013                return 1;
2014        }
2015
2016        return 0;
2017}
2018
2019static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2020                                      bool has_error_code, u32 error_code)
2021{
2022        int vmexit;
2023
2024        if (!is_guest_mode(&svm->vcpu))
2025                return 0;
2026
2027        svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
2028        svm->vmcb->control.exit_code_hi = 0;
2029        svm->vmcb->control.exit_info_1 = error_code;
2030        svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
2031
2032        vmexit = nested_svm_intercept(svm);
2033        if (vmexit == NESTED_EXIT_DONE)
2034                svm->nested.exit_required = true;
2035
2036        return vmexit;
2037}
2038
2039/* This function returns true if it is safe to enable the irq window */
2040static inline bool nested_svm_intr(struct vcpu_svm *svm)
2041{
2042        if (!is_guest_mode(&svm->vcpu))
2043                return true;
2044
2045        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2046                return true;
2047
2048        if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
2049                return false;
2050
2051        /*
2052         * if vmexit was already requested (by intercepted exception
2053         * for instance) do not overwrite it with "external interrupt"
2054         * vmexit.
2055         */
2056        if (svm->nested.exit_required)
2057                return false;
2058
2059        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2060        svm->vmcb->control.exit_info_1 = 0;
2061        svm->vmcb->control.exit_info_2 = 0;
2062
2063        if (svm->nested.intercept & 1ULL) {
2064                /*
2065                 * The #vmexit can't be emulated here directly because this
2066                 * code path runs with irqs and preemption disabled. A
2067                 * #vmexit emulation might sleep. Only signal a request for
2068                 * the #vmexit here.
2069                 */
2070                svm->nested.exit_required = true;
2071                trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2072                return false;
2073        }
2074
2075        return true;
2076}
2077
2078/* This function returns true if it is safe to enable the nmi window */
2079static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2080{
2081        if (!is_guest_mode(&svm->vcpu))
2082                return true;
2083
2084        if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2085                return true;
2086
2087        svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2088        svm->nested.exit_required = true;
2089
2090        return false;
2091}
2092
2093static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2094{
2095        struct page *page;
2096
2097        might_sleep();
2098
2099        page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
2100        if (is_error_page(page))
2101                goto error;
2102
2103        *_page = page;
2104
2105        return kmap(page);
2106
2107error:
2108        kvm_release_page_clean(page);
2109        kvm_inject_gp(&svm->vcpu, 0);
2110
2111        return NULL;
2112}
2113
2114static void nested_svm_unmap(struct page *page)
2115{
2116        kunmap(page);
2117        kvm_release_page_dirty(page);
2118}
2119
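    /*
     * Consult L1's I/O permission bitmap for the intercepted access: one bit
     * per port, so the byte at vmcb_iopm + port/8 contains the bit for the
     * accessed port.
     */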
2120static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2121{
2122        unsigned port;
2123        u8 val, bit;
2124        u64 gpa;
2125
2126        if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2127                return NESTED_EXIT_HOST;
2128
2129        port = svm->vmcb->control.exit_info_1 >> 16;
2130        gpa  = svm->nested.vmcb_iopm + (port / 8);
2131        bit  = port % 8;
2132        val  = 0;
2133
2134        if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
2135                return NESTED_EXIT_DONE;
2136
2137        return (val & (1 << bit)) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2138}
2139
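    /*
     * Consult L1's MSR permission bitmap for the intercepted RDMSR/WRMSR.
     * The bitmap uses two bits per MSR (read, then write), so within the u32
     * at the offset returned by svm_msrpm_offset() the bit to test is
     * 2 * (msr & 0xf) + write; e.g. a write to an MSR with (msr & 0xf) == 2
     * tests bit 5.
     */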
2140static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2141{
2142        u32 offset, msr, value;
2143        int write, mask;
2144
2145        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2146                return NESTED_EXIT_HOST;
2147
2148        msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2149        offset = svm_msrpm_offset(msr);
2150        write  = svm->vmcb->control.exit_info_1 & 1;
2151        mask   = 1 << ((2 * (msr & 0xf)) + write);
2152
2153        if (offset == MSR_INVALID)
2154                return NESTED_EXIT_DONE;
2155
2156        /* Offset is in 32 bit units but we need it in 8 bit units */
2157        offset *= 4;
2158
2159        if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
2160                return NESTED_EXIT_DONE;
2161
2162        return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2163}
2164
2165static int nested_svm_exit_special(struct vcpu_svm *svm)
2166{
2167        u32 exit_code = svm->vmcb->control.exit_code;
2168
2169        switch (exit_code) {
2170        case SVM_EXIT_INTR:
2171        case SVM_EXIT_NMI:
2172        case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2173                return NESTED_EXIT_HOST;
2174        case SVM_EXIT_NPF:
2175                /* For now we are always handling NPFs when using them */
2176                if (npt_enabled)
2177                        return NESTED_EXIT_HOST;
2178                break;
2179        case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2180                /* When we're shadowing, trap PFs, but not async PF */
2181                if (!npt_enabled && svm->apf_reason == 0)
2182                        return NESTED_EXIT_HOST;
2183                break;
2184        case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2185                nm_interception(svm);
2186                break;
2187        default:
2188                break;
2189        }
2190
2191        return NESTED_EXIT_CONTINUE;
2192}
2193
2194/*
2195 * Returns NESTED_EXIT_DONE if this #vmexit must be reflected to the L1 guest
2196 */
2197static int nested_svm_intercept(struct vcpu_svm *svm)
2198{
2199        u32 exit_code = svm->vmcb->control.exit_code;
2200        int vmexit = NESTED_EXIT_HOST;
2201
2202        switch (exit_code) {
2203        case SVM_EXIT_MSR:
2204                vmexit = nested_svm_exit_handled_msr(svm);
2205                break;
2206        case SVM_EXIT_IOIO:
2207                vmexit = nested_svm_intercept_ioio(svm);
2208                break;
2209        case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2210                u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2211                if (svm->nested.intercept_cr & bit)
2212                        vmexit = NESTED_EXIT_DONE;
2213                break;
2214        }
2215        case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2216                u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2217                if (svm->nested.intercept_dr & bit)
2218                        vmexit = NESTED_EXIT_DONE;
2219                break;
2220        }
2221        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2222                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2223                if (svm->nested.intercept_exceptions & excp_bits)
2224                        vmexit = NESTED_EXIT_DONE;
2225                /* async page fault always causes a vmexit */
2226                else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2227                         svm->apf_reason != 0)
2228                        vmexit = NESTED_EXIT_DONE;
2229                break;
2230        }
2231        case SVM_EXIT_ERR: {
2232                vmexit = NESTED_EXIT_DONE;
2233                break;
2234        }
2235        default: {
2236                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2237                if (svm->nested.intercept & exit_bits)
2238                        vmexit = NESTED_EXIT_DONE;
2239        }
2240        }
2241
2242        return vmexit;
2243}
2244
2245static int nested_svm_exit_handled(struct vcpu_svm *svm)
2246{
2247        int vmexit;
2248
2249        vmexit = nested_svm_intercept(svm);
2250
2251        if (vmexit == NESTED_EXIT_DONE)
2252                nested_svm_vmexit(svm);
2253
2254        return vmexit;
2255}
2256
2257static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2258{
2259        struct vmcb_control_area *dst  = &dst_vmcb->control;
2260        struct vmcb_control_area *from = &from_vmcb->control;
2261
2262        dst->intercept_cr         = from->intercept_cr;
2263        dst->intercept_dr         = from->intercept_dr;
2264        dst->intercept_exceptions = from->intercept_exceptions;
2265        dst->intercept            = from->intercept;
2266        dst->iopm_base_pa         = from->iopm_base_pa;
2267        dst->msrpm_base_pa        = from->msrpm_base_pa;
2268        dst->tsc_offset           = from->tsc_offset;
2269        dst->asid                 = from->asid;
2270        dst->tlb_ctl              = from->tlb_ctl;
2271        dst->int_ctl              = from->int_ctl;
2272        dst->int_vector           = from->int_vector;
2273        dst->int_state            = from->int_state;
2274        dst->exit_code            = from->exit_code;
2275        dst->exit_code_hi         = from->exit_code_hi;
2276        dst->exit_info_1          = from->exit_info_1;
2277        dst->exit_info_2          = from->exit_info_2;
2278        dst->exit_int_info        = from->exit_int_info;
2279        dst->exit_int_info_err    = from->exit_int_info_err;
2280        dst->nested_ctl           = from->nested_ctl;
2281        dst->event_inj            = from->event_inj;
2282        dst->event_inj_err        = from->event_inj_err;
2283        dst->nested_cr3           = from->nested_cr3;
2284        dst->lbr_ctl              = from->lbr_ctl;
2285}
2286
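    /*
     * Emulate a #VMEXIT from L2 to L1: copy the exit state of the current
     * vmcb into the guest's nested vmcb, then restore the L1 state that was
     * stashed in hsave by nested_svm_vmrun().
     */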
2287static int nested_svm_vmexit(struct vcpu_svm *svm)
2288{
2289        struct vmcb *nested_vmcb;
2290        struct vmcb *hsave = svm->nested.hsave;
2291        struct vmcb *vmcb = svm->vmcb;
2292        struct page *page;
2293
2294        trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2295                                       vmcb->control.exit_info_1,
2296                                       vmcb->control.exit_info_2,
2297                                       vmcb->control.exit_int_info,
2298                                       vmcb->control.exit_int_info_err,
2299                                       KVM_ISA_SVM);
2300
2301        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2302        if (!nested_vmcb)
2303                return 1;
2304
2305        /* Exit Guest-Mode */
2306        leave_guest_mode(&svm->vcpu);
2307        svm->nested.vmcb = 0;
2308
2309        /* Give the current vmcb to the guest */
2310        disable_gif(svm);
2311
2312        nested_vmcb->save.es     = vmcb->save.es;
2313        nested_vmcb->save.cs     = vmcb->save.cs;
2314        nested_vmcb->save.ss     = vmcb->save.ss;
2315        nested_vmcb->save.ds     = vmcb->save.ds;
2316        nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2317        nested_vmcb->save.idtr   = vmcb->save.idtr;
2318        nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2319        nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2320        nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2321        nested_vmcb->save.cr2    = vmcb->save.cr2;
2322        nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2323        nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2324        nested_vmcb->save.rip    = vmcb->save.rip;
2325        nested_vmcb->save.rsp    = vmcb->save.rsp;
2326        nested_vmcb->save.rax    = vmcb->save.rax;
2327        nested_vmcb->save.dr7    = vmcb->save.dr7;
2328        nested_vmcb->save.dr6    = vmcb->save.dr6;
2329        nested_vmcb->save.cpl    = vmcb->save.cpl;
2330
2331        nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2332        nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2333        nested_vmcb->control.int_state         = vmcb->control.int_state;
2334        nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2335        nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2336        nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2337        nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2338        nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2339        nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2340        nested_vmcb->control.next_rip          = vmcb->control.next_rip;
2341
2342        /*
2343         * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2344         * to make sure that we do not lose injected events. So check event_inj
2345         * here and copy it to exit_int_info if it is valid.
2346         * Exit_int_info and event_inj can't be both valid because the case
2347         * below only happens on a VMRUN instruction intercept which has
2348         * no valid exit_int_info set.
2349         */
2350        if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2351                struct vmcb_control_area *nc = &nested_vmcb->control;
2352
2353                nc->exit_int_info     = vmcb->control.event_inj;
2354                nc->exit_int_info_err = vmcb->control.event_inj_err;
2355        }
2356
2357        nested_vmcb->control.tlb_ctl           = 0;
2358        nested_vmcb->control.event_inj         = 0;
2359        nested_vmcb->control.event_inj_err     = 0;
2360
2361        /* We always set V_INTR_MASKING and remember the old value in hflags */
2362        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2363                nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2364
2365        /* Restore the original control entries */
2366        copy_vmcb_control_area(vmcb, hsave);
2367
2368        kvm_clear_exception_queue(&svm->vcpu);
2369        kvm_clear_interrupt_queue(&svm->vcpu);
2370
2371        svm->nested.nested_cr3 = 0;
2372
2373        /* Restore selected save entries */
2374        svm->vmcb->save.es = hsave->save.es;
2375        svm->vmcb->save.cs = hsave->save.cs;
2376        svm->vmcb->save.ss = hsave->save.ss;
2377        svm->vmcb->save.ds = hsave->save.ds;
2378        svm->vmcb->save.gdtr = hsave->save.gdtr;
2379        svm->vmcb->save.idtr = hsave->save.idtr;
2380        kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2381        svm_set_efer(&svm->vcpu, hsave->save.efer);
2382        svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2383        svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2384        if (npt_enabled) {
2385                svm->vmcb->save.cr3 = hsave->save.cr3;
2386                svm->vcpu.arch.cr3 = hsave->save.cr3;
2387        } else {
2388                (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2389        }
2390        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2391        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2392        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2393        svm->vmcb->save.dr7 = 0;
2394        svm->vmcb->save.cpl = 0;
2395        svm->vmcb->control.exit_int_info = 0;
2396
2397        mark_all_dirty(svm->vmcb);
2398
2399        nested_svm_unmap(page);
2400
2401        nested_svm_uninit_mmu_context(&svm->vcpu);
2402        kvm_mmu_reset_context(&svm->vcpu);
2403        kvm_mmu_load(&svm->vcpu);
2404
2405        return 0;
2406}
2407
2408static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2409{
2410        /*
2411         * This function merges the msr permission bitmaps of kvm and the
2412         * nested vmcb. It is optimized in that it only merges the parts where
2413         * the kvm msr permission bitmap may contain zero bits
2414         */
2415        int i;
2416
2417        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2418                return true;
2419
2420        for (i = 0; i < MSRPM_OFFSETS; i++) {
2421                u32 value, p;
2422                u64 offset;
2423
2424                if (msrpm_offsets[i] == 0xffffffff)
2425                        break;
2426
2427                p      = msrpm_offsets[i];
2428                offset = svm->nested.vmcb_msrpm + (p * 4);
2429
2430                if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
2431                        return false;
2432
2433                svm->nested.msrpm[p] = svm->msrpm[p] | value;
2434        }
2435
2436        svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2437
2438        return true;
2439}
2440
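    /*
     * Minimal consistency checks on the vmcb provided by L1: VMRUN must be
     * intercepted, the ASID must be non-zero, and nested paging may only be
     * requested when the host itself uses NPT.
     */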
2441static bool nested_vmcb_checks(struct vmcb *vmcb)
2442{
2443        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2444                return false;
2445
2446        if (vmcb->control.asid == 0)
2447                return false;
2448
2449        if (vmcb->control.nested_ctl && !npt_enabled)
2450                return false;
2451
2452        return true;
2453}
2454
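    /*
     * Emulate VMRUN: stash the current (L1) state in hsave, load guest state
     * and controls from the vmcb at RAX and enter guest mode. Returns false
     * if the nested vmcb cannot be mapped or fails its consistency checks.
     */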
2455static bool nested_svm_vmrun(struct vcpu_svm *svm)
2456{
2457        struct vmcb *nested_vmcb;
2458        struct vmcb *hsave = svm->nested.hsave;
2459        struct vmcb *vmcb = svm->vmcb;
2460        struct page *page;
2461        u64 vmcb_gpa;
2462
2463        vmcb_gpa = svm->vmcb->save.rax;
2464
2465        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2466        if (!nested_vmcb)
2467                return false;
2468
2469        if (!nested_vmcb_checks(nested_vmcb)) {
2470                nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2471                nested_vmcb->control.exit_code_hi = 0;
2472                nested_vmcb->control.exit_info_1  = 0;
2473                nested_vmcb->control.exit_info_2  = 0;
2474
2475                nested_svm_unmap(page);
2476
2477                return false;
2478        }
2479
2480        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2481                               nested_vmcb->save.rip,
2482                               nested_vmcb->control.int_ctl,
2483                               nested_vmcb->control.event_inj,
2484                               nested_vmcb->control.nested_ctl);
2485
2486        trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2487                                    nested_vmcb->control.intercept_cr >> 16,
2488                                    nested_vmcb->control.intercept_exceptions,
2489                                    nested_vmcb->control.intercept);
2490
2491        /* Clear internal status */
2492        kvm_clear_exception_queue(&svm->vcpu);
2493        kvm_clear_interrupt_queue(&svm->vcpu);
2494
2495        /*
2496         * Save the old vmcb, so we don't need to pick what to save; we can
2497         * restore everything when a VMEXIT occurs
2498         */
2499        hsave->save.es     = vmcb->save.es;
2500        hsave->save.cs     = vmcb->save.cs;
2501        hsave->save.ss     = vmcb->save.ss;
2502        hsave->save.ds     = vmcb->save.ds;
2503        hsave->save.gdtr   = vmcb->save.gdtr;
2504        hsave->save.idtr   = vmcb->save.idtr;
2505        hsave->save.efer   = svm->vcpu.arch.efer;
2506        hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2507        hsave->save.cr4    = svm->vcpu.arch.cr4;
2508        hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2509        hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2510        hsave->save.rsp    = vmcb->save.rsp;
2511        hsave->save.rax    = vmcb->save.rax;
2512        if (npt_enabled)
2513                hsave->save.cr3    = vmcb->save.cr3;
2514        else
2515                hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2516
2517        copy_vmcb_control_area(hsave, vmcb);
2518
2519        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2520                svm->vcpu.arch.hflags |= HF_HIF_MASK;
2521        else
2522                svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2523
2524        if (nested_vmcb->control.nested_ctl) {
2525                kvm_mmu_unload(&svm->vcpu);
2526                svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2527                nested_svm_init_mmu_context(&svm->vcpu);
2528        }
2529
2530        /* Load the nested guest state */
2531        svm->vmcb->save.es = nested_vmcb->save.es;
2532        svm->vmcb->save.cs = nested_vmcb->save.cs;
2533        svm->vmcb->save.ss = nested_vmcb->save.ss;
2534        svm->vmcb->save.ds = nested_vmcb->save.ds;
2535        svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2536        svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2537        kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2538        svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2539        svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2540        svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2541        if (npt_enabled) {
2542                svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2543                svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2544        } else
2545                (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2546
2547        /* Guest paging mode is active - reset mmu */
2548        kvm_mmu_reset_context(&svm->vcpu);
2549
2550        svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2551        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2552        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2553        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2554
2555        /* In case we don't even reach vcpu_run, the fields are not updated */
2556        svm->vmcb->save.rax = nested_vmcb->save.rax;
2557        svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2558        svm->vmcb->save.rip = nested_vmcb->save.rip;
2559        svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2560        svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2561        svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2562
2563        svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2564        svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2565
2566        /* cache intercepts */
2567        svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2568        svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2569        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2570        svm->nested.intercept            = nested_vmcb->control.intercept;
2571
2572        svm_flush_tlb(&svm->vcpu);
2573        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2574        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2575                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2576        else
2577                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2578
2579        if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2580                /* We only want the cr8 intercept bits of the guest */
2581                clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2582                clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2583        }
2584
2585        /* We don't want to see VMMCALLs from a nested guest */
2586        clr_intercept(svm, INTERCEPT_VMMCALL);
2587
2588        svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2589        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2590        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2591        svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2592        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2593        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2594
2595        nested_svm_unmap(page);
2596
2597        /* Enter Guest-Mode */
2598        enter_guest_mode(&svm->vcpu);
2599
2600        /*
2601         * Merge guest and host intercepts - must be called with vcpu in
2602         * guest-mode to take effect here
2603         */
2604        recalc_intercepts(svm);
2605
2606        svm->nested.vmcb = vmcb_gpa;
2607
2608        enable_gif(svm);
2609
2610        mark_all_dirty(svm->vmcb);
2611
2612        return true;
2613}
2614
2615static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2616{
2617        to_vmcb->save.fs = from_vmcb->save.fs;
2618        to_vmcb->save.gs = from_vmcb->save.gs;
2619        to_vmcb->save.tr = from_vmcb->save.tr;
2620        to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2621        to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2622        to_vmcb->save.star = from_vmcb->save.star;
2623        to_vmcb->save.lstar = from_vmcb->save.lstar;
2624        to_vmcb->save.cstar = from_vmcb->save.cstar;
2625        to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2626        to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2627        to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2628        to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2629}
2630
2631static int vmload_interception(struct vcpu_svm *svm)
2632{
2633        struct vmcb *nested_vmcb;
2634        struct page *page;
2635
2636        if (nested_svm_check_permissions(svm))
2637                return 1;
2638
2639        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2640        if (!nested_vmcb)
2641                return 1;
2642
2643        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2644        skip_emulated_instruction(&svm->vcpu);
2645
2646        nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2647        nested_svm_unmap(page);
2648
2649        return 1;
2650}
2651
2652static int vmsave_interception(struct vcpu_svm *svm)
2653{
2654        struct vmcb *nested_vmcb;
2655        struct page *page;
2656
2657        if (nested_svm_check_permissions(svm))
2658                return 1;
2659
2660        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2661        if (!nested_vmcb)
2662                return 1;
2663
2664        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2665        skip_emulated_instruction(&svm->vcpu);
2666
2667        nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2668        nested_svm_unmap(page);
2669
2670        return 1;
2671}
2672
2673static int vmrun_interception(struct vcpu_svm *svm)
2674{
2675        if (nested_svm_check_permissions(svm))
2676                return 1;
2677
2678        /* Save rip after vmrun instruction */
2679        kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2680
2681        if (!nested_svm_vmrun(svm))
2682                return 1;
2683
2684        if (!nested_svm_vmrun_msrpm(svm))
2685                goto failed;
2686
2687        return 1;
2688
2689failed:
2690
2691        svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
2692        svm->vmcb->control.exit_code_hi = 0;
2693        svm->vmcb->control.exit_info_1  = 0;
2694        svm->vmcb->control.exit_info_2  = 0;
2695
2696        nested_svm_vmexit(svm);
2697
2698        return 1;
2699}
2700
2701static int stgi_interception(struct vcpu_svm *svm)
2702{
2703        if (nested_svm_check_permissions(svm))
2704                return 1;
2705
2706        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2707        skip_emulated_instruction(&svm->vcpu);
2708        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2709
2710        enable_gif(svm);
2711
2712        return 1;
2713}
2714
2715static int clgi_interception(struct vcpu_svm *svm)
2716{
2717        if (nested_svm_check_permissions(svm))
2718                return 1;
2719
2720        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2721        skip_emulated_instruction(&svm->vcpu);
2722
2723        disable_gif(svm);
2724
2725        /* After a CLGI no interrupts should come */
2726        svm_clear_vintr(svm);
2727        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2728
2729        mark_dirty(svm->vmcb, VMCB_INTR);
2730
2731        return 1;
2732}
2733
2734static int invlpga_interception(struct vcpu_svm *svm)
2735{
2736        struct kvm_vcpu *vcpu = &svm->vcpu;
2737
2738        trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
2739                          vcpu->arch.regs[VCPU_REGS_RAX]);
2740
2741        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2742        kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
2743
2744        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2745        skip_emulated_instruction(&svm->vcpu);
2746        return 1;
2747}
2748
2749static int skinit_interception(struct vcpu_svm *svm)
2750{
2751        trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
2752
2753        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2754        return 1;
2755}
2756
2757static int xsetbv_interception(struct vcpu_svm *svm)
2758{
2759        u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2760        u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2761
2762        if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2763                svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2764                skip_emulated_instruction(&svm->vcpu);
2765        }
2766
2767        return 1;
2768}
2769
2770static int invalid_op_interception(struct vcpu_svm *svm)
2771{
2772        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2773        return 1;
2774}
2775
2776static int task_switch_interception(struct vcpu_svm *svm)
2777{
2778        u16 tss_selector;
2779        int reason;
2780        int int_type = svm->vmcb->control.exit_int_info &
2781                SVM_EXITINTINFO_TYPE_MASK;
2782        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2783        uint32_t type =
2784                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2785        uint32_t idt_v =
2786                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2787        bool has_error_code = false;
2788        u32 error_code = 0;
2789
2790        tss_selector = (u16)svm->vmcb->control.exit_info_1;
2791
2792        if (svm->vmcb->control.exit_info_2 &
2793            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2794                reason = TASK_SWITCH_IRET;
2795        else if (svm->vmcb->control.exit_info_2 &
2796                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2797                reason = TASK_SWITCH_JMP;
2798        else if (idt_v)
2799                reason = TASK_SWITCH_GATE;
2800        else
2801                reason = TASK_SWITCH_CALL;
2802
2803        if (reason == TASK_SWITCH_GATE) {
2804                switch (type) {
2805                case SVM_EXITINTINFO_TYPE_NMI:
2806                        svm->vcpu.arch.nmi_injected = false;
2807                        break;
2808                case SVM_EXITINTINFO_TYPE_EXEPT:
2809                        if (svm->vmcb->control.exit_info_2 &
2810                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2811                                has_error_code = true;
2812                                error_code =
2813                                        (u32)svm->vmcb->control.exit_info_2;
2814                        }
2815                        kvm_clear_exception_queue(&svm->vcpu);
2816                        break;
2817                case SVM_EXITINTINFO_TYPE_INTR:
2818                        kvm_clear_interrupt_queue(&svm->vcpu);
2819                        break;
2820                default:
2821                        break;
2822                }
2823        }
2824
2825        if (reason != TASK_SWITCH_GATE ||
2826            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2827            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2828             (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2829                skip_emulated_instruction(&svm->vcpu);
2830
2831        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2832                int_vec = -1;
2833
2834        if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2835                                has_error_code, error_code) == EMULATE_FAIL) {
2836                svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2837                svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2838                svm->vcpu.run->internal.ndata = 0;
2839                return 0;
2840        }
2841        return 1;
2842}
2843
2844static int cpuid_interception(struct vcpu_svm *svm)
2845{
2846        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2847        kvm_emulate_cpuid(&svm->vcpu);
2848        return 1;
2849}
2850
2851static int iret_interception(struct vcpu_svm *svm)
2852{
2853        ++svm->vcpu.stat.nmi_window_exits;
2854        clr_intercept(svm, INTERCEPT_IRET);
2855        svm->vcpu.arch.hflags |= HF_IRET_MASK;
2856        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2857        return 1;
2858}
2859
2860static int invlpg_interception(struct vcpu_svm *svm)
2861{
2862        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2863                return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2864
2865        kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2866        skip_emulated_instruction(&svm->vcpu);
2867        return 1;
2868}
2869
2870static int emulate_on_interception(struct vcpu_svm *svm)
2871{
2872        return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2873}
2874
2875static int rdpmc_interception(struct vcpu_svm *svm)
2876{
2877        int err;
2878
2879        if (!static_cpu_has(X86_FEATURE_NRIPS))
2880                return emulate_on_interception(svm);
2881
2882        err = kvm_rdpmc(&svm->vcpu);
2883        kvm_complete_insn_gp(&svm->vcpu, err);
2884
2885        return 1;
2886}
2887
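    /*
     * If L1 intercepts selective CR0 writes, a write that changes any bit
     * outside SVM_CR0_SELECTIVE_MASK must be reflected to L1 as a
     * SVM_EXIT_CR0_SEL_WRITE #vmexit. Returns true if that happened.
     */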
2888bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
2889{
2890        unsigned long cr0 = svm->vcpu.arch.cr0;
2891        bool ret = false;
2892        u64 intercept;
2893
2894        intercept = svm->nested.intercept;
2895
2896        if (!is_guest_mode(&svm->vcpu) ||
2897            (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2898                return false;
2899
2900        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2901        val &= ~SVM_CR0_SELECTIVE_MASK;
2902
2903        if (cr0 ^ val) {
2904                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2905                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2906        }
2907
2908        return ret;
2909}
2910
2911#define CR_VALID (1ULL << 63)
2912
2913static int cr_interception(struct vcpu_svm *svm)
2914{
2915        int reg, cr;
2916        unsigned long val;
2917        int err;
2918
2919        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2920                return emulate_on_interception(svm);
2921
2922        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2923                return emulate_on_interception(svm);
2924
2925        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2926        cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2927
2928        err = 0;
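            /* Exit codes 0..15 are CR reads, 16..31 the corresponding writes. */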
2929        if (cr >= 16) { /* mov to cr */
2930                cr -= 16;
2931                val = kvm_register_read(&svm->vcpu, reg);
2932                switch (cr) {
2933                case 0:
2934                        if (!check_selective_cr0_intercepted(svm, val))
2935                                err = kvm_set_cr0(&svm->vcpu, val);
2936                        else
2937                                return 1;
2938
2939                        break;
2940                case 3:
2941                        err = kvm_set_cr3(&svm->vcpu, val);
2942                        break;
2943                case 4:
2944                        err = kvm_set_cr4(&svm->vcpu, val);
2945                        break;
2946                case 8:
2947                        err = kvm_set_cr8(&svm->vcpu, val);
2948                        break;
2949                default:
2950                        WARN(1, "unhandled write to CR%d", cr);
2951                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2952                        return 1;
2953                }
2954        } else { /* mov from cr */
2955                switch (cr) {
2956                case 0:
2957                        val = kvm_read_cr0(&svm->vcpu);
2958                        break;
2959                case 2:
2960                        val = svm->vcpu.arch.cr2;
2961                        break;
2962                case 3:
2963                        val = kvm_read_cr3(&svm->vcpu);
2964                        break;
2965                case 4:
2966                        val = kvm_read_cr4(&svm->vcpu);
2967                        break;
2968                case 8:
2969                        val = kvm_get_cr8(&svm->vcpu);
2970                        break;
2971                default:
2972                        WARN(1, "unhandled read from CR%d", cr);
2973                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2974                        return 1;
2975                }
2976                kvm_register_write(&svm->vcpu, reg, val);
2977        }
2978        kvm_complete_insn_gp(&svm->vcpu, err);
2979
2980        return 1;
2981}
2982
2983static int dr_interception(struct vcpu_svm *svm)
2984{
2985        int reg, dr;
2986        unsigned long val;
2987        int err;
2988
2989        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2990                return emulate_on_interception(svm);
2991
2992        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2993        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2994
2995        if (dr >= 16) { /* mov to DRn */
2996                val = kvm_register_read(&svm->vcpu, reg);
2997                kvm_set_dr(&svm->vcpu, dr - 16, val);
2998        } else {
2999                err = kvm_get_dr(&svm->vcpu, dr, &val);
3000                if (!err)
3001                        kvm_register_write(&svm->vcpu, reg, val);
3002        }
3003
3004        skip_emulated_instruction(&svm->vcpu);
3005
3006        return 1;
3007}
3008
3009static int cr8_write_interception(struct vcpu_svm *svm)
3010{
3011        struct kvm_run *kvm_run = svm->vcpu.run;
3012        int r;
3013
3014        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3015        /* instruction emulation calls kvm_set_cr8() */
3016        r = cr_interception(svm);
3017        if (irqchip_in_kernel(svm->vcpu.kvm)) {
3018                clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3019                return r;
3020        }
3021        if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3022                return r;
3023        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3024        return 0;
3025}
3026
3027u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
3028{
3029        struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3030        return vmcb->control.tsc_offset +
3031                svm_scale_tsc(vcpu, native_read_tsc());
3032}
3033
3034static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
3035{
3036        struct vcpu_svm *svm = to_svm(vcpu);
3037
3038        switch (ecx) {
3039        case MSR_IA32_TSC: {
3040                *data = svm->vmcb->control.tsc_offset +
3041                        svm_scale_tsc(vcpu, native_read_tsc());
3042
3043                break;
3044        }
3045        case MSR_STAR:
3046                *data = svm->vmcb->save.star;
3047                break;
3048#ifdef CONFIG_X86_64
3049        case MSR_LSTAR:
3050                *data = svm->vmcb->save.lstar;
3051                break;
3052        case MSR_CSTAR:
3053                *data = svm->vmcb->save.cstar;
3054                break;
3055        case MSR_KERNEL_GS_BASE:
3056                *data = svm->vmcb->save.kernel_gs_base;
3057                break;
3058        case MSR_SYSCALL_MASK:
3059                *data = svm->vmcb->save.sfmask;
3060                break;
3061#endif
3062        case MSR_IA32_SYSENTER_CS:
3063                *data = svm->vmcb->save.sysenter_cs;
3064                break;
3065        case MSR_IA32_SYSENTER_EIP:
3066                *data = svm->sysenter_eip;
3067                break;
3068        case MSR_IA32_SYSENTER_ESP:
3069                *data = svm->sysenter_esp;
3070                break;
3071        /*
3072         * Nobody will change the following 5 values in the VMCB so we can
3073         * safely return them on rdmsr. They will always be 0 until LBRV is
3074         * implemented.
3075         */
3076        case MSR_IA32_DEBUGCTLMSR:
3077                *data = svm->vmcb->save.dbgctl;
3078                break;
3079        case MSR_IA32_LASTBRANCHFROMIP:
3080                *data = svm->vmcb->save.br_from;
3081                break;
3082        case MSR_IA32_LASTBRANCHTOIP:
3083                *data = svm->vmcb->save.br_to;
3084                break;
3085        case MSR_IA32_LASTINTFROMIP:
3086                *data = svm->vmcb->save.last_excp_from;
3087                break;
3088        case MSR_IA32_LASTINTTOIP:
3089                *data = svm->vmcb->save.last_excp_to;
3090                break;
3091        case MSR_VM_HSAVE_PA:
3092                *data = svm->nested.hsave_msr;
3093                break;
3094        case MSR_VM_CR:
3095                *data = svm->nested.vm_cr_msr;
3096                break;
3097        case MSR_IA32_UCODE_REV:
3098                *data = 0x01000065;
3099                break;
3100        default:
3101                return kvm_get_msr_common(vcpu, ecx, data);
3102        }
3103        return 0;
3104}
3105
3106static int rdmsr_interception(struct vcpu_svm *svm)
3107{
3108        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3109        u64 data;
3110
3111        if (svm_get_msr(&svm->vcpu, ecx, &data)) {
3112                trace_kvm_msr_read_ex(ecx);
3113                kvm_inject_gp(&svm->vcpu, 0);
3114        } else {
3115                trace_kvm_msr_read(ecx, data);
3116
3117                svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
3118                svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
3119                svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3120                skip_emulated_instruction(&svm->vcpu);
3121        }
3122        return 1;
3123}
3124
3125static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3126{
3127        struct vcpu_svm *svm = to_svm(vcpu);
3128        int svm_dis, chg_mask;
3129
3130        if (data & ~SVM_VM_CR_VALID_MASK)
3131                return 1;
3132
3133        chg_mask = SVM_VM_CR_VALID_MASK;
3134
3135        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3136                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3137
3138        svm->nested.vm_cr_msr &= ~chg_mask;
3139        svm->nested.vm_cr_msr |= (data & chg_mask);
3140
3141        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3142
3143        /* check for svm_disable while efer.svme is set */
3144        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3145                return 1;
3146
3147        return 0;
3148}
3149
3150static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3151{
3152        struct vcpu_svm *svm = to_svm(vcpu);
3153
3154        switch (ecx) {
3155        case MSR_IA32_TSC:
3156                kvm_write_tsc(vcpu, data);
3157                break;
3158        case MSR_STAR:
3159                svm->vmcb->save.star = data;
3160                break;
3161#ifdef CONFIG_X86_64
3162        case MSR_LSTAR:
3163                svm->vmcb->save.lstar = data;
3164                break;
3165        case MSR_CSTAR:
3166                svm->vmcb->save.cstar = data;
3167                break;
3168        case MSR_KERNEL_GS_BASE:
3169                svm->vmcb->save.kernel_gs_base = data;
3170                break;
3171        case MSR_SYSCALL_MASK:
3172                svm->vmcb->save.sfmask = data;
3173                break;
3174#endif
3175        case MSR_IA32_SYSENTER_CS:
3176                svm->vmcb->save.sysenter_cs = data;
3177                break;
3178        case MSR_IA32_SYSENTER_EIP:
3179                svm->sysenter_eip = data;
3180                svm->vmcb->save.sysenter_eip = data;
3181                break;
3182        case MSR_IA32_SYSENTER_ESP:
3183                svm->sysenter_esp = data;
3184                svm->vmcb->save.sysenter_esp = data;
3185                break;
3186        case MSR_IA32_DEBUGCTLMSR:
3187                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3188                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3189                                    __func__, data);
3190                        break;
3191                }
3192                if (data & DEBUGCTL_RESERVED_BITS)
3193                        return 1;
3194
3195                svm->vmcb->save.dbgctl = data;
3196                mark_dirty(svm->vmcb, VMCB_LBR);
3197                if (data & (1ULL<<0))
3198                        svm_enable_lbrv(svm);
3199                else
3200                        svm_disable_lbrv(svm);
3201                break;
3202        case MSR_VM_HSAVE_PA:
3203                svm->nested.hsave_msr = data;
3204                break;
3205        case MSR_VM_CR:
3206                return svm_set_vm_cr(vcpu, data);
3207        case MSR_VM_IGNNE:
3208                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3209                break;
3210        default:
3211                return kvm_set_msr_common(vcpu, ecx, data);
3212        }
3213        return 0;
3214}
3215
3216static int wrmsr_interception(struct vcpu_svm *svm)
3217{
3218        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3219        u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
3220                | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3221
3222
3223        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3224        if (svm_set_msr(&svm->vcpu, ecx, data)) {
3225                trace_kvm_msr_write_ex(ecx, data);
3226                kvm_inject_gp(&svm->vcpu, 0);
3227        } else {
3228                trace_kvm_msr_write(ecx, data);
3229                skip_emulated_instruction(&svm->vcpu);
3230        }
3231        return 1;
3232}
3233
3234static int msr_interception(struct vcpu_svm *svm)
3235{
3236        if (svm->vmcb->control.exit_info_1)
3237                return wrmsr_interception(svm);
3238        else
3239                return rdmsr_interception(svm);
3240}
3241
3242static int interrupt_window_interception(struct vcpu_svm *svm)
3243{
3244        struct kvm_run *kvm_run = svm->vcpu.run;
3245
3246        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3247        svm_clear_vintr(svm);
3248        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3249        mark_dirty(svm->vmcb, VMCB_INTR);
3250        ++svm->vcpu.stat.irq_window_exits;
3251        /*
3252         * If user space is waiting to inject interrupts, exit as soon as
3253         * possible.
3254         */
3255        if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3256            kvm_run->request_interrupt_window &&
3257            !kvm_cpu_has_interrupt(&svm->vcpu)) {
3258                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3259                return 0;
3260        }
3261
3262        return 1;
3263}
3264
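/* A PAUSE intercept usually means a spinning vcpu; yield to another one. */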
3265static int pause_interception(struct vcpu_svm *svm)
3266{
3267        kvm_vcpu_on_spin(&(svm->vcpu));
3268        return 1;
3269}
3270
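/* Exit handler dispatch table, indexed by the VMCB exit code. */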
3271static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
3272        [SVM_EXIT_READ_CR0]                     = cr_interception,
3273        [SVM_EXIT_READ_CR3]                     = cr_interception,
3274        [SVM_EXIT_READ_CR4]                     = cr_interception,
3275        [SVM_EXIT_READ_CR8]                     = cr_interception,
3276        [SVM_EXIT_CR0_SEL_WRITE]                = emulate_on_interception,
3277        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3278        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3279        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3280        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3281        [SVM_EXIT_READ_DR0]                     = dr_interception,
3282        [SVM_EXIT_READ_DR1]                     = dr_interception,
3283        [SVM_EXIT_READ_DR2]                     = dr_interception,
3284        [SVM_EXIT_READ_DR3]                     = dr_interception,
3285        [SVM_EXIT_READ_DR4]                     = dr_interception,
3286        [SVM_EXIT_READ_DR5]                     = dr_interception,
3287        [SVM_EXIT_READ_DR6]                     = dr_interception,
3288        [SVM_EXIT_READ_DR7]                     = dr_interception,
3289        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3290        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3291        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3292        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3293        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3294        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3295        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3296        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3297        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3298        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3299        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3300        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3301        [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
3302        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3303        [SVM_EXIT_INTR]                         = intr_interception,
3304        [SVM_EXIT_NMI]                          = nmi_interception,
3305        [SVM_EXIT_SMI]                          = nop_on_interception,
3306        [SVM_EXIT_INIT]                         = nop_on_interception,
3307        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3308        [SVM_EXIT_RDPMC]                        = rdpmc_interception,
3309        [SVM_EXIT_CPUID]                        = cpuid_interception,
3310        [SVM_EXIT_IRET]                         = iret_interception,
3311        [SVM_EXIT_INVD]                         = emulate_on_interception,
3312        [SVM_EXIT_PAUSE]                        = pause_interception,
3313        [SVM_EXIT_HLT]                          = halt_interception,
3314        [SVM_EXIT_INVLPG]                       = invlpg_interception,
3315        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3316        [SVM_EXIT_IOIO]                         = io_interception,
3317        [SVM_EXIT_MSR]                          = msr_interception,
3318        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3319        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3320        [SVM_EXIT_VMRUN]                        = vmrun_interception,
3321        [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
3322        [SVM_EXIT_VMLOAD]                       = vmload_interception,
3323        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3324        [SVM_EXIT_STGI]                         = stgi_interception,
3325        [SVM_EXIT_CLGI]                         = clgi_interception,
3326        [SVM_EXIT_SKINIT]                       = skinit_interception,
3327        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
3328        [SVM_EXIT_MONITOR]                      = invalid_op_interception,
3329        [SVM_EXIT_MWAIT]                        = invalid_op_interception,
3330        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
3331        [SVM_EXIT_NPF]                          = pf_interception,
3332};
3333
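/* Dump the VMCB control and state-save areas, used when a VMRUN fails. */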
3334static void dump_vmcb(struct kvm_vcpu *vcpu)
3335{
3336        struct vcpu_svm *svm = to_svm(vcpu);
3337        struct vmcb_control_area *control = &svm->vmcb->control;
3338        struct vmcb_save_area *save = &svm->vmcb->save;
3339
3340        pr_err("VMCB Control Area:\n");
3341        pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3342        pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3343        pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3344        pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3345        pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3346        pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3347        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3348        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3349        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3350        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3351        pr_err("%-20s%d\n", "asid:", control->asid);
3352        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3353        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3354        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3355        pr_err("%-20s%08x\n", "int_state:", control->int_state);
3356        pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3357        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3358        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3359        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3360        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3361        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3362        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3363        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3364        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3365        pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3366        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3367        pr_err("VMCB State Save Area:\n");
3368        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3369               "es:",
3370               save->es.selector, save->es.attrib,
3371               save->es.limit, save->es.base);
3372        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3373               "cs:",
3374               save->cs.selector, save->cs.attrib,
3375               save->cs.limit, save->cs.base);
3376        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3377               "ss:",
3378               save->ss.selector, save->ss.attrib,
3379               save->ss.limit, save->ss.base);
3380        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3381               "ds:",
3382               save->ds.selector, save->ds.attrib,
3383               save->ds.limit, save->ds.base);
3384        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3385               "fs:",
3386               save->fs.selector, save->fs.attrib,
3387               save->fs.limit, save->fs.base);
3388        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3389               "gs:",
3390               save->gs.selector, save->gs.attrib,
3391               save->gs.limit, save->gs.base);
3392        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3393               "gdtr:",
3394               save->gdtr.selector, save->gdtr.attrib,
3395               save->gdtr.limit, save->gdtr.base);
3396        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3397               "ldtr:",
3398               save->ldtr.selector, save->ldtr.attrib,
3399               save->ldtr.limit, save->ldtr.base);
3400        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3401               "idtr:",
3402               save->idtr.selector, save->idtr.attrib,
3403               save->idtr.limit, save->idtr.base);
3404        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3405               "tr:",
3406               save->tr.selector, save->tr.attrib,
3407               save->tr.limit, save->tr.base);
3408        pr_err("cpl:            %d                efer:         %016llx\n",
3409                save->cpl, save->efer);
3410        pr_err("%-15s %016llx %-13s %016llx\n",
3411               "cr0:", save->cr0, "cr2:", save->cr2);
3412        pr_err("%-15s %016llx %-13s %016llx\n",
3413               "cr3:", save->cr3, "cr4:", save->cr4);
3414        pr_err("%-15s %016llx %-13s %016llx\n",
3415               "dr6:", save->dr6, "dr7:", save->dr7);
3416        pr_err("%-15s %016llx %-13s %016llx\n",
3417               "rip:", save->rip, "rflags:", save->rflags);
3418        pr_err("%-15s %016llx %-13s %016llx\n",
3419               "rsp:", save->rsp, "rax:", save->rax);
3420        pr_err("%-15s %016llx %-13s %016llx\n",
3421               "star:", save->star, "lstar:", save->lstar);
3422        pr_err("%-15s %016llx %-13s %016llx\n",
3423               "cstar:", save->cstar, "sfmask:", save->sfmask);
3424        pr_err("%-15s %016llx %-13s %016llx\n",
3425               "kernel_gs_base:", save->kernel_gs_base,
3426               "sysenter_cs:", save->sysenter_cs);
3427        pr_err("%-15s %016llx %-13s %016llx\n",
3428               "sysenter_esp:", save->sysenter_esp,
3429               "sysenter_eip:", save->sysenter_eip);
3430        pr_err("%-15s %016llx %-13s %016llx\n",
3431               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3432        pr_err("%-15s %016llx %-13s %016llx\n",
3433               "br_from:", save->br_from, "br_to:", save->br_to);
3434        pr_err("%-15s %016llx %-13s %016llx\n",
3435               "excp_from:", save->last_excp_from,
3436               "excp_to:", save->last_excp_to);
3437}
3438
3439static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3440{
3441        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3442
3443        *info1 = control->exit_info_1;
3444        *info2 = control->exit_info_2;
3445}
3446
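/*
 * Top-level #VMEXIT handler: sync cached guest state, give a nested
 * hypervisor the chance to handle the exit first, then dispatch to the
 * matching entry in svm_exit_handlers[].  Returns 0 to go back to
 * user space, 1 to re-enter the guest.
 */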
3447static int handle_exit(struct kvm_vcpu *vcpu)
3448{
3449        struct vcpu_svm *svm = to_svm(vcpu);
3450        struct kvm_run *kvm_run = vcpu->run;
3451        u32 exit_code = svm->vmcb->control.exit_code;
3452
3453        if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3454                vcpu->arch.cr0 = svm->vmcb->save.cr0;
3455        if (npt_enabled)
3456                vcpu->arch.cr3 = svm->vmcb->save.cr3;
3457
3458        if (unlikely(svm->nested.exit_required)) {
3459                nested_svm_vmexit(svm);
3460                svm->nested.exit_required = false;
3461
3462                return 1;
3463        }
3464
3465        if (is_guest_mode(vcpu)) {
3466                int vmexit;
3467
3468                trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
3469                                        svm->vmcb->control.exit_info_1,
3470                                        svm->vmcb->control.exit_info_2,
3471                                        svm->vmcb->control.exit_int_info,
3472                                        svm->vmcb->control.exit_int_info_err,
3473                                        KVM_ISA_SVM);
3474
3475                vmexit = nested_svm_exit_special(svm);
3476
3477                if (vmexit == NESTED_EXIT_CONTINUE)
3478                        vmexit = nested_svm_exit_handled(svm);
3479
3480                if (vmexit == NESTED_EXIT_DONE)
3481                        return 1;
3482        }
3483
3484        svm_complete_interrupts(svm);
3485
3486        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3487                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3488                kvm_run->fail_entry.hardware_entry_failure_reason
3489                        = svm->vmcb->control.exit_code;
3490                pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
3491                dump_vmcb(vcpu);
3492                return 0;
3493        }
3494
3495        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3496            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3497            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3498            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3499                printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3500                       "exit_code 0x%x\n",
3501                       __func__, svm->vmcb->control.exit_int_info,
3502                       exit_code);
3503
3504        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3505            || !svm_exit_handlers[exit_code]) {
3506                kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3507                kvm_run->hw.hardware_exit_reason = exit_code;
3508                return 0;
3509        }
3510
3511        return svm_exit_handlers[exit_code](svm);
3512}
3513
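/*
 * Reload the host TR after VMRUN: flip the TSS descriptor back to
 * "available" first, since ltr faults on a busy TSS descriptor.
 */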
3514static void reload_tss(struct kvm_vcpu *vcpu)
3515{
3516        int cpu = raw_smp_processor_id();
3517
3518        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3519        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3520        load_TR_desc();
3521}
3522
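/* Make sure this vcpu runs with an ASID from the current generation. */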
3523static void pre_svm_run(struct vcpu_svm *svm)
3524{
3525        int cpu = raw_smp_processor_id();
3526
3527        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3528
3529        /* FIXME: handle wraparound of asid_generation */
3530        if (svm->asid_generation != sd->asid_generation)
3531                new_asid(svm, sd);
3532}
3533
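/*
 * Queue an NMI via EVENTINJ and mask further NMIs until the guest's IRET,
 * which is intercepted so the mask can be lifted again.
 */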
3534static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3535{
3536        struct vcpu_svm *svm = to_svm(vcpu);
3537
3538        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3539        vcpu->arch.hflags |= HF_NMI_MASK;
3540        set_intercept(svm, INTERCEPT_IRET);
3541        ++vcpu->stat.nmi_injections;
3542}
3543
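/*
 * Request a virtual interrupt (V_IRQ) with the highest priority; together
 * with the VINTR intercept this signals when the guest can take interrupts.
 */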
3544static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3545{
3546        struct vmcb_control_area *control;
3547
3548        control = &svm->vmcb->control;
3549        control->int_vector = irq;
3550        control->int_ctl &= ~V_INTR_PRIO_MASK;
3551        control->int_ctl |= V_IRQ_MASK |
3552                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3553        mark_dirty(svm->vmcb, VMCB_INTR);
3554}
3555
3556static void svm_set_irq(struct kvm_vcpu *vcpu)
3557{
3558        struct vcpu_svm *svm = to_svm(vcpu);
3559
3560        BUG_ON(!(gif_set(svm)));
3561
3562        trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3563        ++vcpu->stat.irq_injections;
3564
3565        svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3566                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3567}
3568
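/*
 * If the highest pending interrupt is blocked by the TPR, intercept CR8
 * writes so we notice when the guest lowers its task priority.
 */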
3569static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3570{
3571        struct vcpu_svm *svm = to_svm(vcpu);
3572
3573        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3574                return;
3575
3576        if (irr == -1)
3577                return;
3578
3579        if (tpr >= irr)
3580                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3581}
3582
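/*
 * NMI injection is possible only without an interrupt shadow, with NMIs
 * unmasked, GIF set and no nested intercept standing in the way.
 */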
3583static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3584{
3585        struct vcpu_svm *svm = to_svm(vcpu);
3586        struct vmcb *vmcb = svm->vmcb;
3587        int ret;
3588        ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
3589              !(svm->vcpu.arch.hflags & HF_NMI_MASK);
3590        ret = ret && gif_set(svm) && nested_svm_nmi(svm);
3591
3592        return ret;
3593}
3594
3595static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3596{
3597        struct vcpu_svm *svm = to_svm(vcpu);
3598
3599        return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
3600}
3601
3602static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3603{
3604        struct vcpu_svm *svm = to_svm(vcpu);
3605
3606        if (masked) {
3607                svm->vcpu.arch.hflags |= HF_NMI_MASK;
3608                set_intercept(svm, INTERCEPT_IRET);
3609        } else {
3610                svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3611                clr_intercept(svm, INTERCEPT_IRET);
3612        }
3613}
3614
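/* Can an external interrupt be delivered to the guest right now? */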
3615static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3616{
3617        struct vcpu_svm *svm = to_svm(vcpu);
3618        struct vmcb *vmcb = svm->vmcb;
3619        int ret;
3620
3621        if (!gif_set(svm) ||
3622             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
3623                return 0;
3624
3625        ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3626
3627        if (is_guest_mode(vcpu))
3628                return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3629
3630        return ret;
3631}
3632
3633static void enable_irq_window(struct kvm_vcpu *vcpu)
3634{
3635        struct vcpu_svm *svm = to_svm(vcpu);
3636
3637        /*
3638         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3639         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3640         * get that intercept, this function will be called again and we'll
3641         * then get the VINTR intercept.
3642         */
3643        if (gif_set(svm) && nested_svm_intr(svm)) {
3644                svm_set_vintr(svm);
3645                svm_inject_irq(svm, 0x0);
3646        }
3647}
3648
3649static void enable_nmi_window(struct kvm_vcpu *vcpu)
3650{
3651        struct vcpu_svm *svm = to_svm(vcpu);
3652
3653        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3654            == HF_NMI_MASK)
3655                return; /* IRET will cause a vm exit */
3656
3657        /*
3658         * Something is preventing the NMI from being injected. Single-step
3659         * over the blocker (IRET, exception injection or interrupt shadow).
3660         */
3661        svm->nmi_singlestep = true;
3662        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3663        update_db_intercept(vcpu);
3664}
3665
3666static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3667{
3668        return 0;
3669}
3670
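/*
 * Flush guest TLB entries: by ASID if FLUSHBYASID is available, otherwise
 * by forcing a new ASID on the next VMRUN.
 */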
3671static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3672{
3673        struct vcpu_svm *svm = to_svm(vcpu);
3674
3675        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3676                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3677        else
3678                svm->asid_generation--;
3679}
3680
3681static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
3682{
3683}
3684
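/* Propagate the guest's V_TPR back into the emulated local APIC (CR8). */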
3685static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3686{
3687        struct vcpu_svm *svm = to_svm(vcpu);
3688
3689        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3690                return;
3691
3692        if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3693                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3694                kvm_set_cr8(vcpu, cr8);
3695        }
3696}
3697
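/* Copy the emulated local APIC's TPR into V_TPR before entering the guest. */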
3698static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3699{
3700        struct vcpu_svm *svm = to_svm(vcpu);
3701        u64 cr8;
3702
3703        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3704                return;
3705
3706        cr8 = kvm_get_cr8(vcpu);
3707        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3708        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3709}
3710
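/*
 * Process EXITINTINFO after a #VMEXIT: re-queue any event that was being
 * delivered when the exit occurred, and lift NMI masking once the guest
 * has made progress past the intercepted IRET.
 */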
3711static void svm_complete_interrupts(struct vcpu_svm *svm)
3712{
3713        u8 vector;
3714        int type;
3715        u32 exitintinfo = svm->vmcb->control.exit_int_info;
3716        unsigned int3_injected = svm->int3_injected;
3717
3718        svm->int3_injected = 0;
3719
3720        /*
3721         * If we've made progress since setting HF_IRET_MASK, we've
3722         * executed an IRET and can allow NMI injection.
3723         */
3724        if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3725            && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3726                svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3727                kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3728        }
3729
3730        svm->vcpu.arch.nmi_injected = false;
3731        kvm_clear_exception_queue(&svm->vcpu);
3732        kvm_clear_interrupt_queue(&svm->vcpu);
3733
3734        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3735                return;
3736
3737        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3738
3739        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3740        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3741
3742        switch (type) {
3743        case SVM_EXITINTINFO_TYPE_NMI:
3744                svm->vcpu.arch.nmi_injected = true;
3745                break;
3746        case SVM_EXITINTINFO_TYPE_EXEPT:
3747                /*
3748                 * In case of software exceptions, do not reinject the vector,
3749                 * but re-execute the instruction instead. Rewind RIP first
3750                 * if we emulated INT3 before.
3751                 */
3752                if (kvm_exception_is_soft(vector)) {
3753                        if (vector == BP_VECTOR && int3_injected &&
3754                            kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
3755                                kvm_rip_write(&svm->vcpu,
3756                                              kvm_rip_read(&svm->vcpu) -
3757                                              int3_injected);
3758                        break;
3759                }
3760                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3761                        u32 err = svm->vmcb->control.exit_int_info_err;
3762                        kvm_requeue_exception_e(&svm->vcpu, vector, err);
3763
3764                } else
3765                        kvm_requeue_exception(&svm->vcpu, vector);
3766                break;
3767        case SVM_EXITINTINFO_TYPE_INTR:
3768                kvm_queue_interrupt(&svm->vcpu, vector, false);
3769                break;
3770        default:
3771                break;
3772        }
3773}
3774
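/*
 * The guest entry is being abandoned, so the programmed injection will
 * not happen: move it from EVENTINJ into the exit-info fields and let
 * svm_complete_interrupts() re-queue it.
 */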
3775static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3776{
3777        struct vcpu_svm *svm = to_svm(vcpu);
3778        struct vmcb_control_area *control = &svm->vmcb->control;
3779
3780        control->exit_int_info = control->event_inj;
3781        control->exit_int_info_err = control->event_inj_err;
3782        control->event_inj = 0;
3783        svm_complete_interrupts(svm);
3784}
3785
3786#ifdef CONFIG_X86_64
3787#define R "r"
3788#else
3789#define R "e"
3790#endif
3791
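/*
 * Enter the guest: copy the cached RAX/RSP/RIP into the VMCB, run
 * VMLOAD/VMRUN/VMSAVE with GIF cleared, then pull the guest state back
 * out and do the exit work that must happen on this physical cpu.
 */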
3792static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3793{
3794        struct vcpu_svm *svm = to_svm(vcpu);
3795
3796        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3797        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3798        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3799
3800        /*
3801         * A vmexit emulation is required before the vcpu can be executed
3802         * again.
3803         */
3804        if (unlikely(svm->nested.exit_required))
3805                return;
3806
3807        pre_svm_run(svm);
3808
3809        sync_lapic_to_cr8(vcpu);
3810
3811        svm->vmcb->save.cr2 = vcpu->arch.cr2;
3812
3813        clgi();
3814
3815        local_irq_enable();
3816
3817        asm volatile (
3818                "push %%"R"bp; \n\t"
3819                "mov %c[rbx](%[svm]), %%"R"bx \n\t"
3820                "mov %c[rcx](%[svm]), %%"R"cx \n\t"
3821                "mov %c[rdx](%[svm]), %%"R"dx \n\t"
3822                "mov %c[rsi](%[svm]), %%"R"si \n\t"
3823                "mov %c[rdi](%[svm]), %%"R"di \n\t"
3824                "mov %c[rbp](%[svm]), %%"R"bp \n\t"
3825#ifdef CONFIG_X86_64
3826                "mov %c[r8](%[svm]),  %%r8  \n\t"
3827                "mov %c[r9](%[svm]),  %%r9  \n\t"
3828                "mov %c[r10](%[svm]), %%r10 \n\t"
3829                "mov %c[r11](%[svm]), %%r11 \n\t"
3830                "mov %c[r12](%[svm]), %%r12 \n\t"
3831                "mov %c[r13](%[svm]), %%r13 \n\t"
3832                "mov %c[r14](%[svm]), %%r14 \n\t"
3833                "mov %c[r15](%[svm]), %%r15 \n\t"
3834#endif
3835
3836                /* Enter guest mode */
3837                "push %%"R"ax \n\t"
3838                "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
3839                __ex(SVM_VMLOAD) "\n\t"
3840                __ex(SVM_VMRUN) "\n\t"
3841                __ex(SVM_VMSAVE) "\n\t"
3842                "pop %%"R"ax \n\t"
3843
3844                /* Save guest registers, load host registers */
3845                "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
3846                "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
3847                "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
3848                "mov %%"R"si, %c[rsi](%[svm]) \n\t"
3849                "mov %%"R"di, %c[rdi](%[svm]) \n\t"
3850                "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
3851#ifdef CONFIG_X86_64
3852                "mov %%r8,  %c[r8](%[svm]) \n\t"
3853                "mov %%r9,  %c[r9](%[svm]) \n\t"
3854                "mov %%r10, %c[r10](%[svm]) \n\t"
3855                "mov %%r11, %c[r11](%[svm]) \n\t"
3856                "mov %%r12, %c[r12](%[svm]) \n\t"
3857                "mov %%r13, %c[r13](%[svm]) \n\t"
3858                "mov %%r14, %c[r14](%[svm]) \n\t"
3859                "mov %%r15, %c[r15](%[svm]) \n\t"
3860#endif
3861                "pop %%"R"bp"
3862                :
3863                : [svm]"a"(svm),
3864                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
3865                  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
3866                  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
3867                  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
3868                  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
3869                  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
3870                  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
3871#ifdef CONFIG_X86_64
3872                  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
3873                  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
3874                  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
3875                  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
3876                  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
3877                  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
3878                  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
3879                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
3880#endif
3881                : "cc", "memory"
3882                , R"bx", R"cx", R"dx", R"si", R"di"
3883#ifdef CONFIG_X86_64
3884                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
3885#endif
3886                );
3887
3888#ifdef CONFIG_X86_64
3889        wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3890#else
3891        loadsegment(fs, svm->host.fs);
3892#ifndef CONFIG_X86_32_LAZY_GS
3893        loadsegment(gs, svm->host.gs);
3894#endif
3895#endif
3896
3897        reload_tss(vcpu);
3898
3899        local_irq_disable();
3900
3901        vcpu->arch.cr2 = svm->vmcb->save.cr2;
3902        vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3903        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3904        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3905
3906        trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
3907
3908        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3909                kvm_before_handle_nmi(&svm->vcpu);
3910
3911        stgi();
3912
3913        /* Any pending NMI will happen here */
3914
3915        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3916                kvm_after_handle_nmi(&svm->vcpu);
3917
3918        sync_cr8_to_lapic(vcpu);
3919
3920        svm->next_rip = 0;
3921
3922        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3923
3924        /* if exit due to PF check for async PF */
3925        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3926                svm->apf_reason = kvm_read_and_reset_pf_reason();
3927
3928        if (npt_enabled) {
3929                vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3930                vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
3931        }
3932
3933        /*
3934         * We need to handle MC intercepts here, before the vcpu has a chance
3935         * to be scheduled on a different physical cpu.
3936         */
3937        if (unlikely(svm->vmcb->control.exit_code ==
3938                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
3939                svm_handle_mce(svm);
3940
3941        mark_all_clean(svm->vmcb);
3942}
3943
3944#undef R
3945
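/* Load a new (shadow) page table root into the guest's CR3. */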
3946static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3947{
3948        struct vcpu_svm *svm = to_svm(vcpu);
3949
3950        svm->vmcb->save.cr3 = root;
3951        mark_dirty(svm->vmcb, VMCB_CR);
3952        svm_flush_tlb(vcpu);
3953}
3954
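/* Nested paging: load the new NPT root into nCR3. */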
3955static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3956{
3957        struct vcpu_svm *svm = to_svm(vcpu);
3958
3959        svm->vmcb->control.nested_cr3 = root;
3960        mark_dirty(svm->vmcb, VMCB_NPT);
3961
3962        /* Also sync guest cr3 here in case we live migrate */
3963        svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3964        mark_dirty(svm->vmcb, VMCB_CR);
3965
3966        svm_flush_tlb(vcpu);
3967}
3968
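/* Has the BIOS disabled SVM via the VM_CR MSR? */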
3969static int is_disabled(void)
3970{
3971        u64 vm_cr;
3972
3973        rdmsrl(MSR_VM_CR, vm_cr);
3974        if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
3975                return 1;
3976
3977        return 0;
3978}
3979
3980static void
3981svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3982{
3983        /*
3984         * Patch in the VMMCALL instruction:
3985         */
3986        hypercall[0] = 0x0f;
3987        hypercall[1] = 0x01;
3988        hypercall[2] = 0xd9;
3989}
3990
3991static void svm_check_processor_compat(void *rtn)
3992{
3993        *(int *)rtn = 0;
3994}
3995
3996static bool svm_cpu_has_accelerated_tpr(void)
3997{
3998        return false;
3999}
4000
4001static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4002{
4003        return 0;
4004}
4005
4006static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4007{
4008}
4009
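/*
 * Adjust CPUID leaves reported to the guest: expose the SVM bit when
 * nesting is enabled and advertise the nested-SVM features we emulate.
 */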
4010static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4011{
4012        switch (func) {
4013        case 0x80000001:
4014                if (nested)
4015                        entry->ecx |= (1 << 2); /* Set SVM bit */
4016                break;
4017        case 0x8000000A:
4018                entry->eax = 1; /* SVM revision 1 */
4019                entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
4020                                   ASID emulation to nested SVM */
4021                entry->ecx = 0; /* Reserved */
4022                entry->edx = 0; /* Per default do not support any
4023                                   additional features */
4024
4025                /* Support next_rip if host supports it */
4026                if (boot_cpu_has(X86_FEATURE_NRIPS))
4027                        entry->edx |= SVM_FEATURE_NRIP;
4028
4029                /* Support NPT for the guest if enabled */
4030                if (npt_enabled)
4031                        entry->edx |= SVM_FEATURE_NPT;
4032
4033                break;
4034        }
4035}
4036
4037static int svm_get_lpage_level(void)
4038{
4039        return PT_PDPE_LEVEL;
4040}
4041
4042static bool svm_rdtscp_supported(void)
4043{
4044        return false;
4045}
4046
4047static bool svm_invpcid_supported(void)
4048{
4049        return false;
4050}
4051
4052static bool svm_has_wbinvd_exit(void)
4053{
4054        return true;
4055}
4056
4057static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4058{
4059        struct vcpu_svm *svm = to_svm(vcpu);
4060
4061        set_exception_intercept(svm, NM_VECTOR);
4062        update_cr0_intercept(svm);
4063}
4064
4065#define PRE_EX(exit)  { .exit_code = (exit), \
4066                        .stage = X86_ICPT_PRE_EXCEPT, }
4067#define POST_EX(exit) { .exit_code = (exit), \
4068                        .stage = X86_ICPT_POST_EXCEPT, }
4069#define POST_MEM(exit) { .exit_code = (exit), \
4070                        .stage = X86_ICPT_POST_MEMACCESS, }
4071
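/* Map emulator intercept ids to SVM exit codes and the stage they hit. */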
4072static struct __x86_intercept {
4073        u32 exit_code;
4074        enum x86_intercept_stage stage;
4075} x86_intercept_map[] = {
4076        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4077        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4078        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4079        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4080        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4081        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4082        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4083        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4084        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4085        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4086        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4087        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4088        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4089        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4090        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4091        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4092        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4093        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4094        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4095        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4096        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4097        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4098        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4099        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4100        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4101        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4102        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4103        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4104        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4105        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4106        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4107        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4108        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4109        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4110        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4111        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4112        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4113        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4114        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4115        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4116        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4117        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4118        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4119        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4120        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4121        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4122};
4123
4124#undef PRE_EX
4125#undef POST_EX
4126#undef POST_MEM
4127
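/*
 * Called from the instruction emulator while emulating on behalf of L2:
 * translate the instruction into the equivalent SVM exit code and ask
 * the nested-exit logic whether L1 wants to intercept it.
 */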
4128static int svm_check_intercept(struct kvm_vcpu *vcpu,
4129                               struct x86_instruction_info *info,
4130                               enum x86_intercept_stage stage)
4131{
4132        struct vcpu_svm *svm = to_svm(vcpu);
4133        int vmexit, ret = X86EMUL_CONTINUE;
4134        struct __x86_intercept icpt_info;
4135        struct vmcb *vmcb = svm->vmcb;
4136
4137        if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4138                goto out;
4139
4140        icpt_info = x86_intercept_map[info->intercept];
4141
4142        if (stage != icpt_info.stage)
4143                goto out;
4144
4145        switch (icpt_info.exit_code) {
4146        case SVM_EXIT_READ_CR0:
4147                if (info->intercept == x86_intercept_cr_read)
4148                        icpt_info.exit_code += info->modrm_reg;
4149                break;
4150        case SVM_EXIT_WRITE_CR0: {
4151                unsigned long cr0, val;
4152                u64 intercept;
4153
4154                if (info->intercept == x86_intercept_cr_write)
4155                        icpt_info.exit_code += info->modrm_reg;
4156
4157                if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
4158                        break;
4159
4160                intercept = svm->nested.intercept;
4161
4162                if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4163                        break;
4164
4165                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4166                val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4167
4168                if (info->intercept == x86_intercept_lmsw) {
4169                        cr0 &= 0xfUL;
4170                        val &= 0xfUL;
4171                        /* lmsw can't clear PE - catch this here */
4172                        if (cr0 & X86_CR0_PE)
4173                                val |= X86_CR0_PE;
4174                }
4175
4176                if (cr0 ^ val)
4177                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4178
4179                break;
4180        }
4181        case SVM_EXIT_READ_DR0:
4182        case SVM_EXIT_WRITE_DR0:
4183                icpt_info.exit_code += info->modrm_reg;
4184                break;
4185        case SVM_EXIT_MSR:
4186                if (info->intercept == x86_intercept_wrmsr)
4187                        vmcb->control.exit_info_1 = 1;
4188                else
4189                        vmcb->control.exit_info_1 = 0;
4190                break;
4191        case SVM_EXIT_PAUSE:
4192                /*
4193                 * We only get this intercept for NOP, but PAUSE is
4194                 * encoded as REP NOP, so check the REP prefix here.
4195                 */
4196                if (info->rep_prefix != REPE_PREFIX)
4197                        goto out;
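                /* A REP-prefixed PAUSE falls through to the IOIO handling below. */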
4198        case SVM_EXIT_IOIO: {
4199                u64 exit_info;
4200                u32 bytes;
4201
4202                exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
4203
4204                if (info->intercept == x86_intercept_in ||
4205                    info->intercept == x86_intercept_ins) {
4206                        exit_info |= SVM_IOIO_TYPE_MASK;
4207                        bytes = info->src_bytes;
4208                } else {
4209                        bytes = info->dst_bytes;
4210                }
4211
4212                if (info->intercept == x86_intercept_outs ||
4213                    info->intercept == x86_intercept_ins)
4214                        exit_info |= SVM_IOIO_STR_MASK;
4215
4216                if (info->rep_prefix)
4217                        exit_info |= SVM_IOIO_REP_MASK;
4218
4219                bytes = min(bytes, 4u);
4220
4221                exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4222
4223                exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4224
4225                vmcb->control.exit_info_1 = exit_info;
4226                vmcb->control.exit_info_2 = info->next_rip;
4227
4228                break;
4229        }
4230        default:
4231                break;
4232        }
4233
4234        vmcb->control.next_rip  = info->next_rip;
4235        vmcb->control.exit_code = icpt_info.exit_code;
4236        vmexit = nested_svm_exit_handled(svm);
4237
4238        ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4239                                           : X86EMUL_CONTINUE;
4240
4241out:
4242        return ret;
4243}
4244
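/* Callbacks wiring the SVM implementation into the generic KVM x86 code. */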
4245static struct kvm_x86_ops svm_x86_ops = {
4246        .cpu_has_kvm_support = has_svm,
4247        .disabled_by_bios = is_disabled,
4248        .hardware_setup = svm_hardware_setup,
4249        .hardware_unsetup = svm_hardware_unsetup,
4250        .check_processor_compatibility = svm_check_processor_compat,
4251        .hardware_enable = svm_hardware_enable,
4252        .hardware_disable = svm_hardware_disable,
4253        .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4254
4255        .vcpu_create = svm_create_vcpu,
4256        .vcpu_free = svm_free_vcpu,
4257        .vcpu_reset = svm_vcpu_reset,
4258
4259        .prepare_guest_switch = svm_prepare_guest_switch,
4260        .vcpu_load = svm_vcpu_load,
4261        .vcpu_put = svm_vcpu_put,
4262
4263        .set_guest_debug = svm_guest_debug,
4264        .get_msr = svm_get_msr,
4265        .set_msr = svm_set_msr,
4266        .get_segment_base = svm_get_segment_base,
4267        .get_segment = svm_get_segment,
4268        .set_segment = svm_set_segment,
4269        .get_cpl = svm_get_cpl,
4270        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4271        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4272        .decache_cr3 = svm_decache_cr3,
4273        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
4274        .set_cr0 = svm_set_cr0,
4275        .set_cr3 = svm_set_cr3,
4276        .set_cr4 = svm_set_cr4,
4277        .set_efer = svm_set_efer,
4278        .get_idt = svm_get_idt,
4279        .set_idt = svm_set_idt,
4280        .get_gdt = svm_get_gdt,
4281        .set_gdt = svm_set_gdt,
4282        .set_dr7 = svm_set_dr7,
4283        .cache_reg = svm_cache_reg,
4284        .get_rflags = svm_get_rflags,
4285        .set_rflags = svm_set_rflags,
4286        .fpu_activate = svm_fpu_activate,
4287        .fpu_deactivate = svm_fpu_deactivate,
4288
4289        .tlb_flush = svm_flush_tlb,
4290
4291        .run = svm_vcpu_run,
4292        .handle_exit = handle_exit,
4293        .skip_emulated_instruction = skip_emulated_instruction,
4294        .set_interrupt_shadow = svm_set_interrupt_shadow,
4295        .get_interrupt_shadow = svm_get_interrupt_shadow,
4296        .patch_hypercall = svm_patch_hypercall,
4297        .set_irq = svm_set_irq,
4298        .set_nmi = svm_inject_nmi,
4299        .queue_exception = svm_queue_exception,
4300        .cancel_injection = svm_cancel_injection,
4301        .interrupt_allowed = svm_interrupt_allowed,
4302        .nmi_allowed = svm_nmi_allowed,
4303        .get_nmi_mask = svm_get_nmi_mask,
4304        .set_nmi_mask = svm_set_nmi_mask,
4305        .enable_nmi_window = enable_nmi_window,
4306        .enable_irq_window = enable_irq_window,
4307        .update_cr8_intercept = update_cr8_intercept,
4308
4309        .set_tss_addr = svm_set_tss_addr,
4310        .get_tdp_level = get_npt_level,
4311        .get_mt_mask = svm_get_mt_mask,
4312
4313        .get_exit_info = svm_get_exit_info,
4314
4315        .get_lpage_level = svm_get_lpage_level,
4316
4317        .cpuid_update = svm_cpuid_update,
4318
4319        .rdtscp_supported = svm_rdtscp_supported,
4320        .invpcid_supported = svm_invpcid_supported,
4321
4322        .set_supported_cpuid = svm_set_supported_cpuid,
4323
4324        .has_wbinvd_exit = svm_has_wbinvd_exit,
4325
4326        .set_tsc_khz = svm_set_tsc_khz,
4327        .write_tsc_offset = svm_write_tsc_offset,
4328        .adjust_tsc_offset = svm_adjust_tsc_offset,
4329        .compute_tsc_offset = svm_compute_tsc_offset,
4330        .read_l1_tsc = svm_read_l1_tsc,
4331
4332        .set_tdp_cr3 = set_tdp_cr3,
4333
4334        .check_intercept = svm_check_intercept,
4335};
4336
4337static int __init svm_init(void)
4338{
4339        return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
4340                        __alignof__(struct vcpu_svm), THIS_MODULE);
4341}
4342
4343static void __exit svm_exit(void)
4344{
4345        kvm_exit();
4346}
4347
4348module_init(svm_init)
4349module_exit(svm_exit)
4350