linux/arch/x86/kvm/svm.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * AMD SVM support
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   8 *
   9 * Authors:
  10 *   Yaniv Kamay  <yaniv@qumranet.com>
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17#include <linux/kvm_host.h>
  18
  19#include "irq.h"
  20#include "mmu.h"
  21#include "kvm_cache_regs.h"
  22#include "x86.h"
  23#include "cpuid.h"
  24
  25#include <linux/module.h>
  26#include <linux/mod_devicetable.h>
  27#include <linux/kernel.h>
  28#include <linux/vmalloc.h>
  29#include <linux/highmem.h>
  30#include <linux/sched.h>
  31#include <linux/ftrace_event.h>
  32#include <linux/slab.h>
  33
  34#include <asm/perf_event.h>
  35#include <asm/tlbflush.h>
  36#include <asm/desc.h>
  37#include <asm/debugreg.h>
  38#include <asm/kvm_para.h>
  39
  40#include <asm/virtext.h>
  41#include "trace.h"
  42
  43#define __ex(x) __kvm_handle_fault_on_reboot(x)
  44
  45MODULE_AUTHOR("Qumranet");
  46MODULE_LICENSE("GPL");
  47
  48static const struct x86_cpu_id svm_cpu_id[] = {
  49        X86_FEATURE_MATCH(X86_FEATURE_SVM),
  50        {}
  51};
  52MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  53
  54#define IOPM_ALLOC_ORDER 2
  55#define MSRPM_ALLOC_ORDER 1
  56
  57#define SEG_TYPE_LDT 2
  58#define SEG_TYPE_BUSY_TSS16 3
  59
  60#define SVM_FEATURE_NPT            (1 <<  0)
  61#define SVM_FEATURE_LBRV           (1 <<  1)
  62#define SVM_FEATURE_SVML           (1 <<  2)
  63#define SVM_FEATURE_NRIP           (1 <<  3)
  64#define SVM_FEATURE_TSC_RATE       (1 <<  4)
  65#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
  66#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
  67#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  68#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  69
  70#define NESTED_EXIT_HOST        0       /* Exit handled on host level */
  71#define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
  72#define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
  73
  74#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
  75
  76#define TSC_RATIO_RSVD          0xffffff0000000000ULL
  77#define TSC_RATIO_MIN           0x0000000000000001ULL
  78#define TSC_RATIO_MAX           0x000000ffffffffffULL
  79
  80static bool erratum_383_found __read_mostly;
  81
  82static const u32 host_save_user_msrs[] = {
  83#ifdef CONFIG_X86_64
  84        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
  85        MSR_FS_BASE,
  86#endif
  87        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
  88};
  89
  90#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
  91
  92struct kvm_vcpu;
  93
  94struct nested_state {
  95        struct vmcb *hsave;
  96        u64 hsave_msr;
  97        u64 vm_cr_msr;
  98        u64 vmcb;
  99
 100        /* These are the merged vectors */
 101        u32 *msrpm;
 102
 103        /* gpa pointers to the real vectors */
 104        u64 vmcb_msrpm;
 105        u64 vmcb_iopm;
 106
 107        /* A VMEXIT is required but not yet emulated */
 108        bool exit_required;
 109
 110        /* cache for intercepts of the guest */
 111        u32 intercept_cr;
 112        u32 intercept_dr;
 113        u32 intercept_exceptions;
 114        u64 intercept;
 115
 116        /* Nested Paging related state */
 117        u64 nested_cr3;
 118};
 119
 120#define MSRPM_OFFSETS   16
 121static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 122
 123/*
  124 * Set osvw_len to a higher value when updated Revision Guides
 125 * are published and we know what the new status bits are
 126 */
 127static uint64_t osvw_len = 4, osvw_status;
 128
 129struct vcpu_svm {
 130        struct kvm_vcpu vcpu;
 131        struct vmcb *vmcb;
 132        unsigned long vmcb_pa;
 133        struct svm_cpu_data *svm_data;
 134        uint64_t asid_generation;
 135        uint64_t sysenter_esp;
 136        uint64_t sysenter_eip;
 137
 138        u64 next_rip;
 139
 140        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 141        struct {
 142                u16 fs;
 143                u16 gs;
 144                u16 ldt;
 145                u64 gs_base;
 146        } host;
 147
 148        u32 *msrpm;
 149
 150        ulong nmi_iret_rip;
 151
 152        struct nested_state nested;
 153
 154        bool nmi_singlestep;
 155
 156        unsigned int3_injected;
 157        unsigned long int3_rip;
 158        u32 apf_reason;
 159
 160        u64  tsc_ratio;
 161};
 162
 163static DEFINE_PER_CPU(u64, current_tsc_ratio);
 164#define TSC_RATIO_DEFAULT       0x0100000000ULL
 165
 166#define MSR_INVALID                     0xffffffffU
 167
 168static const struct svm_direct_access_msrs {
 169        u32 index;   /* Index of the MSR */
 170        bool always; /* True if intercept is always on */
 171} direct_access_msrs[] = {
 172        { .index = MSR_STAR,                            .always = true  },
 173        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
 174#ifdef CONFIG_X86_64
 175        { .index = MSR_GS_BASE,                         .always = true  },
 176        { .index = MSR_FS_BASE,                         .always = true  },
 177        { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
 178        { .index = MSR_LSTAR,                           .always = true  },
 179        { .index = MSR_CSTAR,                           .always = true  },
 180        { .index = MSR_SYSCALL_MASK,                    .always = true  },
 181#endif
 182        { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 183        { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 184        { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 185        { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 186        { .index = MSR_INVALID,                         .always = false },
 187};
 188
 189/* enable NPT for AMD64 and X86 with PAE */
 190#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 191static bool npt_enabled = true;
 192#else
 193static bool npt_enabled;
 194#endif
 195
 196/* allow nested paging (virtualized MMU) for all guests */
 197static int npt = true;
 198module_param(npt, int, S_IRUGO);
 199
 200/* allow nested virtualization in KVM/SVM */
 201static int nested = true;
 202module_param(nested, int, S_IRUGO);
 203
 204static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 205static void svm_complete_interrupts(struct vcpu_svm *svm);
 206
 207static int nested_svm_exit_handled(struct vcpu_svm *svm);
 208static int nested_svm_intercept(struct vcpu_svm *svm);
 209static int nested_svm_vmexit(struct vcpu_svm *svm);
 210static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 211                                      bool has_error_code, u32 error_code);
 212static u64 __scale_tsc(u64 ratio, u64 tsc);
 213
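     /*
      * VMCB "clean bits": when the VMCB_CLEAN feature is present, the CPU
      * may cache parts of the VMCB across VMRUNs.  Each enum value below
      * names one group of VMCB fields; a set bit in vmcb->control.clean
      * tells the CPU that the group is unchanged since the last VMRUN and
      * may be taken from its cache.  mark_dirty() clears a bit to force a
      * reload, and the fields in VMCB_ALWAYS_DIRTY_MASK are rewritten
      * before every VMRUN anyway.
      */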
 214enum {
 215        VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 216                            pause filter count */
 217        VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 218        VMCB_ASID,       /* ASID */
 219        VMCB_INTR,       /* int_ctl, int_vector */
 220        VMCB_NPT,        /* npt_en, nCR3, gPAT */
 221        VMCB_CR,         /* CR0, CR3, CR4, EFER */
 222        VMCB_DR,         /* DR6, DR7 */
 223        VMCB_DT,         /* GDT, IDT */
 224        VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 225        VMCB_CR2,        /* CR2 only */
 226        VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
 227        VMCB_DIRTY_MAX,
 228};
 229
 230/* TPR and CR2 are always written before VMRUN */
 231#define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
 232
 233static inline void mark_all_dirty(struct vmcb *vmcb)
 234{
 235        vmcb->control.clean = 0;
 236}
 237
 238static inline void mark_all_clean(struct vmcb *vmcb)
 239{
 240        vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
 241                               & ~VMCB_ALWAYS_DIRTY_MASK;
 242}
 243
 244static inline void mark_dirty(struct vmcb *vmcb, int bit)
 245{
 246        vmcb->control.clean &= ~(1 << bit);
 247}
 248
 249static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 250{
 251        return container_of(vcpu, struct vcpu_svm, vcpu);
 252}
 253
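     /*
      * While a nested (L2) guest is running, the intercept bits in the
      * active VMCB must be the union of what KVM itself needs (kept in the
      * host state-save area, hsave) and what the L1 hypervisor requested for
      * its guest (cached in svm->nested).  Recompute that merge whenever
      * either side changes an intercept.
      */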
 254static void recalc_intercepts(struct vcpu_svm *svm)
 255{
 256        struct vmcb_control_area *c, *h;
 257        struct nested_state *g;
 258
 259        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 260
 261        if (!is_guest_mode(&svm->vcpu))
 262                return;
 263
 264        c = &svm->vmcb->control;
 265        h = &svm->nested.hsave->control;
 266        g = &svm->nested;
 267
 268        c->intercept_cr = h->intercept_cr | g->intercept_cr;
 269        c->intercept_dr = h->intercept_dr | g->intercept_dr;
 270        c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 271        c->intercept = h->intercept | g->intercept;
 272}
 273
 274static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
 275{
 276        if (is_guest_mode(&svm->vcpu))
 277                return svm->nested.hsave;
 278        else
 279                return svm->vmcb;
 280}
 281
 282static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
 283{
 284        struct vmcb *vmcb = get_host_vmcb(svm);
 285
 286        vmcb->control.intercept_cr |= (1U << bit);
 287
 288        recalc_intercepts(svm);
 289}
 290
 291static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
 292{
 293        struct vmcb *vmcb = get_host_vmcb(svm);
 294
 295        vmcb->control.intercept_cr &= ~(1U << bit);
 296
 297        recalc_intercepts(svm);
 298}
 299
 300static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 301{
 302        struct vmcb *vmcb = get_host_vmcb(svm);
 303
 304        return vmcb->control.intercept_cr & (1U << bit);
 305}
 306
 307static inline void set_dr_intercepts(struct vcpu_svm *svm)
 308{
 309        struct vmcb *vmcb = get_host_vmcb(svm);
 310
 311        vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
 312                | (1 << INTERCEPT_DR1_READ)
 313                | (1 << INTERCEPT_DR2_READ)
 314                | (1 << INTERCEPT_DR3_READ)
 315                | (1 << INTERCEPT_DR4_READ)
 316                | (1 << INTERCEPT_DR5_READ)
 317                | (1 << INTERCEPT_DR6_READ)
 318                | (1 << INTERCEPT_DR7_READ)
 319                | (1 << INTERCEPT_DR0_WRITE)
 320                | (1 << INTERCEPT_DR1_WRITE)
 321                | (1 << INTERCEPT_DR2_WRITE)
 322                | (1 << INTERCEPT_DR3_WRITE)
 323                | (1 << INTERCEPT_DR4_WRITE)
 324                | (1 << INTERCEPT_DR5_WRITE)
 325                | (1 << INTERCEPT_DR6_WRITE)
 326                | (1 << INTERCEPT_DR7_WRITE);
 327
 328        recalc_intercepts(svm);
 329}
 330
 331static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 332{
 333        struct vmcb *vmcb = get_host_vmcb(svm);
 334
 335        vmcb->control.intercept_dr = 0;
 336
 337        recalc_intercepts(svm);
 338}
 339
 340static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
 341{
 342        struct vmcb *vmcb = get_host_vmcb(svm);
 343
 344        vmcb->control.intercept_exceptions |= (1U << bit);
 345
 346        recalc_intercepts(svm);
 347}
 348
 349static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 350{
 351        struct vmcb *vmcb = get_host_vmcb(svm);
 352
 353        vmcb->control.intercept_exceptions &= ~(1U << bit);
 354
 355        recalc_intercepts(svm);
 356}
 357
 358static inline void set_intercept(struct vcpu_svm *svm, int bit)
 359{
 360        struct vmcb *vmcb = get_host_vmcb(svm);
 361
 362        vmcb->control.intercept |= (1ULL << bit);
 363
 364        recalc_intercepts(svm);
 365}
 366
 367static inline void clr_intercept(struct vcpu_svm *svm, int bit)
 368{
 369        struct vmcb *vmcb = get_host_vmcb(svm);
 370
 371        vmcb->control.intercept &= ~(1ULL << bit);
 372
 373        recalc_intercepts(svm);
 374}
 375
 376static inline void enable_gif(struct vcpu_svm *svm)
 377{
 378        svm->vcpu.arch.hflags |= HF_GIF_MASK;
 379}
 380
 381static inline void disable_gif(struct vcpu_svm *svm)
 382{
 383        svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
 384}
 385
 386static inline bool gif_set(struct vcpu_svm *svm)
 387{
 388        return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 389}
 390
 391static unsigned long iopm_base;
 392
 393struct kvm_ldttss_desc {
 394        u16 limit0;
 395        u16 base0;
 396        unsigned base1:8, type:5, dpl:2, p:1;
 397        unsigned limit1:4, zero0:3, g:1, base2:8;
 398        u32 base3;
 399        u32 zero1;
 400} __attribute__((packed));
 401
 402struct svm_cpu_data {
 403        int cpu;
 404
 405        u64 asid_generation;
 406        u32 max_asid;
 407        u32 next_asid;
 408        struct kvm_ldttss_desc *tss_desc;
 409
 410        struct page *save_area;
 411};
 412
 413static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 414
 415struct svm_init_data {
 416        int cpu;
 417        int r;
 418};
 419
 420static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 421
 422#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 423#define MSRS_RANGE_SIZE 2048
 424#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 425
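     /*
      * The MSR permission bitmap uses two bits per MSR (read and write
      * intercept) and covers three architectural ranges of 8192 MSRs each
      * (see msrpm_ranges above), 2048 bytes per range.  This helper converts
      * an MSR number into the u32-word offset of the byte holding its two
      * bits.  For example, MSR_STAR (0xc0000081) lies in the second range:
      * byte (0x81 / 4) + 2048 = 2080, i.e. u32 word 520.
      */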
 426static u32 svm_msrpm_offset(u32 msr)
 427{
 428        u32 offset;
 429        int i;
 430
 431        for (i = 0; i < NUM_MSR_MAPS; i++) {
 432                if (msr < msrpm_ranges[i] ||
 433                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 434                        continue;
 435
 436                offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 437                offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 438
 439                /* Now we have the u8 offset - but need the u32 offset */
 440                return offset / 4;
 441        }
 442
 443        /* MSR not in any range */
 444        return MSR_INVALID;
 445}
 446
 447#define MAX_INST_SIZE 15
 448
 449static inline void clgi(void)
 450{
 451        asm volatile (__ex(SVM_CLGI));
 452}
 453
 454static inline void stgi(void)
 455{
 456        asm volatile (__ex(SVM_STGI));
 457}
 458
 459static inline void invlpga(unsigned long addr, u32 asid)
 460{
 461        asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 462}
 463
 464static int get_npt_level(void)
 465{
 466#ifdef CONFIG_X86_64
 467        return PT64_ROOT_LEVEL;
 468#else
 469        return PT32E_ROOT_LEVEL;
 470#endif
 471}
 472
 473static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 474{
 475        vcpu->arch.efer = efer;
 476        if (!npt_enabled && !(efer & EFER_LMA))
 477                efer &= ~EFER_LME;
 478
 479        to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
 480        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 481}
 482
 483static int is_external_interrupt(u32 info)
 484{
 485        info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 486        return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 487}
 488
 489static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 490{
 491        struct vcpu_svm *svm = to_svm(vcpu);
 492        u32 ret = 0;
 493
 494        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 495                ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 496        return ret;
 497}
 498
 499static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 500{
 501        struct vcpu_svm *svm = to_svm(vcpu);
 502
 503        if (mask == 0)
 504                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 505        else
 506                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 507
 508}
 509
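     /*
      * Advance the guest RIP past the instruction that caused the current
      * exit.  When the CPU supports next_rip saving (SVM_FEATURE_NRIP), the
      * VMCB already contains the address of the next instruction; otherwise
      * fall back to the instruction emulator in skip-only mode.
      */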
 510static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 511{
 512        struct vcpu_svm *svm = to_svm(vcpu);
 513
 514        if (svm->vmcb->control.next_rip != 0)
 515                svm->next_rip = svm->vmcb->control.next_rip;
 516
 517        if (!svm->next_rip) {
 518                if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
 519                                EMULATE_DONE)
 520                        printk(KERN_DEBUG "%s: NOP\n", __func__);
 521                return;
 522        }
 523        if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
 524                printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
 525                       __func__, kvm_rip_read(vcpu), svm->next_rip);
 526
 527        kvm_rip_write(vcpu, svm->next_rip);
 528        svm_set_interrupt_shadow(vcpu, 0);
 529}
 530
 531static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 532                                bool has_error_code, u32 error_code,
 533                                bool reinject)
 534{
 535        struct vcpu_svm *svm = to_svm(vcpu);
 536
 537        /*
 538         * If we are within a nested VM we'd better #VMEXIT and let the guest
 539         * handle the exception
 540         */
 541        if (!reinject &&
 542            nested_svm_check_exception(svm, nr, has_error_code, error_code))
 543                return;
 544
 545        if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 546                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 547
 548                /*
 549                 * For guest debugging where we have to reinject #BP if some
 550                 * INT3 is guest-owned:
 551                 * Emulate nRIP by moving RIP forward. Will fail if injection
 552                 * raises a fault that is not intercepted. Still better than
 553                 * failing in all cases.
 554                 */
 555                skip_emulated_instruction(&svm->vcpu);
 556                rip = kvm_rip_read(&svm->vcpu);
 557                svm->int3_rip = rip + svm->vmcb->save.cs.base;
 558                svm->int3_injected = rip - old_rip;
 559        }
 560
 561        svm->vmcb->control.event_inj = nr
 562                | SVM_EVTINJ_VALID
 563                | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 564                | SVM_EVTINJ_TYPE_EXEPT;
 565        svm->vmcb->control.event_inj_err = error_code;
 566}
 567
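     /*
      * Work around AMD erratum 383 on affected CPUs (flagged with
      * X86_BUG_AMD_TLB_MMATCH): set bit 47 of MSR_AMD64_DC_CFG, using the
      * _safe MSR accessors so this also works when KVM itself runs as a
      * guest, and record that the erratum applies to this system.
      */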
 568static void svm_init_erratum_383(void)
 569{
 570        u32 low, high;
 571        int err;
 572        u64 val;
 573
 574        if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 575                return;
 576
 577        /* Use _safe variants to not break nested virtualization */
 578        val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 579        if (err)
 580                return;
 581
 582        val |= (1ULL << 47);
 583
 584        low  = lower_32_bits(val);
 585        high = upper_32_bits(val);
 586
 587        native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 588
 589        erratum_383_found = true;
 590}
 591
 592static void svm_init_osvw(struct kvm_vcpu *vcpu)
 593{
 594        /*
 595         * Guests should see errata 400 and 415 as fixed (assuming that
 596         * HLT and IO instructions are intercepted).
 597         */
 598        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 599        vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 600
 601        /*
 602         * By increasing VCPU's osvw.length to 3 we are telling the guest that
 603         * all osvw.status bits inside that length, including bit 0 (which is
 604         * reserved for erratum 298), are valid. However, if host processor's
 605         * osvw_len is 0 then osvw_status[0] carries no information. We need to
 606         * be conservative here and therefore we tell the guest that erratum 298
 607         * is present (because we really don't know).
 608         */
 609        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 610                vcpu->arch.osvw.status |= 1;
 611}
 612
 613static int has_svm(void)
 614{
 615        const char *msg;
 616
 617        if (!cpu_has_svm(&msg)) {
 618                printk(KERN_INFO "has_svm: %s\n", msg);
 619                return 0;
 620        }
 621
 622        return 1;
 623}
 624
 625static void svm_hardware_disable(void)
 626{
 627        /* Make sure we clean up behind us */
 628        if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
 629                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 630
 631        cpu_svm_disable();
 632
 633        amd_pmu_disable_virt();
 634}
 635
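     /*
      * Per-CPU enable path: bail out if EFER.SVME is already set (SVM is in
      * use by someone else), otherwise set EFER.SVME, point MSR_VM_HSAVE_PA
      * at this CPU's host save area, reset the TSC ratio to 1.0 when TSC
      * scaling exists, and collect the OSVW (OS Visible Workaround)
      * length/status for later propagation to guests.
      */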
 636static int svm_hardware_enable(void)
 637{
 638
 639        struct svm_cpu_data *sd;
 640        uint64_t efer;
 641        struct desc_ptr gdt_descr;
 642        struct desc_struct *gdt;
 643        int me = raw_smp_processor_id();
 644
 645        rdmsrl(MSR_EFER, efer);
 646        if (efer & EFER_SVME)
 647                return -EBUSY;
 648
 649        if (!has_svm()) {
 650                pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 651                return -EINVAL;
 652        }
 653        sd = per_cpu(svm_data, me);
 654        if (!sd) {
 655                pr_err("%s: svm_data is NULL on %d\n", __func__, me);
 656                return -EINVAL;
 657        }
 658
 659        sd->asid_generation = 1;
 660        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 661        sd->next_asid = sd->max_asid + 1;
 662
 663        native_store_gdt(&gdt_descr);
 664        gdt = (struct desc_struct *)gdt_descr.address;
 665        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 666
 667        wrmsrl(MSR_EFER, efer | EFER_SVME);
 668
 669        wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 670
 671        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 672                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 673                __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
 674        }
 675
 676
 677        /*
 678         * Get OSVW bits.
 679         *
 680         * Note that it is possible to have a system with mixed processor
 681         * revisions and therefore different OSVW bits. If bits are not the same
 682         * on different processors then choose the worst case (i.e. if erratum
 683         * is present on one processor and not on another then assume that the
 684         * erratum is present everywhere).
 685         */
 686        if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 687                uint64_t len, status = 0;
 688                int err;
 689
 690                len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 691                if (!err)
 692                        status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 693                                                      &err);
 694
 695                if (err)
 696                        osvw_status = osvw_len = 0;
 697                else {
 698                        if (len < osvw_len)
 699                                osvw_len = len;
 700                        osvw_status |= status;
 701                        osvw_status &= (1ULL << osvw_len) - 1;
 702                }
 703        } else
 704                osvw_status = osvw_len = 0;
 705
 706        svm_init_erratum_383();
 707
 708        amd_pmu_enable_virt();
 709
 710        return 0;
 711}
 712
 713static void svm_cpu_uninit(int cpu)
 714{
 715        struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
 716
 717        if (!sd)
 718                return;
 719
 720        per_cpu(svm_data, raw_smp_processor_id()) = NULL;
 721        __free_page(sd->save_area);
 722        kfree(sd);
 723}
 724
 725static int svm_cpu_init(int cpu)
 726{
 727        struct svm_cpu_data *sd;
 728        int r;
 729
 730        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 731        if (!sd)
 732                return -ENOMEM;
 733        sd->cpu = cpu;
 734        sd->save_area = alloc_page(GFP_KERNEL);
 735        r = -ENOMEM;
 736        if (!sd->save_area)
 737                goto err_1;
 738
 739        per_cpu(svm_data, cpu) = sd;
 740
 741        return 0;
 742
 743err_1:
 744        kfree(sd);
 745        return r;
 746
 747}
 748
 749static bool valid_msr_intercept(u32 index)
 750{
 751        int i;
 752
 753        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 754                if (direct_access_msrs[i].index == index)
 755                        return true;
 756
 757        return false;
 758}
 759
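     /*
      * Each MSR owns two consecutive bits in the permission map: the even
      * bit intercepts reads, the odd bit intercepts writes.  A set bit means
      * "intercept", so passing read/write == 1 clears the corresponding bit
      * and lets the guest access the MSR directly.
      */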
 760static void set_msr_interception(u32 *msrpm, unsigned msr,
 761                                 int read, int write)
 762{
 763        u8 bit_read, bit_write;
 764        unsigned long tmp;
 765        u32 offset;
 766
 767        /*
  768         * If this warning triggers, extend the direct_access_msrs list at the
 769         * beginning of the file
 770         */
 771        WARN_ON(!valid_msr_intercept(msr));
 772
 773        offset    = svm_msrpm_offset(msr);
 774        bit_read  = 2 * (msr & 0x0f);
 775        bit_write = 2 * (msr & 0x0f) + 1;
 776        tmp       = msrpm[offset];
 777
 778        BUG_ON(offset == MSR_INVALID);
 779
 780        read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 781        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 782
 783        msrpm[offset] = tmp;
 784}
 785
 786static void svm_vcpu_init_msrpm(u32 *msrpm)
 787{
 788        int i;
 789
 790        memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 791
 792        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 793                if (!direct_access_msrs[i].always)
 794                        continue;
 795
 796                set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
 797        }
 798}
 799
 800static void add_msr_offset(u32 offset)
 801{
 802        int i;
 803
 804        for (i = 0; i < MSRPM_OFFSETS; ++i) {
 805
 806                /* Offset already in list? */
 807                if (msrpm_offsets[i] == offset)
 808                        return;
 809
 810                /* Slot used by another offset? */
 811                if (msrpm_offsets[i] != MSR_INVALID)
 812                        continue;
 813
 814                /* Add offset to list */
 815                msrpm_offsets[i] = offset;
 816
 817                return;
 818        }
 819
 820        /*
  821         * If this BUG triggers, the msrpm_offsets table has overflowed. Just
  822         * increase MSRPM_OFFSETS in that case.
 823         */
 824        BUG();
 825}
 826
 827static void init_msrpm_offsets(void)
 828{
 829        int i;
 830
 831        memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 832
 833        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 834                u32 offset;
 835
 836                offset = svm_msrpm_offset(direct_access_msrs[i].index);
 837                BUG_ON(offset == MSR_INVALID);
 838
 839                add_msr_offset(offset);
 840        }
 841}
 842
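     /*
      * LBR virtualization: setting lbr_ctl makes the CPU save and restore
      * the last-branch-record MSRs on VMRUN/#VMEXIT, so the four LBR MSRs
      * can be passed through to the guest; disabling reverts both.
      */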
 843static void svm_enable_lbrv(struct vcpu_svm *svm)
 844{
 845        u32 *msrpm = svm->msrpm;
 846
 847        svm->vmcb->control.lbr_ctl = 1;
 848        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 849        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 850        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 851        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 852}
 853
 854static void svm_disable_lbrv(struct vcpu_svm *svm)
 855{
 856        u32 *msrpm = svm->msrpm;
 857
 858        svm->vmcb->control.lbr_ctl = 0;
 859        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 860        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 861        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 862        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 863}
 864
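     /*
      * One-time module setup: allocate the I/O permission bitmap with every
      * port intercepted, precompute the MSRPM offsets for the direct-access
      * MSRs, enable the EFER bits the host supports, derive the maximum
      * guest TSC frequency when TSC scaling is available, initialize the
      * per-CPU data, and decide whether nested paging can be used.
      */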
 865static __init int svm_hardware_setup(void)
 866{
 867        int cpu;
 868        struct page *iopm_pages;
 869        void *iopm_va;
 870        int r;
 871
 872        iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
 873
 874        if (!iopm_pages)
 875                return -ENOMEM;
 876
 877        iopm_va = page_address(iopm_pages);
 878        memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
 879        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 880
 881        init_msrpm_offsets();
 882
 883        if (boot_cpu_has(X86_FEATURE_NX))
 884                kvm_enable_efer_bits(EFER_NX);
 885
 886        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
 887                kvm_enable_efer_bits(EFER_FFXSR);
 888
 889        if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 890                u64 max;
 891
 892                kvm_has_tsc_control = true;
 893
 894                /*
 895                 * Make sure the user can only configure tsc_khz values that
 896                 * fit into a signed integer.
  897         * A min value need not be calculated because it will always
 898                 * be 1 on all machines and a value of 0 is used to disable
 899                 * tsc-scaling for the vcpu.
 900                 */
 901                max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
 902
 903                kvm_max_guest_tsc_khz = max;
 904        }
 905
 906        if (nested) {
 907                printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
 908                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 909        }
 910
 911        for_each_possible_cpu(cpu) {
 912                r = svm_cpu_init(cpu);
 913                if (r)
 914                        goto err;
 915        }
 916
 917        if (!boot_cpu_has(X86_FEATURE_NPT))
 918                npt_enabled = false;
 919
 920        if (npt_enabled && !npt) {
 921                printk(KERN_INFO "kvm: Nested Paging disabled\n");
 922                npt_enabled = false;
 923        }
 924
 925        if (npt_enabled) {
 926                printk(KERN_INFO "kvm: Nested Paging enabled\n");
 927                kvm_enable_tdp();
 928        } else
 929                kvm_disable_tdp();
 930
 931        return 0;
 932
 933err:
 934        __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
 935        iopm_base = 0;
 936        return r;
 937}
 938
 939static __exit void svm_hardware_unsetup(void)
 940{
 941        int cpu;
 942
 943        for_each_possible_cpu(cpu)
 944                svm_cpu_uninit(cpu);
 945
 946        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
 947        iopm_base = 0;
 948}
 949
 950static void init_seg(struct vmcb_seg *seg)
 951{
 952        seg->selector = 0;
 953        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
 954                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
 955        seg->limit = 0xffff;
 956        seg->base = 0;
 957}
 958
 959static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 960{
 961        seg->selector = 0;
 962        seg->attrib = SVM_SELECTOR_P_MASK | type;
 963        seg->limit = 0xffff;
 964        seg->base = 0;
 965}
 966
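     /*
      * Scale a TSC value by a ratio in the MSR_AMD64_TSC_RATIO fixed-point
      * format: bits 39:32 hold the integer part, bits 31:0 the fraction
      * (TSC_RATIO_DEFAULT == 1ULL << 32 == 1.0).  The multiplication is
      * split up to avoid 64-bit overflow:
      *   (tsc * ratio) >> 32 = tsc * int_part
      *                       + (tsc >> 32) * frac
      *                       + (((tsc & 0xffffffff) * frac) >> 32)
      */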
 967static u64 __scale_tsc(u64 ratio, u64 tsc)
 968{
 969        u64 mult, frac, _tsc;
 970
 971        mult  = ratio >> 32;
 972        frac  = ratio & ((1ULL << 32) - 1);
 973
 974        _tsc  = tsc;
 975        _tsc *= mult;
 976        _tsc += (tsc >> 32) * frac;
 977        _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
 978
 979        return _tsc;
 980}
 981
 982static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 983{
 984        struct vcpu_svm *svm = to_svm(vcpu);
 985        u64 _tsc = tsc;
 986
 987        if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
 988                _tsc = __scale_tsc(svm->tsc_ratio, tsc);
 989
 990        return _tsc;
 991}
 992
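     /*
      * Program the per-vCPU TSC ratio for a requested guest frequency: the
      * ratio is user_tsc_khz / tsc_khz in the 8.32 fixed-point format, and a
      * result of 0 or one that touches the reserved bits is rejected.  If
      * the CPU has no TSC ratio MSR, fall back to catch-up mode when the
      * requested frequency is higher than the host's.
      */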
 993static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 994{
 995        struct vcpu_svm *svm = to_svm(vcpu);
 996        u64 ratio;
 997        u64 khz;
 998
 999        /* Guest TSC same frequency as host TSC? */
1000        if (!scale) {
1001                svm->tsc_ratio = TSC_RATIO_DEFAULT;
1002                return;
1003        }
1004
1005        /* TSC scaling supported? */
1006        if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1007                if (user_tsc_khz > tsc_khz) {
1008                        vcpu->arch.tsc_catchup = 1;
1009                        vcpu->arch.tsc_always_catchup = 1;
1010                } else
1011                        WARN(1, "user requested TSC rate below hardware speed\n");
1012                return;
1013        }
1014
1015        khz = user_tsc_khz;
1016
1017        /* TSC scaling required  - calculate ratio */
1018        ratio = khz << 32;
1019        do_div(ratio, tsc_khz);
1020
1021        if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
1022                WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
1023                                user_tsc_khz);
1024                return;
1025        }
1026        svm->tsc_ratio             = ratio;
1027}
1028
1029static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1030{
1031        struct vcpu_svm *svm = to_svm(vcpu);
1032
1033        return svm->vmcb->control.tsc_offset;
1034}
1035
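     /*
      * When a nested guest is running, vmcb->control.tsc_offset already
      * includes the extra offset L1 requested for L2 on top of KVM's own
      * offset.  Preserve that delta (g_tsc_offset) and store the new
      * host-level offset in hsave so the right value is used again once the
      * vcpu leaves guest mode.
      */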
1036static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1037{
1038        struct vcpu_svm *svm = to_svm(vcpu);
1039        u64 g_tsc_offset = 0;
1040
1041        if (is_guest_mode(vcpu)) {
1042                g_tsc_offset = svm->vmcb->control.tsc_offset -
1043                               svm->nested.hsave->control.tsc_offset;
1044                svm->nested.hsave->control.tsc_offset = offset;
1045        } else
1046                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1047                                           svm->vmcb->control.tsc_offset,
1048                                           offset);
1049
1050        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1051
1052        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1053}
1054
1055static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
1056{
1057        struct vcpu_svm *svm = to_svm(vcpu);
1058
1059        if (host) {
1060                if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
1061                        WARN_ON(adjustment < 0);
1062                adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
1063        }
1064
1065        svm->vmcb->control.tsc_offset += adjustment;
1066        if (is_guest_mode(vcpu))
1067                svm->nested.hsave->control.tsc_offset += adjustment;
1068        else
1069                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1070                                     svm->vmcb->control.tsc_offset - adjustment,
1071                                     svm->vmcb->control.tsc_offset);
1072
1073        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1074}
1075
1076static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1077{
1078        u64 tsc;
1079
1080        tsc = svm_scale_tsc(vcpu, native_read_tsc());
1081
1082        return target_tsc - tsc;
1083}
1084
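     /*
      * Build the initial VMCB: install the default CR/DR/exception and
      * instruction intercepts, point the control area at the I/O and MSR
      * permission maps, and load the architectural reset state (CS selector
      * 0xf000 with base 0xffff0000, RIP 0xfff0, 64K segment limits,
      * DR6 = 0xffff0ff0).  With nested paging enabled, the CR3 and #PF
      * intercepts are dropped because the guest manages its own page tables.
      */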
1085static void init_vmcb(struct vcpu_svm *svm)
1086{
1087        struct vmcb_control_area *control = &svm->vmcb->control;
1088        struct vmcb_save_area *save = &svm->vmcb->save;
1089
1090        svm->vcpu.fpu_active = 1;
1091        svm->vcpu.arch.hflags = 0;
1092
1093        set_cr_intercept(svm, INTERCEPT_CR0_READ);
1094        set_cr_intercept(svm, INTERCEPT_CR3_READ);
1095        set_cr_intercept(svm, INTERCEPT_CR4_READ);
1096        set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1097        set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1098        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1099        set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1100
1101        set_dr_intercepts(svm);
1102
1103        set_exception_intercept(svm, PF_VECTOR);
1104        set_exception_intercept(svm, UD_VECTOR);
1105        set_exception_intercept(svm, MC_VECTOR);
1106
1107        set_intercept(svm, INTERCEPT_INTR);
1108        set_intercept(svm, INTERCEPT_NMI);
1109        set_intercept(svm, INTERCEPT_SMI);
1110        set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1111        set_intercept(svm, INTERCEPT_RDPMC);
1112        set_intercept(svm, INTERCEPT_CPUID);
1113        set_intercept(svm, INTERCEPT_INVD);
1114        set_intercept(svm, INTERCEPT_HLT);
1115        set_intercept(svm, INTERCEPT_INVLPG);
1116        set_intercept(svm, INTERCEPT_INVLPGA);
1117        set_intercept(svm, INTERCEPT_IOIO_PROT);
1118        set_intercept(svm, INTERCEPT_MSR_PROT);
1119        set_intercept(svm, INTERCEPT_TASK_SWITCH);
1120        set_intercept(svm, INTERCEPT_SHUTDOWN);
1121        set_intercept(svm, INTERCEPT_VMRUN);
1122        set_intercept(svm, INTERCEPT_VMMCALL);
1123        set_intercept(svm, INTERCEPT_VMLOAD);
1124        set_intercept(svm, INTERCEPT_VMSAVE);
1125        set_intercept(svm, INTERCEPT_STGI);
1126        set_intercept(svm, INTERCEPT_CLGI);
1127        set_intercept(svm, INTERCEPT_SKINIT);
1128        set_intercept(svm, INTERCEPT_WBINVD);
1129        set_intercept(svm, INTERCEPT_MONITOR);
1130        set_intercept(svm, INTERCEPT_MWAIT);
1131        set_intercept(svm, INTERCEPT_XSETBV);
1132
1133        control->iopm_base_pa = iopm_base;
1134        control->msrpm_base_pa = __pa(svm->msrpm);
1135        control->int_ctl = V_INTR_MASKING_MASK;
1136
1137        init_seg(&save->es);
1138        init_seg(&save->ss);
1139        init_seg(&save->ds);
1140        init_seg(&save->fs);
1141        init_seg(&save->gs);
1142
1143        save->cs.selector = 0xf000;
1144        save->cs.base = 0xffff0000;
1145        /* Executable/Readable Code Segment */
1146        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1147                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1148        save->cs.limit = 0xffff;
1149
1150        save->gdtr.limit = 0xffff;
1151        save->idtr.limit = 0xffff;
1152
1153        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1154        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1155
1156        svm_set_efer(&svm->vcpu, 0);
1157        save->dr6 = 0xffff0ff0;
1158        kvm_set_rflags(&svm->vcpu, 2);
1159        save->rip = 0x0000fff0;
1160        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1161
1162        /*
1163         * This is the guest-visible cr0 value.
1164         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1165         */
1166        svm->vcpu.arch.cr0 = 0;
1167        (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1168
1169        save->cr4 = X86_CR4_PAE;
1170        /* rdx = ?? */
1171
1172        if (npt_enabled) {
1173                /* Setup VMCB for Nested Paging */
1174                control->nested_ctl = 1;
1175                clr_intercept(svm, INTERCEPT_INVLPG);
1176                clr_exception_intercept(svm, PF_VECTOR);
1177                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1178                clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1179                save->g_pat = 0x0007040600070406ULL;
1180                save->cr3 = 0;
1181                save->cr4 = 0;
1182        }
1183        svm->asid_generation = 0;
1184
1185        svm->nested.vmcb = 0;
1186        svm->vcpu.arch.hflags = 0;
1187
1188        if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1189                control->pause_filter_count = 3000;
1190                set_intercept(svm, INTERCEPT_PAUSE);
1191        }
1192
1193        mark_all_dirty(svm->vmcb);
1194
1195        enable_gif(svm);
1196}
1197
1198static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
1199{
1200        struct vcpu_svm *svm = to_svm(vcpu);
1201        u32 dummy;
1202        u32 eax = 1;
1203
1204        init_vmcb(svm);
1205
1206        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1207        kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1208}
1209
1210static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1211{
1212        struct vcpu_svm *svm;
1213        struct page *page;
1214        struct page *msrpm_pages;
1215        struct page *hsave_page;
1216        struct page *nested_msrpm_pages;
1217        int err;
1218
1219        svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1220        if (!svm) {
1221                err = -ENOMEM;
1222                goto out;
1223        }
1224
1225        svm->tsc_ratio = TSC_RATIO_DEFAULT;
1226
1227        err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1228        if (err)
1229                goto free_svm;
1230
1231        err = -ENOMEM;
1232        page = alloc_page(GFP_KERNEL);
1233        if (!page)
1234                goto uninit;
1235
1236        msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1237        if (!msrpm_pages)
1238                goto free_page1;
1239
1240        nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1241        if (!nested_msrpm_pages)
1242                goto free_page2;
1243
1244        hsave_page = alloc_page(GFP_KERNEL);
1245        if (!hsave_page)
1246                goto free_page3;
1247
1248        svm->nested.hsave = page_address(hsave_page);
1249
1250        svm->msrpm = page_address(msrpm_pages);
1251        svm_vcpu_init_msrpm(svm->msrpm);
1252
1253        svm->nested.msrpm = page_address(nested_msrpm_pages);
1254        svm_vcpu_init_msrpm(svm->nested.msrpm);
1255
1256        svm->vmcb = page_address(page);
1257        clear_page(svm->vmcb);
1258        svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1259        svm->asid_generation = 0;
1260        init_vmcb(svm);
1261
1262        svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1263                                   MSR_IA32_APICBASE_ENABLE;
1264        if (kvm_vcpu_is_bsp(&svm->vcpu))
1265                svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1266
1267        svm_init_osvw(&svm->vcpu);
1268
1269        return &svm->vcpu;
1270
1271free_page3:
1272        __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1273free_page2:
1274        __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1275free_page1:
1276        __free_page(page);
1277uninit:
1278        kvm_vcpu_uninit(&svm->vcpu);
1279free_svm:
1280        kmem_cache_free(kvm_vcpu_cache, svm);
1281out:
1282        return ERR_PTR(err);
1283}
1284
1285static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1286{
1287        struct vcpu_svm *svm = to_svm(vcpu);
1288
1289        __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1290        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1291        __free_page(virt_to_page(svm->nested.hsave));
1292        __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1293        kvm_vcpu_uninit(vcpu);
1294        kmem_cache_free(kvm_vcpu_cache, svm);
1295}
1296
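     /*
      * Called when this vCPU is scheduled onto a physical CPU.  Moving to a
      * different CPU invalidates the ASID and any cached VMCB state, so both
      * are marked for reload.  The host segment selectors and the MSRs in
      * host_save_user_msrs are saved here and restored in svm_vcpu_put(),
      * and the TSC ratio MSR is reprogrammed if this vCPU uses a
      * non-default ratio.
      */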
1297static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1298{
1299        struct vcpu_svm *svm = to_svm(vcpu);
1300        int i;
1301
1302        if (unlikely(cpu != vcpu->cpu)) {
1303                svm->asid_generation = 0;
1304                mark_all_dirty(svm->vmcb);
1305        }
1306
1307#ifdef CONFIG_X86_64
1308        rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1309#endif
1310        savesegment(fs, svm->host.fs);
1311        savesegment(gs, svm->host.gs);
1312        svm->host.ldt = kvm_read_ldt();
1313
1314        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1315                rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1316
1317        if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
1318            svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1319                __this_cpu_write(current_tsc_ratio, svm->tsc_ratio);
1320                wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
1321        }
1322}
1323
1324static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1325{
1326        struct vcpu_svm *svm = to_svm(vcpu);
1327        int i;
1328
1329        ++vcpu->stat.host_state_reload;
1330        kvm_load_ldt(svm->host.ldt);
1331#ifdef CONFIG_X86_64
1332        loadsegment(fs, svm->host.fs);
1333        wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1334        load_gs_index(svm->host.gs);
1335#else
1336#ifdef CONFIG_X86_32_LAZY_GS
1337        loadsegment(gs, svm->host.gs);
1338#endif
1339#endif
1340        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1341                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1342}
1343
1344static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1345{
1346        return to_svm(vcpu)->vmcb->save.rflags;
1347}
1348
1349static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1350{
1351       /*
 1352        * Any change of EFLAGS.VM is accompanied by a reload of SS
1353        * (caused by either a task switch or an inter-privilege IRET),
1354        * so we do not need to update the CPL here.
1355        */
1356        to_svm(vcpu)->vmcb->save.rflags = rflags;
1357}
1358
1359static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1360{
1361        switch (reg) {
1362        case VCPU_EXREG_PDPTR:
1363                BUG_ON(!npt_enabled);
1364                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1365                break;
1366        default:
1367                BUG();
1368        }
1369}
1370
1371static void svm_set_vintr(struct vcpu_svm *svm)
1372{
1373        set_intercept(svm, INTERCEPT_VINTR);
1374}
1375
1376static void svm_clear_vintr(struct vcpu_svm *svm)
1377{
1378        clr_intercept(svm, INTERCEPT_VINTR);
1379}
1380
1381static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1382{
1383        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1384
1385        switch (seg) {
1386        case VCPU_SREG_CS: return &save->cs;
1387        case VCPU_SREG_DS: return &save->ds;
1388        case VCPU_SREG_ES: return &save->es;
1389        case VCPU_SREG_FS: return &save->fs;
1390        case VCPU_SREG_GS: return &save->gs;
1391        case VCPU_SREG_SS: return &save->ss;
1392        case VCPU_SREG_TR: return &save->tr;
1393        case VCPU_SREG_LDTR: return &save->ldtr;
1394        }
1395        BUG();
1396        return NULL;
1397}
1398
1399static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1400{
1401        struct vmcb_seg *s = svm_seg(vcpu, seg);
1402
1403        return s->base;
1404}
1405
1406static void svm_get_segment(struct kvm_vcpu *vcpu,
1407                            struct kvm_segment *var, int seg)
1408{
1409        struct vmcb_seg *s = svm_seg(vcpu, seg);
1410
1411        var->base = s->base;
1412        var->limit = s->limit;
1413        var->selector = s->selector;
1414        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1415        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1416        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1417        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1418        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1419        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1420        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1421
1422        /*
1423         * AMD CPUs circa 2014 track the G bit for all segments except CS.
1424         * However, the SVM spec states that the G bit is not observed by the
1425         * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1426         * So let's synthesize a legal G bit for all segments, this helps
1427         * running KVM nested. It also helps cross-vendor migration, because
1428         * Intel's vmentry has a check on the 'G' bit.
1429         */
1430        var->g = s->limit > 0xfffff;
1431
1432        /*
1433         * AMD's VMCB does not have an explicit unusable field, so emulate it
 1434         * for cross-vendor migration purposes as !present || (type == 0)
1435         */
1436        var->unusable = !var->present || (var->type == 0);
1437
1438        switch (seg) {
1439        case VCPU_SREG_TR:
1440                /*
1441                 * Work around a bug where the busy flag in the tr selector
1442                 * isn't exposed
1443                 */
1444                var->type |= 0x2;
1445                break;
1446        case VCPU_SREG_DS:
1447        case VCPU_SREG_ES:
1448        case VCPU_SREG_FS:
1449        case VCPU_SREG_GS:
1450                /*
 1451                 * The accessed bit must always be set in the segment
 1452                 * descriptor cache: even if it is cleared in the in-memory
 1453                 * descriptor, the cached copy keeps it set. Since Intel
 1454                 * checks this bit on VM entry, set it here to support
 1455                 * cross-vendor migration.
1456                 */
1457                if (!var->unusable)
1458                        var->type |= 0x1;
1459                break;
1460        case VCPU_SREG_SS:
1461                /*
1462                 * On AMD CPUs sometimes the DB bit in the segment
1463                 * descriptor is left as 1, although the whole segment has
1464                 * been made unusable. Clear it here to pass an Intel VMX
1465                 * entry check when cross vendor migrating.
1466                 */
1467                if (var->unusable)
1468                        var->db = 0;
1469                var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1470                break;
1471        }
1472}
1473
1474static int svm_get_cpl(struct kvm_vcpu *vcpu)
1475{
1476        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1477
1478        return save->cpl;
1479}
1480
1481static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1482{
1483        struct vcpu_svm *svm = to_svm(vcpu);
1484
1485        dt->size = svm->vmcb->save.idtr.limit;
1486        dt->address = svm->vmcb->save.idtr.base;
1487}
1488
1489static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1490{
1491        struct vcpu_svm *svm = to_svm(vcpu);
1492
1493        svm->vmcb->save.idtr.limit = dt->size;
 1494        svm->vmcb->save.idtr.base = dt->address;
1495        mark_dirty(svm->vmcb, VMCB_DT);
1496}
1497
1498static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1499{
1500        struct vcpu_svm *svm = to_svm(vcpu);
1501
1502        dt->size = svm->vmcb->save.gdtr.limit;
1503        dt->address = svm->vmcb->save.gdtr.base;
1504}
1505
1506static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1507{
1508        struct vcpu_svm *svm = to_svm(vcpu);
1509
1510        svm->vmcb->save.gdtr.limit = dt->size;
 1511        svm->vmcb->save.gdtr.base = dt->address;
1512        mark_dirty(svm->vmcb, VMCB_DT);
1513}
1514
1515static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1516{
1517}
1518
1519static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1520{
1521}
1522
1523static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1524{
1525}
1526
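     /*
      * Decide whether CR0 accesses still need to be intercepted.  The
      * hardware CR0 may differ from the guest-visible value in the bits
      * covered by SVM_CR0_SELECTIVE_MASK (e.g. while the FPU is lazily
      * deactivated); only when both values match and the FPU is active can
      * the CR0 read/write intercepts be dropped.
      */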
1527static void update_cr0_intercept(struct vcpu_svm *svm)
1528{
1529        ulong gcr0 = svm->vcpu.arch.cr0;
1530        u64 *hcr0 = &svm->vmcb->save.cr0;
1531
1532        if (!svm->vcpu.fpu_active)
1533                *hcr0 |= SVM_CR0_SELECTIVE_MASK;
1534        else
1535                *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1536                        | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1537
1538        mark_dirty(svm->vmcb, VMCB_CR);
1539
1540        if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1541                clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1542                clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1543        } else {
1544                set_cr_intercept(svm, INTERCEPT_CR0_READ);
1545                set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1546        }
1547}
1548
1549static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1550{
1551        struct vcpu_svm *svm = to_svm(vcpu);
1552
1553#ifdef CONFIG_X86_64
1554        if (vcpu->arch.efer & EFER_LME) {
1555                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1556                        vcpu->arch.efer |= EFER_LMA;
1557                        svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1558                }
1559
1560                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1561                        vcpu->arch.efer &= ~EFER_LMA;
1562                        svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1563                }
1564        }
1565#endif
1566        vcpu->arch.cr0 = cr0;
1567
1568        if (!npt_enabled)
1569                cr0 |= X86_CR0_PG | X86_CR0_WP;
1570
1571        if (!vcpu->fpu_active)
1572                cr0 |= X86_CR0_TS;
1573        /*
 1574         * Re-enable caching here because the QEMU BIOS does not do it;
 1575         * leaving CD/NW set would keep the guest running with caching
 1576         * disabled and make reboot noticeably slow.
1577         */
1578        cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1579        svm->vmcb->save.cr0 = cr0;
1580        mark_dirty(svm->vmcb, VMCB_CR);
1581        update_cr0_intercept(svm);
1582}
1583
1584static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1585{
1586        unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1587        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1588
1589        if (cr4 & X86_CR4_VMXE)
1590                return 1;
1591
1592        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1593                svm_flush_tlb(vcpu);
1594
1595        vcpu->arch.cr4 = cr4;
1596        if (!npt_enabled)
1597                cr4 |= X86_CR4_PAE;
1598        cr4 |= host_cr4_mce;
1599        to_svm(vcpu)->vmcb->save.cr4 = cr4;
1600        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1601        return 0;
1602}
1603
1604static void svm_set_segment(struct kvm_vcpu *vcpu,
1605                            struct kvm_segment *var, int seg)
1606{
1607        struct vcpu_svm *svm = to_svm(vcpu);
1608        struct vmcb_seg *s = svm_seg(vcpu, seg);
1609
1610        s->base = var->base;
1611        s->limit = var->limit;
1612        s->selector = var->selector;
1613        if (var->unusable)
1614                s->attrib = 0;
1615        else {
1616                s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1617                s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1618                s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1619                s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
1620                s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1621                s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1622                s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1623                s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1624        }
1625
1626        /*
1627         * This is always accurate, except if SYSRET returned to a segment
1628         * with SS.DPL != 3.  Intel does not have this quirk, and always
1629         * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1630         * would entail passing the CPL to userspace and back.
1631         */
1632        if (seg == VCPU_SREG_SS)
1633                svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1634
1635        mark_dirty(svm->vmcb, VMCB_SEG);
1636}
1637
1638static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
1639{
1640        struct vcpu_svm *svm = to_svm(vcpu);
1641
1642        clr_exception_intercept(svm, DB_VECTOR);
1643        clr_exception_intercept(svm, BP_VECTOR);
1644
1645        if (svm->nmi_singlestep)
1646                set_exception_intercept(svm, DB_VECTOR);
1647
1648        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1649                if (vcpu->guest_debug &
1650                    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1651                        set_exception_intercept(svm, DB_VECTOR);
1652                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1653                        set_exception_intercept(svm, BP_VECTOR);
1654        } else
1655                vcpu->guest_debug = 0;
1656}
1657
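     /*
      * Allocate a fresh ASID from this CPU's pool.  ASID 0 is used by the
      * host, so guest ASIDs start at 1; when the pool is exhausted, bump the
      * generation counter and request a flush of all ASIDs before handing
      * them out again.
      */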
1658static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1659{
1660        if (sd->next_asid > sd->max_asid) {
1661                ++sd->asid_generation;
1662                sd->next_asid = 1;
1663                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1664        }
1665
1666        svm->asid_generation = sd->asid_generation;
1667        svm->vmcb->control.asid = sd->next_asid++;
1668
1669        mark_dirty(svm->vmcb, VMCB_ASID);
1670}
1671
1672static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
1673{
1674        return to_svm(vcpu)->vmcb->save.dr6;
1675}
1676
1677static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1678{
1679        struct vcpu_svm *svm = to_svm(vcpu);
1680
1681        svm->vmcb->save.dr6 = value;
1682        mark_dirty(svm->vmcb, VMCB_DR);
1683}
1684
1685static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1686{
1687        struct vcpu_svm *svm = to_svm(vcpu);
1688
1689        get_debugreg(vcpu->arch.db[0], 0);
1690        get_debugreg(vcpu->arch.db[1], 1);
1691        get_debugreg(vcpu->arch.db[2], 2);
1692        get_debugreg(vcpu->arch.db[3], 3);
1693        vcpu->arch.dr6 = svm_get_dr6(vcpu);
1694        vcpu->arch.dr7 = svm->vmcb->save.dr7;
1695
1696        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1697        set_dr_intercepts(svm);
1698}
1699
1700static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1701{
1702        struct vcpu_svm *svm = to_svm(vcpu);
1703
1704        svm->vmcb->save.dr7 = value;
1705        mark_dirty(svm->vmcb, VMCB_DR);
1706}
1707
1708static int pf_interception(struct vcpu_svm *svm)
1709{
1710        u64 fault_address = svm->vmcb->control.exit_info_2;
1711        u32 error_code;
1712        int r = 1;
1713
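        /*
         * apf_reason is latched from the guest's async-PF shared area when a
         * #PF exit is taken (in svm_vcpu_run); zero means an ordinary page
         * fault, the other values are paravirtual async-PF notifications.
         */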
1714        switch (svm->apf_reason) {
1715        default:
1716                error_code = svm->vmcb->control.exit_info_1;
1717
1718                trace_kvm_page_fault(fault_address, error_code);
1719                if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1720                        kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1721                r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1722                        svm->vmcb->control.insn_bytes,
1723                        svm->vmcb->control.insn_len);
1724                break;
1725        case KVM_PV_REASON_PAGE_NOT_PRESENT:
1726                svm->apf_reason = 0;
1727                local_irq_disable();
1728                kvm_async_pf_task_wait(fault_address);
1729                local_irq_enable();
1730                break;
1731        case KVM_PV_REASON_PAGE_READY:
1732                svm->apf_reason = 0;
1733                local_irq_disable();
1734                kvm_async_pf_task_wake(fault_address);
1735                local_irq_enable();
1736                break;
1737        }
1738        return r;
1739}
1740
1741static int db_interception(struct vcpu_svm *svm)
1742{
1743        struct kvm_run *kvm_run = svm->vcpu.run;
1744
1745        if (!(svm->vcpu.guest_debug &
1746              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1747                !svm->nmi_singlestep) {
1748                kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1749                return 1;
1750        }
1751
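        /*
         * nmi_singlestep is set when KVM single-steps the guest (by setting
         * TF/RF) to open an NMI window; the resulting #DB lands here, so undo
         * those flags unless userspace itself asked for single-stepping.
         */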
1752        if (svm->nmi_singlestep) {
1753                svm->nmi_singlestep = false;
1754                if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1755                        svm->vmcb->save.rflags &=
1756                                ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1757                update_db_bp_intercept(&svm->vcpu);
1758        }
1759
1760        if (svm->vcpu.guest_debug &
1761            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1762                kvm_run->exit_reason = KVM_EXIT_DEBUG;
1763                kvm_run->debug.arch.pc =
1764                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1765                kvm_run->debug.arch.exception = DB_VECTOR;
1766                return 0;
1767        }
1768
1769        return 1;
1770}
1771
1772static int bp_interception(struct vcpu_svm *svm)
1773{
1774        struct kvm_run *kvm_run = svm->vcpu.run;
1775
1776        kvm_run->exit_reason = KVM_EXIT_DEBUG;
1777        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1778        kvm_run->debug.arch.exception = BP_VECTOR;
1779        return 0;
1780}
1781
1782static int ud_interception(struct vcpu_svm *svm)
1783{
1784        int er;
1785
1786        er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1787        if (er != EMULATE_DONE)
1788                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1789        return 1;
1790}
1791
1792static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1793{
1794        struct vcpu_svm *svm = to_svm(vcpu);
1795
1796        clr_exception_intercept(svm, NM_VECTOR);
1797
1798        svm->vcpu.fpu_active = 1;
1799        update_cr0_intercept(svm);
1800}
1801
1802static int nm_interception(struct vcpu_svm *svm)
1803{
1804        svm_fpu_activate(&svm->vcpu);
1805        return 1;
1806}
1807
1808static bool is_erratum_383(void)
1809{
1810        int err, i;
1811        u64 value;
1812
1813        if (!erratum_383_found)
1814                return false;
1815
1816        value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1817        if (err)
1818                return false;
1819
1820        /* Bit 62 may or may not be set for this mce */
1821        value &= ~(1ULL << 62);
1822
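        /*
         * This is the MC0_STATUS signature associated with erratum 383 (a TLB
         * multi-match event, hence the flush below); any other value means a
         * genuine machine check that must not be swallowed here.
         */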
1823        if (value != 0xb600000000010015ULL)
1824                return false;
1825
1826        /* Clear MCi_STATUS registers */
1827        for (i = 0; i < 6; ++i)
1828                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1829
1830        value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1831        if (!err) {
1832                u32 low, high;
1833
1834                value &= ~(1ULL << 2);
1835                low    = lower_32_bits(value);
1836                high   = upper_32_bits(value);
1837
1838                native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1839        }
1840
1841        /* Flush tlb to evict multi-match entries */
1842        __flush_tlb_all();
1843
1844        return true;
1845}
1846
1847static void svm_handle_mce(struct vcpu_svm *svm)
1848{
1849        if (is_erratum_383()) {
1850                /*
1851                 * Erratum 383 triggered. Guest state is corrupt so kill the
1852                 * guest.
1853                 */
1854                pr_err("KVM: Guest triggered AMD Erratum 383\n");
1855
1856                kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1857
1858                return;
1859        }
1860
1861        /*
1862         * On an #MC intercept the MCE handler is not called automatically in
1863         * the host. So do it by hand here.
1864         */
1865        asm volatile (
1866                "int $0x12\n");
1867        /* not sure if we ever come back to this point */
1868
1869        return;
1870}
1871
1872static int mc_interception(struct vcpu_svm *svm)
1873{
1874        return 1;
1875}
1876
1877static int shutdown_interception(struct vcpu_svm *svm)
1878{
1879        struct kvm_run *kvm_run = svm->vcpu.run;
1880
1881        /*
1882         * VMCB is undefined after a SHUTDOWN intercept
1883         * so reinitialize it.
1884         */
1885        clear_page(svm->vmcb);
1886        init_vmcb(svm);
1887
1888        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1889        return 0;
1890}
1891
1892static int io_interception(struct vcpu_svm *svm)
1893{
1894        struct kvm_vcpu *vcpu = &svm->vcpu;
1895        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1896        int size, in, string;
1897        unsigned port;
1898
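        /*
         * exit_info_1 carries the decoded IN/OUT: direction and string bits,
         * the operand size and the port number in the upper 16 bits, while
         * exit_info_2 holds the rIP of the following instruction and is used
         * as next_rip below.
         */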
1899        ++svm->vcpu.stat.io_exits;
1900        string = (io_info & SVM_IOIO_STR_MASK) != 0;
1901        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1902        if (string || in)
1903                return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1904
1905        port = io_info >> 16;
1906        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1907        svm->next_rip = svm->vmcb->control.exit_info_2;
1908        skip_emulated_instruction(&svm->vcpu);
1909
1910        return kvm_fast_pio_out(vcpu, size, port);
1911}
1912
1913static int nmi_interception(struct vcpu_svm *svm)
1914{
1915        return 1;
1916}
1917
1918static int intr_interception(struct vcpu_svm *svm)
1919{
1920        ++svm->vcpu.stat.irq_exits;
1921        return 1;
1922}
1923
1924static int nop_on_interception(struct vcpu_svm *svm)
1925{
1926        return 1;
1927}
1928
1929static int halt_interception(struct vcpu_svm *svm)
1930{
1931        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1932        skip_emulated_instruction(&svm->vcpu);
1933        return kvm_emulate_halt(&svm->vcpu);
1934}
1935
1936static int vmmcall_interception(struct vcpu_svm *svm)
1937{
1938        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1939        skip_emulated_instruction(&svm->vcpu);
1940        kvm_emulate_hypercall(&svm->vcpu);
1941        return 1;
1942}
1943
1944static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1945{
1946        struct vcpu_svm *svm = to_svm(vcpu);
1947
1948        return svm->nested.nested_cr3;
1949}
1950
1951static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
1952{
1953        struct vcpu_svm *svm = to_svm(vcpu);
1954        u64 cr3 = svm->nested.nested_cr3;
1955        u64 pdpte;
1956        int ret;
1957
1958        ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
1959                                  offset_in_page(cr3) + index * 8, 8);
1960        if (ret)
1961                return 0;
1962        return pdpte;
1963}
1964
1965static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1966                                   unsigned long root)
1967{
1968        struct vcpu_svm *svm = to_svm(vcpu);
1969
1970        svm->vmcb->control.nested_cr3 = root;
1971        mark_dirty(svm->vmcb, VMCB_NPT);
1972        svm_flush_tlb(vcpu);
1973}
1974
1975static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1976                                       struct x86_exception *fault)
1977{
1978        struct vcpu_svm *svm = to_svm(vcpu);
1979
1980        if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
1981                /*
1982                 * TODO: track the cause of the nested page fault, and
1983                 * correctly fill in the high bits of exit_info_1.
1984                 */
1985                svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1986                svm->vmcb->control.exit_code_hi = 0;
1987                svm->vmcb->control.exit_info_1 = (1ULL << 32);
1988                svm->vmcb->control.exit_info_2 = fault->address;
1989        }
1990
1991        svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
1992        svm->vmcb->control.exit_info_1 |= fault->error_code;
1993
1994        /*
1995         * The present bit is always zero for page structure faults on real
1996         * hardware.
1997         */
1998        if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
1999                svm->vmcb->control.exit_info_1 &= ~1;
2000
2001        nested_svm_vmexit(svm);
2002}
2003
2004static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2005{
2006        WARN_ON(mmu_is_nested(vcpu));
2007        kvm_init_shadow_mmu(vcpu);
2008        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
2009        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
2010        vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
2011        vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
2012        vcpu->arch.mmu.shadow_root_level = get_npt_level();
2013        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
2014}
2015
2016static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2017{
2018        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
2019}
2020
2021static int nested_svm_check_permissions(struct vcpu_svm *svm)
2022{
2023        if (!(svm->vcpu.arch.efer & EFER_SVME)
2024            || !is_paging(&svm->vcpu)) {
2025                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2026                return 1;
2027        }
2028
2029        if (svm->vmcb->save.cpl) {
2030                kvm_inject_gp(&svm->vcpu, 0);
2031                return 1;
2032        }
2033
2034        return 0;
2035}
2036
2037static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2038                                      bool has_error_code, u32 error_code)
2039{
2040        int vmexit;
2041
2042        if (!is_guest_mode(&svm->vcpu))
2043                return 0;
2044
2045        svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
2046        svm->vmcb->control.exit_code_hi = 0;
2047        svm->vmcb->control.exit_info_1 = error_code;
2048        svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
2049
2050        vmexit = nested_svm_intercept(svm);
2051        if (vmexit == NESTED_EXIT_DONE)
2052                svm->nested.exit_required = true;
2053
2054        return vmexit;
2055}
2056
2057/* This function returns true if it is safe to enable the irq window */
2058static inline bool nested_svm_intr(struct vcpu_svm *svm)
2059{
2060        if (!is_guest_mode(&svm->vcpu))
2061                return true;
2062
2063        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2064                return true;
2065
2066        if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
2067                return false;
2068
2069        /*
2070         * if vmexit was already requested (by intercepted exception
2071         * for instance) do not overwrite it with "external interrupt"
2072         * vmexit.
2073         */
2074        if (svm->nested.exit_required)
2075                return false;
2076
2077        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2078        svm->vmcb->control.exit_info_1 = 0;
2079        svm->vmcb->control.exit_info_2 = 0;
2080
2081        if (svm->nested.intercept & 1ULL) {
2082                /*
2083                 * The #vmexit can't be emulated here directly because this
2084                 * code path runs with irqs and preemption disabled. A
2085                 * #vmexit emulation might sleep. Only signal request for
2086                 * the #vmexit here.
2087                 */
2088                svm->nested.exit_required = true;
2089                trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2090                return false;
2091        }
2092
2093        return true;
2094}
2095
2096/* This function returns true if it is safe to enable the nmi window */
2097static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2098{
2099        if (!is_guest_mode(&svm->vcpu))
2100                return true;
2101
2102        if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2103                return true;
2104
2105        svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2106        svm->nested.exit_required = true;
2107
2108        return false;
2109}
2110
2111static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2112{
2113        struct page *page;
2114
2115        might_sleep();
2116
2117        page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
2118        if (is_error_page(page))
2119                goto error;
2120
2121        *_page = page;
2122
2123        return kmap(page);
2124
2125error:
2126        kvm_inject_gp(&svm->vcpu, 0);
2127
2128        return NULL;
2129}
2130
2131static void nested_svm_unmap(struct page *page)
2132{
2133        kunmap(page);
2134        kvm_release_page_dirty(page);
2135}
2136
2137static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2138{
2139        unsigned port, size, iopm_len;
2140        u16 val, mask;
2141        u8 start_bit;
2142        u64 gpa;
2143
2144        if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2145                return NESTED_EXIT_HOST;
2146
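        /*
         * The IOPM has one bit per port; an access of 'size' bytes covers
         * 'size' consecutive bits and may straddle a byte boundary, so up to
         * two bytes are read.  E.g. a dword access to port 0x3fe starts at
         * bit 6 of byte 0x7f and spills over into byte 0x80.
         */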
2147        port = svm->vmcb->control.exit_info_1 >> 16;
2148        size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
2149                SVM_IOIO_SIZE_SHIFT;
2150        gpa  = svm->nested.vmcb_iopm + (port / 8);
2151        start_bit = port % 8;
2152        iopm_len = (start_bit + size > 8) ? 2 : 1;
2153        mask = (0xf >> (4 - size)) << start_bit;
2154        val = 0;
2155
2156        if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
2157                return NESTED_EXIT_DONE;
2158
2159        return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2160}
2161
2162static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2163{
2164        u32 offset, msr, value;
2165        int write, mask;
2166
2167        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2168                return NESTED_EXIT_HOST;
2169
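        /*
         * The MSRPM holds two bits per MSR (read, then write), i.e. sixteen
         * MSRs per u32.  E.g. for ECX = 0xc0000082 the read bit is bit 4 and
         * the write bit is bit 5 of the u32 that svm_msrpm_offset() selects.
         */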
2170        msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2171        offset = svm_msrpm_offset(msr);
2172        write  = svm->vmcb->control.exit_info_1 & 1;
2173        mask   = 1 << ((2 * (msr & 0xf)) + write);
2174
2175        if (offset == MSR_INVALID)
2176                return NESTED_EXIT_DONE;
2177
2178        /* Offset is in 32 bit units but we need it in 8 bit (byte) units */
2179        offset *= 4;
2180
2181        if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
2182                return NESTED_EXIT_DONE;
2183
2184        return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2185}
2186
2187static int nested_svm_exit_special(struct vcpu_svm *svm)
2188{
2189        u32 exit_code = svm->vmcb->control.exit_code;
2190
2191        switch (exit_code) {
2192        case SVM_EXIT_INTR:
2193        case SVM_EXIT_NMI:
2194        case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2195                return NESTED_EXIT_HOST;
2196        case SVM_EXIT_NPF:
2197                /* For now we always handle NPFs on the host level when NPT is in use */
2198                if (npt_enabled)
2199                        return NESTED_EXIT_HOST;
2200                break;
2201        case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2202                /* When we're shadowing, trap PFs, but not async PF */
2203                if (!npt_enabled && svm->apf_reason == 0)
2204                        return NESTED_EXIT_HOST;
2205                break;
2206        case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2207                nm_interception(svm);
2208                break;
2209        default:
2210                break;
2211        }
2212
2213        return NESTED_EXIT_CONTINUE;
2214}
2215
2216/*
2217 * If this function returns NESTED_EXIT_DONE, the #vmexit goes to the L1 guest
2218 */
2219static int nested_svm_intercept(struct vcpu_svm *svm)
2220{
2221        u32 exit_code = svm->vmcb->control.exit_code;
2222        int vmexit = NESTED_EXIT_HOST;
2223
2224        switch (exit_code) {
2225        case SVM_EXIT_MSR:
2226                vmexit = nested_svm_exit_handled_msr(svm);
2227                break;
2228        case SVM_EXIT_IOIO:
2229                vmexit = nested_svm_intercept_ioio(svm);
2230                break;
2231        case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2232                u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2233                if (svm->nested.intercept_cr & bit)
2234                        vmexit = NESTED_EXIT_DONE;
2235                break;
2236        }
2237        case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2238                u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2239                if (svm->nested.intercept_dr & bit)
2240                        vmexit = NESTED_EXIT_DONE;
2241                break;
2242        }
2243        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2244                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2245                if (svm->nested.intercept_exceptions & excp_bits)
2246                        vmexit = NESTED_EXIT_DONE;
2247                /* an async page fault always causes a vmexit */
2248                else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2249                         svm->apf_reason != 0)
2250                        vmexit = NESTED_EXIT_DONE;
2251                break;
2252        }
2253        case SVM_EXIT_ERR: {
2254                vmexit = NESTED_EXIT_DONE;
2255                break;
2256        }
2257        default: {
2258                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2259                if (svm->nested.intercept & exit_bits)
2260                        vmexit = NESTED_EXIT_DONE;
2261        }
2262        }
2263
2264        return vmexit;
2265}
2266
2267static int nested_svm_exit_handled(struct vcpu_svm *svm)
2268{
2269        int vmexit;
2270
2271        vmexit = nested_svm_intercept(svm);
2272
2273        if (vmexit == NESTED_EXIT_DONE)
2274                nested_svm_vmexit(svm);
2275
2276        return vmexit;
2277}
2278
2279static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2280{
2281        struct vmcb_control_area *dst  = &dst_vmcb->control;
2282        struct vmcb_control_area *from = &from_vmcb->control;
2283
2284        dst->intercept_cr         = from->intercept_cr;
2285        dst->intercept_dr         = from->intercept_dr;
2286        dst->intercept_exceptions = from->intercept_exceptions;
2287        dst->intercept            = from->intercept;
2288        dst->iopm_base_pa         = from->iopm_base_pa;
2289        dst->msrpm_base_pa        = from->msrpm_base_pa;
2290        dst->tsc_offset           = from->tsc_offset;
2291        dst->asid                 = from->asid;
2292        dst->tlb_ctl              = from->tlb_ctl;
2293        dst->int_ctl              = from->int_ctl;
2294        dst->int_vector           = from->int_vector;
2295        dst->int_state            = from->int_state;
2296        dst->exit_code            = from->exit_code;
2297        dst->exit_code_hi         = from->exit_code_hi;
2298        dst->exit_info_1          = from->exit_info_1;
2299        dst->exit_info_2          = from->exit_info_2;
2300        dst->exit_int_info        = from->exit_int_info;
2301        dst->exit_int_info_err    = from->exit_int_info_err;
2302        dst->nested_ctl           = from->nested_ctl;
2303        dst->event_inj            = from->event_inj;
2304        dst->event_inj_err        = from->event_inj_err;
2305        dst->nested_cr3           = from->nested_cr3;
2306        dst->lbr_ctl              = from->lbr_ctl;
2307}
2308
2309static int nested_svm_vmexit(struct vcpu_svm *svm)
2310{
2311        struct vmcb *nested_vmcb;
2312        struct vmcb *hsave = svm->nested.hsave;
2313        struct vmcb *vmcb = svm->vmcb;
2314        struct page *page;
2315
2316        trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2317                                       vmcb->control.exit_info_1,
2318                                       vmcb->control.exit_info_2,
2319                                       vmcb->control.exit_int_info,
2320                                       vmcb->control.exit_int_info_err,
2321                                       KVM_ISA_SVM);
2322
2323        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2324        if (!nested_vmcb)
2325                return 1;
2326
2327        /* Exit Guest-Mode */
2328        leave_guest_mode(&svm->vcpu);
2329        svm->nested.vmcb = 0;
2330
2331        /* Give the current vmcb to the guest */
2332        disable_gif(svm);
2333
2334        nested_vmcb->save.es     = vmcb->save.es;
2335        nested_vmcb->save.cs     = vmcb->save.cs;
2336        nested_vmcb->save.ss     = vmcb->save.ss;
2337        nested_vmcb->save.ds     = vmcb->save.ds;
2338        nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2339        nested_vmcb->save.idtr   = vmcb->save.idtr;
2340        nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2341        nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2342        nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2343        nested_vmcb->save.cr2    = vmcb->save.cr2;
2344        nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2345        nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2346        nested_vmcb->save.rip    = vmcb->save.rip;
2347        nested_vmcb->save.rsp    = vmcb->save.rsp;
2348        nested_vmcb->save.rax    = vmcb->save.rax;
2349        nested_vmcb->save.dr7    = vmcb->save.dr7;
2350        nested_vmcb->save.dr6    = vmcb->save.dr6;
2351        nested_vmcb->save.cpl    = vmcb->save.cpl;
2352
2353        nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2354        nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2355        nested_vmcb->control.int_state         = vmcb->control.int_state;
2356        nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2357        nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2358        nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2359        nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2360        nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2361        nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2362        nested_vmcb->control.next_rip          = vmcb->control.next_rip;
2363
2364        /*
2365         * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2366         * to make sure that we do not lose injected events. So check event_inj
2367         * here and copy it to exit_int_info if it is valid.
2368         * Exit_int_info and event_inj can't both be valid because the case
2369         * below only happens on a VMRUN instruction intercept which has
2370         * no valid exit_int_info set.
2371         */
2372        if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2373                struct vmcb_control_area *nc = &nested_vmcb->control;
2374
2375                nc->exit_int_info     = vmcb->control.event_inj;
2376                nc->exit_int_info_err = vmcb->control.event_inj_err;
2377        }
2378
2379        nested_vmcb->control.tlb_ctl           = 0;
2380        nested_vmcb->control.event_inj         = 0;
2381        nested_vmcb->control.event_inj_err     = 0;
2382
2383        /* We always set V_INTR_MASKING and remember the old value in hflags */
2384        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2385                nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2386
2387        /* Restore the original control entries */
2388        copy_vmcb_control_area(vmcb, hsave);
2389
2390        kvm_clear_exception_queue(&svm->vcpu);
2391        kvm_clear_interrupt_queue(&svm->vcpu);
2392
2393        svm->nested.nested_cr3 = 0;
2394
2395        /* Restore selected save entries */
2396        svm->vmcb->save.es = hsave->save.es;
2397        svm->vmcb->save.cs = hsave->save.cs;
2398        svm->vmcb->save.ss = hsave->save.ss;
2399        svm->vmcb->save.ds = hsave->save.ds;
2400        svm->vmcb->save.gdtr = hsave->save.gdtr;
2401        svm->vmcb->save.idtr = hsave->save.idtr;
2402        kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2403        svm_set_efer(&svm->vcpu, hsave->save.efer);
2404        svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2405        svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2406        if (npt_enabled) {
2407                svm->vmcb->save.cr3 = hsave->save.cr3;
2408                svm->vcpu.arch.cr3 = hsave->save.cr3;
2409        } else {
2410                (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2411        }
2412        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2413        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2414        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2415        svm->vmcb->save.dr7 = 0;
2416        svm->vmcb->save.cpl = 0;
2417        svm->vmcb->control.exit_int_info = 0;
2418
2419        mark_all_dirty(svm->vmcb);
2420
2421        nested_svm_unmap(page);
2422
2423        nested_svm_uninit_mmu_context(&svm->vcpu);
2424        kvm_mmu_reset_context(&svm->vcpu);
2425        kvm_mmu_load(&svm->vcpu);
2426
2427        return 0;
2428}
2429
2430static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2431{
2432        /*
2433         * This function merges the msr permission bitmaps of kvm and the
2434         * nested vmcb. It is optimized in that it only merges the parts where
2435         * the kvm msr permission bitmap may contain zero bits
2436         */
2437        int i;
2438
2439        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2440                return true;
2441
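        /*
         * msrpm_offsets[] was filled at module init time with the u32 offsets
         * KVM cleared in its own bitmap (pass-through MSRs); every other word
         * is all ones anyway, so only these words need to be OR-ed with L1's
         * bitmap.  Unused slots keep their 0xffffffff fill value.
         */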
2442        for (i = 0; i < MSRPM_OFFSETS; i++) {
2443                u32 value, p;
2444                u64 offset;
2445
2446                if (msrpm_offsets[i] == 0xffffffff)
2447                        break;
2448
2449                p      = msrpm_offsets[i];
2450                offset = svm->nested.vmcb_msrpm + (p * 4);
2451
2452                if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
2453                        return false;
2454
2455                svm->nested.msrpm[p] = svm->msrpm[p] | value;
2456        }
2457
2458        svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2459
2460        return true;
2461}
2462
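/*
 * Minimal sanity checks on the VMCB provided by L1: VMRUN must itself be
 * intercepted, ASID 0 (illegal for a guest) is rejected, and nested paging
 * may only be requested when the host is running with NPT enabled.
 */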
2463static bool nested_vmcb_checks(struct vmcb *vmcb)
2464{
2465        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2466                return false;
2467
2468        if (vmcb->control.asid == 0)
2469                return false;
2470
2471        if (vmcb->control.nested_ctl && !npt_enabled)
2472                return false;
2473
2474        return true;
2475}
2476
2477static bool nested_svm_vmrun(struct vcpu_svm *svm)
2478{
2479        struct vmcb *nested_vmcb;
2480        struct vmcb *hsave = svm->nested.hsave;
2481        struct vmcb *vmcb = svm->vmcb;
2482        struct page *page;
2483        u64 vmcb_gpa;
2484
2485        vmcb_gpa = svm->vmcb->save.rax;
2486
2487        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2488        if (!nested_vmcb)
2489                return false;
2490
2491        if (!nested_vmcb_checks(nested_vmcb)) {
2492                nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2493                nested_vmcb->control.exit_code_hi = 0;
2494                nested_vmcb->control.exit_info_1  = 0;
2495                nested_vmcb->control.exit_info_2  = 0;
2496
2497                nested_svm_unmap(page);
2498
2499                return false;
2500        }
2501
2502        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2503                               nested_vmcb->save.rip,
2504                               nested_vmcb->control.int_ctl,
2505                               nested_vmcb->control.event_inj,
2506                               nested_vmcb->control.nested_ctl);
2507
2508        trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2509                                    nested_vmcb->control.intercept_cr >> 16,
2510                                    nested_vmcb->control.intercept_exceptions,
2511                                    nested_vmcb->control.intercept);
2512
2513        /* Clear internal status */
2514        kvm_clear_exception_queue(&svm->vcpu);
2515        kvm_clear_interrupt_queue(&svm->vcpu);
2516
2517        /*
2518         * Save the old vmcb, so we don't need to pick what we save, but can
2519         * restore everything when a VMEXIT occurs
2520         */
2521        hsave->save.es     = vmcb->save.es;
2522        hsave->save.cs     = vmcb->save.cs;
2523        hsave->save.ss     = vmcb->save.ss;
2524        hsave->save.ds     = vmcb->save.ds;
2525        hsave->save.gdtr   = vmcb->save.gdtr;
2526        hsave->save.idtr   = vmcb->save.idtr;
2527        hsave->save.efer   = svm->vcpu.arch.efer;
2528        hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2529        hsave->save.cr4    = svm->vcpu.arch.cr4;
2530        hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2531        hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2532        hsave->save.rsp    = vmcb->save.rsp;
2533        hsave->save.rax    = vmcb->save.rax;
2534        if (npt_enabled)
2535                hsave->save.cr3    = vmcb->save.cr3;
2536        else
2537                hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2538
2539        copy_vmcb_control_area(hsave, vmcb);
2540
2541        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2542                svm->vcpu.arch.hflags |= HF_HIF_MASK;
2543        else
2544                svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2545
2546        if (nested_vmcb->control.nested_ctl) {
2547                kvm_mmu_unload(&svm->vcpu);
2548                svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2549                nested_svm_init_mmu_context(&svm->vcpu);
2550        }
2551
2552        /* Load the nested guest state */
2553        svm->vmcb->save.es = nested_vmcb->save.es;
2554        svm->vmcb->save.cs = nested_vmcb->save.cs;
2555        svm->vmcb->save.ss = nested_vmcb->save.ss;
2556        svm->vmcb->save.ds = nested_vmcb->save.ds;
2557        svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2558        svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2559        kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2560        svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2561        svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2562        svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2563        if (npt_enabled) {
2564                svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2565                svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2566        } else
2567                (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2568
2569        /* Guest paging mode is active - reset mmu */
2570        kvm_mmu_reset_context(&svm->vcpu);
2571
2572        svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2573        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2574        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2575        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2576
2577        /* In case we don't even reach vcpu_run, the fields are not updated */
2578        svm->vmcb->save.rax = nested_vmcb->save.rax;
2579        svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2580        svm->vmcb->save.rip = nested_vmcb->save.rip;
2581        svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2582        svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2583        svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2584
2585        svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2586        svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2587
2588        /* cache intercepts */
2589        svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2590        svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2591        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2592        svm->nested.intercept            = nested_vmcb->control.intercept;
2593
2594        svm_flush_tlb(&svm->vcpu);
2595        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2596        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2597                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2598        else
2599                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2600
2601        if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2602                /* We only want the cr8 intercept bits of the guest */
2603                clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2604                clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2605        }
2606
2607        /* We don't want to see VMMCALLs from a nested guest */
2608        clr_intercept(svm, INTERCEPT_VMMCALL);
2609
2610        svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2611        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2612        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2613        svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2614        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2615        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2616
2617        nested_svm_unmap(page);
2618
2619        /* Enter Guest-Mode */
2620        enter_guest_mode(&svm->vcpu);
2621
2622        /*
2623         * Merge guest and host intercepts - must be called with vcpu in
2624         * guest-mode to take effect here
2625         */
2626        recalc_intercepts(svm);
2627
2628        svm->nested.vmcb = vmcb_gpa;
2629
2630        enable_gif(svm);
2631
2632        mark_all_dirty(svm->vmcb);
2633
2634        return true;
2635}
2636
2637static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2638{
2639        to_vmcb->save.fs = from_vmcb->save.fs;
2640        to_vmcb->save.gs = from_vmcb->save.gs;
2641        to_vmcb->save.tr = from_vmcb->save.tr;
2642        to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2643        to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2644        to_vmcb->save.star = from_vmcb->save.star;
2645        to_vmcb->save.lstar = from_vmcb->save.lstar;
2646        to_vmcb->save.cstar = from_vmcb->save.cstar;
2647        to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2648        to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2649        to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2650        to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2651}
2652
2653static int vmload_interception(struct vcpu_svm *svm)
2654{
2655        struct vmcb *nested_vmcb;
2656        struct page *page;
2657
2658        if (nested_svm_check_permissions(svm))
2659                return 1;
2660
2661        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2662        if (!nested_vmcb)
2663                return 1;
2664
2665        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2666        skip_emulated_instruction(&svm->vcpu);
2667
2668        nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2669        nested_svm_unmap(page);
2670
2671        return 1;
2672}
2673
2674static int vmsave_interception(struct vcpu_svm *svm)
2675{
2676        struct vmcb *nested_vmcb;
2677        struct page *page;
2678
2679        if (nested_svm_check_permissions(svm))
2680                return 1;
2681
2682        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2683        if (!nested_vmcb)
2684                return 1;
2685
2686        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2687        skip_emulated_instruction(&svm->vcpu);
2688
2689        nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2690        nested_svm_unmap(page);
2691
2692        return 1;
2693}
2694
2695static int vmrun_interception(struct vcpu_svm *svm)
2696{
2697        if (nested_svm_check_permissions(svm))
2698                return 1;
2699
2700        /* Save rip after vmrun instruction */
2701        kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2702
2703        if (!nested_svm_vmrun(svm))
2704                return 1;
2705
2706        if (!nested_svm_vmrun_msrpm(svm))
2707                goto failed;
2708
2709        return 1;
2710
2711failed:
2712
2713        svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
2714        svm->vmcb->control.exit_code_hi = 0;
2715        svm->vmcb->control.exit_info_1  = 0;
2716        svm->vmcb->control.exit_info_2  = 0;
2717
2718        nested_svm_vmexit(svm);
2719
2720        return 1;
2721}
2722
2723static int stgi_interception(struct vcpu_svm *svm)
2724{
2725        if (nested_svm_check_permissions(svm))
2726                return 1;
2727
2728        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2729        skip_emulated_instruction(&svm->vcpu);
2730        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2731
2732        enable_gif(svm);
2733
2734        return 1;
2735}
2736
2737static int clgi_interception(struct vcpu_svm *svm)
2738{
2739        if (nested_svm_check_permissions(svm))
2740                return 1;
2741
2742        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2743        skip_emulated_instruction(&svm->vcpu);
2744
2745        disable_gif(svm);
2746
2747        /* After a CLGI no interrupts should come */
2748        svm_clear_vintr(svm);
2749        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2750
2751        mark_dirty(svm->vmcb, VMCB_INTR);
2752
2753        return 1;
2754}
2755
2756static int invlpga_interception(struct vcpu_svm *svm)
2757{
2758        struct kvm_vcpu *vcpu = &svm->vcpu;
2759
2760        trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
2761                          vcpu->arch.regs[VCPU_REGS_RAX]);
2762
2763        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2764        kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
2765
2766        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2767        skip_emulated_instruction(&svm->vcpu);
2768        return 1;
2769}
2770
2771static int skinit_interception(struct vcpu_svm *svm)
2772{
2773        trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
2774
2775        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2776        return 1;
2777}
2778
2779static int xsetbv_interception(struct vcpu_svm *svm)
2780{
2781        u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2782        u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2783
2784        if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2785                svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2786                skip_emulated_instruction(&svm->vcpu);
2787        }
2788
2789        return 1;
2790}
2791
2792static int task_switch_interception(struct vcpu_svm *svm)
2793{
2794        u16 tss_selector;
2795        int reason;
2796        int int_type = svm->vmcb->control.exit_int_info &
2797                SVM_EXITINTINFO_TYPE_MASK;
2798        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2799        uint32_t type =
2800                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2801        uint32_t idt_v =
2802                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2803        bool has_error_code = false;
2804        u32 error_code = 0;
2805
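        /*
         * For task-switch intercepts the low 16 bits of exit_info_1 hold the
         * target TSS selector, exit_info_2 carries the IRET/JMP reason flags
         * plus an optional error code, and exit_int_info describes an event
         * that was being delivered through a task gate.
         */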
2806        tss_selector = (u16)svm->vmcb->control.exit_info_1;
2807
2808        if (svm->vmcb->control.exit_info_2 &
2809            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2810                reason = TASK_SWITCH_IRET;
2811        else if (svm->vmcb->control.exit_info_2 &
2812                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2813                reason = TASK_SWITCH_JMP;
2814        else if (idt_v)
2815                reason = TASK_SWITCH_GATE;
2816        else
2817                reason = TASK_SWITCH_CALL;
2818
2819        if (reason == TASK_SWITCH_GATE) {
2820                switch (type) {
2821                case SVM_EXITINTINFO_TYPE_NMI:
2822                        svm->vcpu.arch.nmi_injected = false;
2823                        break;
2824                case SVM_EXITINTINFO_TYPE_EXEPT:
2825                        if (svm->vmcb->control.exit_info_2 &
2826                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2827                                has_error_code = true;
2828                                error_code =
2829                                        (u32)svm->vmcb->control.exit_info_2;
2830                        }
2831                        kvm_clear_exception_queue(&svm->vcpu);
2832                        break;
2833                case SVM_EXITINTINFO_TYPE_INTR:
2834                        kvm_clear_interrupt_queue(&svm->vcpu);
2835                        break;
2836                default:
2837                        break;
2838                }
2839        }
2840
2841        if (reason != TASK_SWITCH_GATE ||
2842            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2843            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2844             (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2845                skip_emulated_instruction(&svm->vcpu);
2846
2847        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2848                int_vec = -1;
2849
2850        if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2851                                has_error_code, error_code) == EMULATE_FAIL) {
2852                svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2853                svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2854                svm->vcpu.run->internal.ndata = 0;
2855                return 0;
2856        }
2857        return 1;
2858}
2859
2860static int cpuid_interception(struct vcpu_svm *svm)
2861{
2862        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2863        kvm_emulate_cpuid(&svm->vcpu);
2864        return 1;
2865}
2866
2867static int iret_interception(struct vcpu_svm *svm)
2868{
2869        ++svm->vcpu.stat.nmi_window_exits;
2870        clr_intercept(svm, INTERCEPT_IRET);
2871        svm->vcpu.arch.hflags |= HF_IRET_MASK;
2872        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2873        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2874        return 1;
2875}
2876
2877static int invlpg_interception(struct vcpu_svm *svm)
2878{
2879        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2880                return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2881
2882        kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2883        skip_emulated_instruction(&svm->vcpu);
2884        return 1;
2885}
2886
2887static int emulate_on_interception(struct vcpu_svm *svm)
2888{
2889        return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2890}
2891
2892static int rdpmc_interception(struct vcpu_svm *svm)
2893{
2894        int err;
2895
2896        if (!static_cpu_has(X86_FEATURE_NRIPS))
2897                return emulate_on_interception(svm);
2898
2899        err = kvm_rdpmc(&svm->vcpu);
2900        kvm_complete_insn_gp(&svm->vcpu, err);
2901
2902        return 1;
2903}
2904
2905bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
2906{
2907        unsigned long cr0 = svm->vcpu.arch.cr0;
2908        bool ret = false;
2909        u64 intercept;
2910
2911        intercept = svm->nested.intercept;
2912
2913        if (!is_guest_mode(&svm->vcpu) ||
2914            (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2915                return false;
2916
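        /*
         * The selective CR0 write intercept only fires when bits outside
         * SVM_CR0_SELECTIVE_MASK change; clear the ignored bits so the
         * comparison below only sees the ones that matter.
         */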
2917        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2918        val &= ~SVM_CR0_SELECTIVE_MASK;
2919
2920        if (cr0 ^ val) {
2921                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2922                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2923        }
2924
2925        return ret;
2926}
2927
2928#define CR_VALID (1ULL << 63)
2929
2930static int cr_interception(struct vcpu_svm *svm)
2931{
2932        int reg, cr;
2933        unsigned long val;
2934        int err;
2935
2936        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2937                return emulate_on_interception(svm);
2938
2939        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2940                return emulate_on_interception(svm);
2941
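        /*
         * Write exit codes sit 16 above their read counterparts, so after the
         * subtraction cr >= 16 means "mov to CRn"; e.g. SVM_EXIT_WRITE_CR3
         * yields 19, i.e. a write to CR3.
         */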
2942        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2943        cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2944
2945        err = 0;
2946        if (cr >= 16) { /* mov to cr */
2947                cr -= 16;
2948                val = kvm_register_read(&svm->vcpu, reg);
2949                switch (cr) {
2950                case 0:
2951                        if (!check_selective_cr0_intercepted(svm, val))
2952                                err = kvm_set_cr0(&svm->vcpu, val);
2953                        else
2954                                return 1;
2955
2956                        break;
2957                case 3:
2958                        err = kvm_set_cr3(&svm->vcpu, val);
2959                        break;
2960                case 4:
2961                        err = kvm_set_cr4(&svm->vcpu, val);
2962                        break;
2963                case 8:
2964                        err = kvm_set_cr8(&svm->vcpu, val);
2965                        break;
2966                default:
2967                        WARN(1, "unhandled write to CR%d", cr);
2968                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2969                        return 1;
2970                }
2971        } else { /* mov from cr */
2972                switch (cr) {
2973                case 0:
2974                        val = kvm_read_cr0(&svm->vcpu);
2975                        break;
2976                case 2:
2977                        val = svm->vcpu.arch.cr2;
2978                        break;
2979                case 3:
2980                        val = kvm_read_cr3(&svm->vcpu);
2981                        break;
2982                case 4:
2983                        val = kvm_read_cr4(&svm->vcpu);
2984                        break;
2985                case 8:
2986                        val = kvm_get_cr8(&svm->vcpu);
2987                        break;
2988                default:
2989                        WARN(1, "unhandled read from CR%d", cr);
2990                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2991                        return 1;
2992                }
2993                kvm_register_write(&svm->vcpu, reg, val);
2994        }
2995        kvm_complete_insn_gp(&svm->vcpu, err);
2996
2997        return 1;
2998}
2999
3000static int dr_interception(struct vcpu_svm *svm)
3001{
3002        int reg, dr;
3003        unsigned long val;
3004
3005        if (svm->vcpu.guest_debug == 0) {
3006                /*
3007                 * No more DR vmexits; force a reload of the debug registers
3008                 * and reenter on this instruction.  The next vmexit will
3009                 * retrieve the full state of the debug registers.
3010                 */
3011                clr_dr_intercepts(svm);
3012                svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
3013                return 1;
3014        }
3015
3016        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
3017                return emulate_on_interception(svm);
3018
3019        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3020        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
3021
3022        if (dr >= 16) { /* mov to DRn */
3023                if (!kvm_require_dr(&svm->vcpu, dr - 16))
3024                        return 1;
3025                val = kvm_register_read(&svm->vcpu, reg);
3026                kvm_set_dr(&svm->vcpu, dr - 16, val);
3027        } else {
3028                if (!kvm_require_dr(&svm->vcpu, dr))
3029                        return 1;
3030                kvm_get_dr(&svm->vcpu, dr, &val);
3031                kvm_register_write(&svm->vcpu, reg, val);
3032        }
3033
3034        skip_emulated_instruction(&svm->vcpu);
3035
3036        return 1;
3037}
3038
3039static int cr8_write_interception(struct vcpu_svm *svm)
3040{
3041        struct kvm_run *kvm_run = svm->vcpu.run;
3042        int r;
3043
3044        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3045        /* instruction emulation calls kvm_set_cr8() */
3046        r = cr_interception(svm);
3047        if (irqchip_in_kernel(svm->vcpu.kvm))
3048                return r;
3049        if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3050                return r;
3051        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3052        return 0;
3053}
3054
3055static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3056{
3057        struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3058        return vmcb->control.tsc_offset +
3059                svm_scale_tsc(vcpu, host_tsc);
3060}
3061
3062static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
3063{
3064        struct vcpu_svm *svm = to_svm(vcpu);
3065
3066        switch (ecx) {
3067        case MSR_IA32_TSC: {
3068                *data = svm->vmcb->control.tsc_offset +
3069                        svm_scale_tsc(vcpu, native_read_tsc());
3070
3071                break;
3072        }
3073        case MSR_STAR:
3074                *data = svm->vmcb->save.star;
3075                break;
3076#ifdef CONFIG_X86_64
3077        case MSR_LSTAR:
3078                *data = svm->vmcb->save.lstar;
3079                break;
3080        case MSR_CSTAR:
3081                *data = svm->vmcb->save.cstar;
3082                break;
3083        case MSR_KERNEL_GS_BASE:
3084                *data = svm->vmcb->save.kernel_gs_base;
3085                break;
3086        case MSR_SYSCALL_MASK:
3087                *data = svm->vmcb->save.sfmask;
3088                break;
3089#endif
3090        case MSR_IA32_SYSENTER_CS:
3091                *data = svm->vmcb->save.sysenter_cs;
3092                break;
3093        case MSR_IA32_SYSENTER_EIP:
3094                *data = svm->sysenter_eip;
3095                break;
3096        case MSR_IA32_SYSENTER_ESP:
3097                *data = svm->sysenter_esp;
3098                break;
3099        /*
3100         * Nobody will change the following 5 values in the VMCB so we can
3101         * safely return them on rdmsr. They will always be 0 until LBRV is
3102         * implemented.
3103         */
3104        case MSR_IA32_DEBUGCTLMSR:
3105                *data = svm->vmcb->save.dbgctl;
3106                break;
3107        case MSR_IA32_LASTBRANCHFROMIP:
3108                *data = svm->vmcb->save.br_from;
3109                break;
3110        case MSR_IA32_LASTBRANCHTOIP:
3111                *data = svm->vmcb->save.br_to;
3112                break;
3113        case MSR_IA32_LASTINTFROMIP:
3114                *data = svm->vmcb->save.last_excp_from;
3115                break;
3116        case MSR_IA32_LASTINTTOIP:
3117                *data = svm->vmcb->save.last_excp_to;
3118                break;
3119        case MSR_VM_HSAVE_PA:
3120                *data = svm->nested.hsave_msr;
3121                break;
3122        case MSR_VM_CR:
3123                *data = svm->nested.vm_cr_msr;
3124                break;
3125        case MSR_IA32_UCODE_REV:
3126                *data = 0x01000065;
3127                break;
3128        default:
3129                return kvm_get_msr_common(vcpu, ecx, data);
3130        }
3131        return 0;
3132}
3133
3134static int rdmsr_interception(struct vcpu_svm *svm)
3135{
3136        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3137        u64 data;
3138
3139        if (svm_get_msr(&svm->vcpu, ecx, &data)) {
3140                trace_kvm_msr_read_ex(ecx);
3141                kvm_inject_gp(&svm->vcpu, 0);
3142        } else {
3143                trace_kvm_msr_read(ecx, data);
3144
3145                svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
3146                svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
3147                svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3148                skip_emulated_instruction(&svm->vcpu);
3149        }
3150        return 1;
3151}
3152
3153static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3154{
3155        struct vcpu_svm *svm = to_svm(vcpu);
3156        int svm_dis, chg_mask;
3157
3158        if (data & ~SVM_VM_CR_VALID_MASK)
3159                return 1;
3160
3161        chg_mask = SVM_VM_CR_VALID_MASK;
3162
3163        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3164                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3165
3166        svm->nested.vm_cr_msr &= ~chg_mask;
3167        svm->nested.vm_cr_msr |= (data & chg_mask);
3168
3169        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3170
3171        /* check for svm_disable while efer.svme is set */
3172        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3173                return 1;
3174
3175        return 0;
3176}
3177
3178static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3179{
3180        struct vcpu_svm *svm = to_svm(vcpu);
3181
3182        u32 ecx = msr->index;
3183        u64 data = msr->data;
3184        switch (ecx) {
3185        case MSR_IA32_TSC:
3186                kvm_write_tsc(vcpu, msr);
3187                break;
3188        case MSR_STAR:
3189                svm->vmcb->save.star = data;
3190                break;
3191#ifdef CONFIG_X86_64
3192        case MSR_LSTAR:
3193                svm->vmcb->save.lstar = data;
3194                break;
3195        case MSR_CSTAR:
3196                svm->vmcb->save.cstar = data;
3197                break;
3198        case MSR_KERNEL_GS_BASE:
3199                svm->vmcb->save.kernel_gs_base = data;
3200                break;
3201        case MSR_SYSCALL_MASK:
3202                svm->vmcb->save.sfmask = data;
3203                break;
3204#endif
3205        case MSR_IA32_SYSENTER_CS:
3206                svm->vmcb->save.sysenter_cs = data;
3207                break;
3208        case MSR_IA32_SYSENTER_EIP:
3209                svm->sysenter_eip = data;
3210                svm->vmcb->save.sysenter_eip = data;
3211                break;
3212        case MSR_IA32_SYSENTER_ESP:
3213                svm->sysenter_esp = data;
3214                svm->vmcb->save.sysenter_esp = data;
3215                break;
3216        case MSR_IA32_DEBUGCTLMSR:
3217                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3218                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3219                                    __func__, data);
3220                        break;
3221                }
3222                if (data & DEBUGCTL_RESERVED_BITS)
3223                        return 1;
3224
3225                svm->vmcb->save.dbgctl = data;
3226                mark_dirty(svm->vmcb, VMCB_LBR);
3227                if (data & (1ULL<<0))
3228                        svm_enable_lbrv(svm);
3229                else
3230                        svm_disable_lbrv(svm);
3231                break;
3232        case MSR_VM_HSAVE_PA:
3233                svm->nested.hsave_msr = data;
3234                break;
3235        case MSR_VM_CR:
3236                return svm_set_vm_cr(vcpu, data);
3237        case MSR_VM_IGNNE:
3238                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3239                break;
3240        default:
3241                return kvm_set_msr_common(vcpu, msr);
3242        }
3243        return 0;
3244}
3245
3246static int wrmsr_interception(struct vcpu_svm *svm)
3247{
3248        struct msr_data msr;
3249        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3250        u64 data = (u32)svm->vcpu.arch.regs[VCPU_REGS_RAX]
3251                | ((u64)(u32)svm->vcpu.arch.regs[VCPU_REGS_RDX] << 32);
3252
3253        msr.data = data;
3254        msr.index = ecx;
3255        msr.host_initiated = false;
3256
3257        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3258        if (kvm_set_msr(&svm->vcpu, &msr)) {
3259                trace_kvm_msr_write_ex(ecx, data);
3260                kvm_inject_gp(&svm->vcpu, 0);
3261        } else {
3262                trace_kvm_msr_write(ecx, data);
3263                skip_emulated_instruction(&svm->vcpu);
3264        }
3265        return 1;
3266}
3267
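/*
 * For the MSR intercept, EXITINFO1 distinguishes the access type:
 * 1 means WRMSR, 0 means RDMSR.
 */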
3268static int msr_interception(struct vcpu_svm *svm)
3269{
3270        if (svm->vmcb->control.exit_info_1)
3271                return wrmsr_interception(svm);
3272        else
3273                return rdmsr_interception(svm);
3274}
3275
3276static int interrupt_window_interception(struct vcpu_svm *svm)
3277{
3278        struct kvm_run *kvm_run = svm->vcpu.run;
3279
3280        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3281        svm_clear_vintr(svm);
3282        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3283        mark_dirty(svm->vmcb, VMCB_INTR);
3284        ++svm->vcpu.stat.irq_window_exits;
3285        /*
3286         * If user space is waiting to inject interrupts, exit as soon as
3287         * possible
3288         */
3289        if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3290            kvm_run->request_interrupt_window &&
3291            !kvm_cpu_has_interrupt(&svm->vcpu)) {
3292                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3293                return 0;
3294        }
3295
3296        return 1;
3297}
3298
3299static int pause_interception(struct vcpu_svm *svm)
3300{
3301        kvm_vcpu_on_spin(&(svm->vcpu));
3302        return 1;
3303}
3304
3305static int nop_interception(struct vcpu_svm *svm)
3306{
3307        skip_emulated_instruction(&(svm->vcpu));
3308        return 1;
3309}
3310
3311static int monitor_interception(struct vcpu_svm *svm)
3312{
3313        printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3314        return nop_interception(svm);
3315}
3316
3317static int mwait_interception(struct vcpu_svm *svm)
3318{
3319        printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3320        return nop_interception(svm);
3321}
3322
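/*
 * Exit handlers indexed by SVM exit code.  handle_exit() rejects any exit
 * code that falls outside this table or has no registered handler.
 */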
3323static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3324        [SVM_EXIT_READ_CR0]                     = cr_interception,
3325        [SVM_EXIT_READ_CR3]                     = cr_interception,
3326        [SVM_EXIT_READ_CR4]                     = cr_interception,
3327        [SVM_EXIT_READ_CR8]                     = cr_interception,
3328        [SVM_EXIT_CR0_SEL_WRITE]                = emulate_on_interception,
3329        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3330        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3331        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3332        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3333        [SVM_EXIT_READ_DR0]                     = dr_interception,
3334        [SVM_EXIT_READ_DR1]                     = dr_interception,
3335        [SVM_EXIT_READ_DR2]                     = dr_interception,
3336        [SVM_EXIT_READ_DR3]                     = dr_interception,
3337        [SVM_EXIT_READ_DR4]                     = dr_interception,
3338        [SVM_EXIT_READ_DR5]                     = dr_interception,
3339        [SVM_EXIT_READ_DR6]                     = dr_interception,
3340        [SVM_EXIT_READ_DR7]                     = dr_interception,
3341        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3342        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3343        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3344        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3345        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3346        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3347        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3348        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3349        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3350        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3351        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3352        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3353        [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
3354        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3355        [SVM_EXIT_INTR]                         = intr_interception,
3356        [SVM_EXIT_NMI]                          = nmi_interception,
3357        [SVM_EXIT_SMI]                          = nop_on_interception,
3358        [SVM_EXIT_INIT]                         = nop_on_interception,
3359        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3360        [SVM_EXIT_RDPMC]                        = rdpmc_interception,
3361        [SVM_EXIT_CPUID]                        = cpuid_interception,
3362        [SVM_EXIT_IRET]                         = iret_interception,
3363        [SVM_EXIT_INVD]                         = emulate_on_interception,
3364        [SVM_EXIT_PAUSE]                        = pause_interception,
3365        [SVM_EXIT_HLT]                          = halt_interception,
3366        [SVM_EXIT_INVLPG]                       = invlpg_interception,
3367        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3368        [SVM_EXIT_IOIO]                         = io_interception,
3369        [SVM_EXIT_MSR]                          = msr_interception,
3370        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3371        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3372        [SVM_EXIT_VMRUN]                        = vmrun_interception,
3373        [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
3374        [SVM_EXIT_VMLOAD]                       = vmload_interception,
3375        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3376        [SVM_EXIT_STGI]                         = stgi_interception,
3377        [SVM_EXIT_CLGI]                         = clgi_interception,
3378        [SVM_EXIT_SKINIT]                       = skinit_interception,
3379        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
3380        [SVM_EXIT_MONITOR]                      = monitor_interception,
3381        [SVM_EXIT_MWAIT]                        = mwait_interception,
3382        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
3383        [SVM_EXIT_NPF]                          = pf_interception,
3384};
3385
3386static void dump_vmcb(struct kvm_vcpu *vcpu)
3387{
3388        struct vcpu_svm *svm = to_svm(vcpu);
3389        struct vmcb_control_area *control = &svm->vmcb->control;
3390        struct vmcb_save_area *save = &svm->vmcb->save;
3391
3392        pr_err("VMCB Control Area:\n");
3393        pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3394        pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3395        pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3396        pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3397        pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3398        pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3399        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3400        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3401        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3402        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3403        pr_err("%-20s%d\n", "asid:", control->asid);
3404        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3405        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3406        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3407        pr_err("%-20s%08x\n", "int_state:", control->int_state);
3408        pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3409        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3410        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3411        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3412        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3413        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3414        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3415        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3416        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3417        pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3418        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3419        pr_err("VMCB State Save Area:\n");
3420        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3421               "es:",
3422               save->es.selector, save->es.attrib,
3423               save->es.limit, save->es.base);
3424        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3425               "cs:",
3426               save->cs.selector, save->cs.attrib,
3427               save->cs.limit, save->cs.base);
3428        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3429               "ss:",
3430               save->ss.selector, save->ss.attrib,
3431               save->ss.limit, save->ss.base);
3432        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3433               "ds:",
3434               save->ds.selector, save->ds.attrib,
3435               save->ds.limit, save->ds.base);
3436        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3437               "fs:",
3438               save->fs.selector, save->fs.attrib,
3439               save->fs.limit, save->fs.base);
3440        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3441               "gs:",
3442               save->gs.selector, save->gs.attrib,
3443               save->gs.limit, save->gs.base);
3444        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3445               "gdtr:",
3446               save->gdtr.selector, save->gdtr.attrib,
3447               save->gdtr.limit, save->gdtr.base);
3448        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3449               "ldtr:",
3450               save->ldtr.selector, save->ldtr.attrib,
3451               save->ldtr.limit, save->ldtr.base);
3452        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3453               "idtr:",
3454               save->idtr.selector, save->idtr.attrib,
3455               save->idtr.limit, save->idtr.base);
3456        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3457               "tr:",
3458               save->tr.selector, save->tr.attrib,
3459               save->tr.limit, save->tr.base);
3460        pr_err("cpl:            %d                efer:         %016llx\n",
3461                save->cpl, save->efer);
3462        pr_err("%-15s %016llx %-13s %016llx\n",
3463               "cr0:", save->cr0, "cr2:", save->cr2);
3464        pr_err("%-15s %016llx %-13s %016llx\n",
3465               "cr3:", save->cr3, "cr4:", save->cr4);
3466        pr_err("%-15s %016llx %-13s %016llx\n",
3467               "dr6:", save->dr6, "dr7:", save->dr7);
3468        pr_err("%-15s %016llx %-13s %016llx\n",
3469               "rip:", save->rip, "rflags:", save->rflags);
3470        pr_err("%-15s %016llx %-13s %016llx\n",
3471               "rsp:", save->rsp, "rax:", save->rax);
3472        pr_err("%-15s %016llx %-13s %016llx\n",
3473               "star:", save->star, "lstar:", save->lstar);
3474        pr_err("%-15s %016llx %-13s %016llx\n",
3475               "cstar:", save->cstar, "sfmask:", save->sfmask);
3476        pr_err("%-15s %016llx %-13s %016llx\n",
3477               "kernel_gs_base:", save->kernel_gs_base,
3478               "sysenter_cs:", save->sysenter_cs);
3479        pr_err("%-15s %016llx %-13s %016llx\n",
3480               "sysenter_esp:", save->sysenter_esp,
3481               "sysenter_eip:", save->sysenter_eip);
3482        pr_err("%-15s %016llx %-13s %016llx\n",
3483               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3484        pr_err("%-15s %016llx %-13s %016llx\n",
3485               "br_from:", save->br_from, "br_to:", save->br_to);
3486        pr_err("%-15s %016llx %-13s %016llx\n",
3487               "excp_from:", save->last_excp_from,
3488               "excp_to:", save->last_excp_to);
3489}
3490
3491static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3492{
3493        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3494
3495        *info1 = control->exit_info_1;
3496        *info2 = control->exit_info_2;
3497}
3498
3499static int handle_exit(struct kvm_vcpu *vcpu)
3500{
3501        struct vcpu_svm *svm = to_svm(vcpu);
3502        struct kvm_run *kvm_run = vcpu->run;
3503        u32 exit_code = svm->vmcb->control.exit_code;
3504
3505        if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3506                vcpu->arch.cr0 = svm->vmcb->save.cr0;
3507        if (npt_enabled)
3508                vcpu->arch.cr3 = svm->vmcb->save.cr3;
3509
3510        if (unlikely(svm->nested.exit_required)) {
3511                nested_svm_vmexit(svm);
3512                svm->nested.exit_required = false;
3513
3514                return 1;
3515        }
3516
3517        if (is_guest_mode(vcpu)) {
3518                int vmexit;
3519
3520                trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
3521                                        svm->vmcb->control.exit_info_1,
3522                                        svm->vmcb->control.exit_info_2,
3523                                        svm->vmcb->control.exit_int_info,
3524                                        svm->vmcb->control.exit_int_info_err,
3525                                        KVM_ISA_SVM);
3526
3527                vmexit = nested_svm_exit_special(svm);
3528
3529                if (vmexit == NESTED_EXIT_CONTINUE)
3530                        vmexit = nested_svm_exit_handled(svm);
3531
3532                if (vmexit == NESTED_EXIT_DONE)
3533                        return 1;
3534        }
3535
3536        svm_complete_interrupts(svm);
3537
3538        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3539                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3540                kvm_run->fail_entry.hardware_entry_failure_reason
3541                        = svm->vmcb->control.exit_code;
3542                pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
3543                dump_vmcb(vcpu);
3544                return 0;
3545        }
3546
3547        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3548            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3549            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3550            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3551                printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3552                       "exit_code 0x%x\n",
3553                       __func__, svm->vmcb->control.exit_int_info,
3554                       exit_code);
3555
3556        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3557            || !svm_exit_handlers[exit_code]) {
3558                WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
3559                kvm_queue_exception(vcpu, UD_VECTOR);
3560                return 1;
3561        }
3562
3563        return svm_exit_handlers[exit_code](svm);
3564}
3565
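/*
 * Reload TR after running the guest.  The per-cpu TSS descriptor type must
 * be set back to "available" first, because LTR requires a non-busy TSS and
 * marks it busy again when it loads it.
 */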
3566static void reload_tss(struct kvm_vcpu *vcpu)
3567{
3568        int cpu = raw_smp_processor_id();
3569
3570        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3571        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3572        load_TR_desc();
3573}
3574
3575static void pre_svm_run(struct vcpu_svm *svm)
3576{
3577        int cpu = raw_smp_processor_id();
3578
3579        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3580
3581        /* FIXME: handle wraparound of asid_generation */
3582        if (svm->asid_generation != sd->asid_generation)
3583                new_asid(svm, sd);
3584}
3585
3586static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3587{
3588        struct vcpu_svm *svm = to_svm(vcpu);
3589
3590        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3591        vcpu->arch.hflags |= HF_NMI_MASK;
3592        set_intercept(svm, INTERCEPT_IRET);
3593        ++vcpu->stat.nmi_injections;
3594}
3595
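/*
 * Program a virtual interrupt (V_IRQ) for the given vector at the highest
 * priority, so it is delivered regardless of the guest's TPR as soon as the
 * guest can take interrupts.
 */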
3596static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3597{
3598        struct vmcb_control_area *control;
3599
3600        control = &svm->vmcb->control;
3601        control->int_vector = irq;
3602        control->int_ctl &= ~V_INTR_PRIO_MASK;
3603        control->int_ctl |= V_IRQ_MASK |
3604                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3605        mark_dirty(svm->vmcb, VMCB_INTR);
3606}
3607
3608static void svm_set_irq(struct kvm_vcpu *vcpu)
3609{
3610        struct vcpu_svm *svm = to_svm(vcpu);
3611
3612        BUG_ON(!(gif_set(svm)));
3613
3614        trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3615        ++vcpu->stat.irq_injections;
3616
3617        svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3618                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3619}
3620
3621static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3622{
3623        struct vcpu_svm *svm = to_svm(vcpu);
3624
3625        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3626                return;
3627
3628        clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3629
3630        if (irr == -1)
3631                return;
3632
3633        if (tpr >= irr)
3634                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3635}
3636
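/*
 * This version of SVM has no APIC virtualization support, so the
 * APICv-related callbacks below are no-ops.
 */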
3637static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3638{
3639        return;
3640}
3641
3642static int svm_vm_has_apicv(struct kvm *kvm)
3643{
3644        return 0;
3645}
3646
3647static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
3648{
3649        return;
3650}
3651
3652static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
3653{
3654        return;
3655}
3656
3657static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3658{
3659        struct vcpu_svm *svm = to_svm(vcpu);
3660        struct vmcb *vmcb = svm->vmcb;
3661        int ret;
3662        ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
3663              !(svm->vcpu.arch.hflags & HF_NMI_MASK);
3664        ret = ret && gif_set(svm) && nested_svm_nmi(svm);
3665
3666        return ret;
3667}
3668
3669static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3670{
3671        struct vcpu_svm *svm = to_svm(vcpu);
3672
3673        return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
3674}
3675
3676static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3677{
3678        struct vcpu_svm *svm = to_svm(vcpu);
3679
3680        if (masked) {
3681                svm->vcpu.arch.hflags |= HF_NMI_MASK;
3682                set_intercept(svm, INTERCEPT_IRET);
3683        } else {
3684                svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3685                clr_intercept(svm, INTERCEPT_IRET);
3686        }
3687}
3688
3689static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3690{
3691        struct vcpu_svm *svm = to_svm(vcpu);
3692        struct vmcb *vmcb = svm->vmcb;
3693        int ret;
3694
3695        if (!gif_set(svm) ||
3696             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
3697                return 0;
3698
3699        ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3700
3701        if (is_guest_mode(vcpu))
3702                return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3703
3704        return ret;
3705}
3706
3707static void enable_irq_window(struct kvm_vcpu *vcpu)
3708{
3709        struct vcpu_svm *svm = to_svm(vcpu);
3710
3711        /*
3712         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3713         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3714         * get that intercept, this function will be called again though and
3715         * we'll get the vintr intercept.
3716         */
3717        if (gif_set(svm) && nested_svm_intr(svm)) {
3718                svm_set_vintr(svm);
3719                svm_inject_irq(svm, 0x0);
3720        }
3721}
3722
3723static void enable_nmi_window(struct kvm_vcpu *vcpu)
3724{
3725        struct vcpu_svm *svm = to_svm(vcpu);
3726
3727        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3728            == HF_NMI_MASK)
3729                return; /* IRET will cause a vm exit */
3730
3731        /*
3732         * Something prevents NMI from being injected. Single step over the
3733         * possible problem (IRET or exception injection or interrupt shadow)
3734         */
3735        svm->nmi_singlestep = true;
3736        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3737        update_db_bp_intercept(vcpu);
3738}
3739
3740static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3741{
3742        return 0;
3743}
3744
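/*
 * Flush the guest's TLB entries: use flush-by-ASID when the CPU supports it,
 * otherwise force a new ASID to be assigned on the next VMRUN.
 */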
3745static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3746{
3747        struct vcpu_svm *svm = to_svm(vcpu);
3748
3749        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3750                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3751        else
3752                svm->asid_generation--;
3753}
3754
3755static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
3756{
3757}
3758
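/*
 * Copy the guest's V_TPR back into the in-kernel local APIC's CR8 when CR8
 * writes are not being intercepted, i.e. when the guest may have updated
 * its TPR directly.
 */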
3759static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3760{
3761        struct vcpu_svm *svm = to_svm(vcpu);
3762
3763        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3764                return;
3765
3766        if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3767                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3768                kvm_set_cr8(vcpu, cr8);
3769        }
3770}
3771
3772static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3773{
3774        struct vcpu_svm *svm = to_svm(vcpu);
3775        u64 cr8;
3776
3777        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3778                return;
3779
3780        cr8 = kvm_get_cr8(vcpu);
3781        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3782        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3783}
3784
3785static void svm_complete_interrupts(struct vcpu_svm *svm)
3786{
3787        u8 vector;
3788        int type;
3789        u32 exitintinfo = svm->vmcb->control.exit_int_info;
3790        unsigned int3_injected = svm->int3_injected;
3791
3792        svm->int3_injected = 0;
3793
3794        /*
3795         * If we've made progress since setting HF_IRET_MASK, we've
3796         * executed an IRET and can allow NMI injection.
3797         */
3798        if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3799            && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3800                svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3801                kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3802        }
3803
3804        svm->vcpu.arch.nmi_injected = false;
3805        kvm_clear_exception_queue(&svm->vcpu);
3806        kvm_clear_interrupt_queue(&svm->vcpu);
3807
3808        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3809                return;
3810
3811        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3812
3813        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3814        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3815
3816        switch (type) {
3817        case SVM_EXITINTINFO_TYPE_NMI:
3818                svm->vcpu.arch.nmi_injected = true;
3819                break;
3820        case SVM_EXITINTINFO_TYPE_EXEPT:
3821                /*
3822                 * In case of software exceptions, do not reinject the vector,
3823                 * but re-execute the instruction instead. Rewind RIP first
3824                 * if we emulated INT3 before.
3825                 */
3826                if (kvm_exception_is_soft(vector)) {
3827                        if (vector == BP_VECTOR && int3_injected &&
3828                            kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
3829                                kvm_rip_write(&svm->vcpu,
3830                                              kvm_rip_read(&svm->vcpu) -
3831                                              int3_injected);
3832                        break;
3833                }
3834                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3835                        u32 err = svm->vmcb->control.exit_int_info_err;
3836                        kvm_requeue_exception_e(&svm->vcpu, vector, err);
3837
3838                } else
3839                        kvm_requeue_exception(&svm->vcpu, vector);
3840                break;
3841        case SVM_EXITINTINFO_TYPE_INTR:
3842                kvm_queue_interrupt(&svm->vcpu, vector, false);
3843                break;
3844        default:
3845                break;
3846        }
3847}
3848
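/*
 * An aborted VM entry leaves event_inj pending; move it into exit_int_info
 * so that svm_complete_interrupts() re-queues the event for the next entry.
 */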
3849static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3850{
3851        struct vcpu_svm *svm = to_svm(vcpu);
3852        struct vmcb_control_area *control = &svm->vmcb->control;
3853
3854        control->exit_int_info = control->event_inj;
3855        control->exit_int_info_err = control->event_inj_err;
3856        control->event_inj = 0;
3857        svm_complete_interrupts(svm);
3858}
3859
3860static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3861{
3862        struct vcpu_svm *svm = to_svm(vcpu);
3863
3864        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3865        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3866        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3867
3868        /*
3869         * A vmexit emulation is required before the vcpu can be executed
3870         * again.
3871         */
3872        if (unlikely(svm->nested.exit_required))
3873                return;
3874
3875        pre_svm_run(svm);
3876
3877        sync_lapic_to_cr8(vcpu);
3878
3879        svm->vmcb->save.cr2 = vcpu->arch.cr2;
3880
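        /*
         * Clear GIF: physical interrupts and NMIs are not delivered to the
         * host until the stgi() below (an interrupt arriving while the guest
         * runs simply causes a #VMEXIT), so enabling IF here is safe.
         */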
3881        clgi();
3882
3883        local_irq_enable();
3884
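        /*
         * rAX holds the VMCB physical address for VMLOAD/VMRUN/VMSAVE.  The
         * guest's other GPRs are loaded from and saved back to
         * vcpu->arch.regs; guest rAX/rSP/rIP live in the VMCB save area.
         */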
3885        asm volatile (
3886                "push %%" _ASM_BP "; \n\t"
3887                "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
3888                "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
3889                "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
3890                "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
3891                "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
3892                "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
3893#ifdef CONFIG_X86_64
3894                "mov %c[r8](%[svm]),  %%r8  \n\t"
3895                "mov %c[r9](%[svm]),  %%r9  \n\t"
3896                "mov %c[r10](%[svm]), %%r10 \n\t"
3897                "mov %c[r11](%[svm]), %%r11 \n\t"
3898                "mov %c[r12](%[svm]), %%r12 \n\t"
3899                "mov %c[r13](%[svm]), %%r13 \n\t"
3900                "mov %c[r14](%[svm]), %%r14 \n\t"
3901                "mov %c[r15](%[svm]), %%r15 \n\t"
3902#endif
3903
3904                /* Enter guest mode */
3905                "push %%" _ASM_AX " \n\t"
3906                "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
3907                __ex(SVM_VMLOAD) "\n\t"
3908                __ex(SVM_VMRUN) "\n\t"
3909                __ex(SVM_VMSAVE) "\n\t"
3910                "pop %%" _ASM_AX " \n\t"
3911
3912                /* Save guest registers, load host registers */
3913                "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
3914                "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
3915                "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
3916                "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
3917                "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
3918                "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
3919#ifdef CONFIG_X86_64
3920                "mov %%r8,  %c[r8](%[svm]) \n\t"
3921                "mov %%r9,  %c[r9](%[svm]) \n\t"
3922                "mov %%r10, %c[r10](%[svm]) \n\t"
3923                "mov %%r11, %c[r11](%[svm]) \n\t"
3924                "mov %%r12, %c[r12](%[svm]) \n\t"
3925                "mov %%r13, %c[r13](%[svm]) \n\t"
3926                "mov %%r14, %c[r14](%[svm]) \n\t"
3927                "mov %%r15, %c[r15](%[svm]) \n\t"
3928#endif
3929                "pop %%" _ASM_BP
3930                :
3931                : [svm]"a"(svm),
3932                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
3933                  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
3934                  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
3935                  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
3936                  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
3937                  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
3938                  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
3939#ifdef CONFIG_X86_64
3940                  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
3941                  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
3942                  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
3943                  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
3944                  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
3945                  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
3946                  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
3947                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
3948#endif
3949                : "cc", "memory"
3950#ifdef CONFIG_X86_64
3951                , "rbx", "rcx", "rdx", "rsi", "rdi"
3952                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
3953#else
3954                , "ebx", "ecx", "edx", "esi", "edi"
3955#endif
3956                );
3957
3958#ifdef CONFIG_X86_64
3959        wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3960#else
3961        loadsegment(fs, svm->host.fs);
3962#ifndef CONFIG_X86_32_LAZY_GS
3963        loadsegment(gs, svm->host.gs);
3964#endif
3965#endif
3966
3967        reload_tss(vcpu);
3968
3969        local_irq_disable();
3970
3971        vcpu->arch.cr2 = svm->vmcb->save.cr2;
3972        vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3973        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3974        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3975
3976        trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
3977
3978        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3979                kvm_before_handle_nmi(&svm->vcpu);
3980
3981        stgi();
3982
3983        /* Any pending NMI will happen here */
3984
3985        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3986                kvm_after_handle_nmi(&svm->vcpu);
3987
3988        sync_cr8_to_lapic(vcpu);
3989
3990        svm->next_rip = 0;
3991
3992        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3993
3994        /* if exit due to PF check for async PF */
3995        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3996                svm->apf_reason = kvm_read_and_reset_pf_reason();
3997
3998        if (npt_enabled) {
3999                vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
4000                vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
4001        }
4002
4003        /*
4004         * We need to handle MC intercepts here before the vcpu has a chance to
4005         * change the physical cpu
4006         */
4007        if (unlikely(svm->vmcb->control.exit_code ==
4008                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
4009                svm_handle_mce(svm);
4010
4011        mark_all_clean(svm->vmcb);
4012}
4013
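/*
 * Without nested paging, save.cr3 holds the shadow page-table root built by
 * the MMU; with nested paging, set_tdp_cr3() below programs nested_cr3 and
 * keeps save.cr3 in sync with the guest's CR3.
 */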
4014static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4015{
4016        struct vcpu_svm *svm = to_svm(vcpu);
4017
4018        svm->vmcb->save.cr3 = root;
4019        mark_dirty(svm->vmcb, VMCB_CR);
4020        svm_flush_tlb(vcpu);
4021}
4022
4023static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4024{
4025        struct vcpu_svm *svm = to_svm(vcpu);
4026
4027        svm->vmcb->control.nested_cr3 = root;
4028        mark_dirty(svm->vmcb, VMCB_NPT);
4029
4030        /* Also sync guest cr3 here in case we live migrate */
4031        svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
4032        mark_dirty(svm->vmcb, VMCB_CR);
4033
4034        svm_flush_tlb(vcpu);
4035}
4036
4037static int is_disabled(void)
4038{
4039        u64 vm_cr;
4040
4041        rdmsrl(MSR_VM_CR, vm_cr);
4042        if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4043                return 1;
4044
4045        return 0;
4046}
4047
4048static void
4049svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4050{
4051        /*
4052         * Patch in the VMMCALL instruction:
4053         */
4054        hypercall[0] = 0x0f;
4055        hypercall[1] = 0x01;
4056        hypercall[2] = 0xd9;
4057}
4058
4059static void svm_check_processor_compat(void *rtn)
4060{
4061        *(int *)rtn = 0;
4062}
4063
4064static bool svm_cpu_has_accelerated_tpr(void)
4065{
4066        return false;
4067}
4068
4069static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4070{
4071        return 0;
4072}
4073
4074static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4075{
4076}
4077
4078static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4079{
4080        switch (func) {
4081        case 0x80000001:
4082                if (nested)
4083                        entry->ecx |= (1 << 2); /* Set SVM bit */
4084                break;
4085        case 0x8000000A:
4086                entry->eax = 1; /* SVM revision 1 */
4087                entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
4088                                   ASID emulation to nested SVM */
4089                entry->ecx = 0; /* Reserved */
4090                entry->edx = 0; /* By default do not support any
4091                                   additional features */
4092
4093                /* Support next_rip if host supports it */
4094                if (boot_cpu_has(X86_FEATURE_NRIPS))
4095                        entry->edx |= SVM_FEATURE_NRIP;
4096
4097                /* Support NPT for the guest if enabled */
4098                if (npt_enabled)
4099                        entry->edx |= SVM_FEATURE_NPT;
4100
4101                break;
4102        }
4103}
4104
4105static int svm_get_lpage_level(void)
4106{
4107        return PT_PDPE_LEVEL;
4108}
4109
4110static bool svm_rdtscp_supported(void)
4111{
4112        return false;
4113}
4114
4115static bool svm_invpcid_supported(void)
4116{
4117        return false;
4118}
4119
4120static bool svm_mpx_supported(void)
4121{
4122        return false;
4123}
4124
4125static bool svm_xsaves_supported(void)
4126{
4127        return false;
4128}
4129
4130static bool svm_has_wbinvd_exit(void)
4131{
4132        return true;
4133}
4134
4135static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4136{
4137        struct vcpu_svm *svm = to_svm(vcpu);
4138
4139        set_exception_intercept(svm, NM_VECTOR);
4140        update_cr0_intercept(svm);
4141}
4142
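/*
 * Map emulator intercept ids (and the stage at which each must be checked)
 * to SVM exit codes, so svm_check_intercept() can ask whether the nested
 * guest's L1 hypervisor intercepts the instruction being emulated.
 */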
4143#define PRE_EX(exit)  { .exit_code = (exit), \
4144                        .stage = X86_ICPT_PRE_EXCEPT, }
4145#define POST_EX(exit) { .exit_code = (exit), \
4146                        .stage = X86_ICPT_POST_EXCEPT, }
4147#define POST_MEM(exit) { .exit_code = (exit), \
4148                        .stage = X86_ICPT_POST_MEMACCESS, }
4149
4150static const struct __x86_intercept {
4151        u32 exit_code;
4152        enum x86_intercept_stage stage;
4153} x86_intercept_map[] = {
4154        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4155        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4156        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4157        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4158        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4159        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4160        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4161        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4162        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4163        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4164        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4165        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4166        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4167        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4168        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4169        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4170        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4171        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4172        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4173        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4174        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4175        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4176        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4177        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4178        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4179        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4180        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4181        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4182        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4183        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4184        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4185        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4186        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4187        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4188        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4189        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4190        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4191        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4192        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4193        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4194        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4195        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4196        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4197        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4198        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4199        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4200};
4201
4202#undef PRE_EX
4203#undef POST_EX
4204#undef POST_MEM
4205
4206static int svm_check_intercept(struct kvm_vcpu *vcpu,
4207                               struct x86_instruction_info *info,
4208                               enum x86_intercept_stage stage)
4209{
4210        struct vcpu_svm *svm = to_svm(vcpu);
4211        int vmexit, ret = X86EMUL_CONTINUE;
4212        struct __x86_intercept icpt_info;
4213        struct vmcb *vmcb = svm->vmcb;
4214
4215        if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4216                goto out;
4217
4218        icpt_info = x86_intercept_map[info->intercept];
4219
4220        if (stage != icpt_info.stage)
4221                goto out;
4222
4223        switch (icpt_info.exit_code) {
4224        case SVM_EXIT_READ_CR0:
4225                if (info->intercept == x86_intercept_cr_read)
4226                        icpt_info.exit_code += info->modrm_reg;
4227                break;
4228        case SVM_EXIT_WRITE_CR0: {
4229                unsigned long cr0, val;
4230                u64 intercept;
4231
4232                if (info->intercept == x86_intercept_cr_write)
4233                        icpt_info.exit_code += info->modrm_reg;
4234
4235                if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4236                    info->intercept == x86_intercept_clts)
4237                        break;
4238
4239                intercept = svm->nested.intercept;
4240
4241                if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4242                        break;
4243
4244                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4245                val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4246
4247                if (info->intercept == x86_intercept_lmsw) {
4248                        cr0 &= 0xfUL;
4249                        val &= 0xfUL;
4250                        /* lmsw can't clear PE - catch this here */
4251                        if (cr0 & X86_CR0_PE)
4252                                val |= X86_CR0_PE;
4253                }
4254
4255                if (cr0 ^ val)
4256                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4257
4258                break;
4259        }
4260        case SVM_EXIT_READ_DR0:
4261        case SVM_EXIT_WRITE_DR0:
4262                icpt_info.exit_code += info->modrm_reg;
4263                break;
4264        case SVM_EXIT_MSR:
4265                if (info->intercept == x86_intercept_wrmsr)
4266                        vmcb->control.exit_info_1 = 1;
4267                else
4268                        vmcb->control.exit_info_1 = 0;
4269                break;
4270        case SVM_EXIT_PAUSE:
4271                /*
4272                 * We get this for NOP only, but PAUSE is encoded as
4273                 * REP NOP, so check for the REP prefix here
4274                 */
4275                if (info->rep_prefix != REPE_PREFIX)
4276                        goto out;
                break;
4277        case SVM_EXIT_IOIO: {
4278                u64 exit_info;
4279                u32 bytes;
4280
4281                if (info->intercept == x86_intercept_in ||
4282                    info->intercept == x86_intercept_ins) {
4283                        exit_info = ((info->src_val & 0xffff) << 16) |
4284                                SVM_IOIO_TYPE_MASK;
4285                        bytes = info->dst_bytes;
4286                } else {
4287                        exit_info = (info->dst_val & 0xffff) << 16;
4288                        bytes = info->src_bytes;
4289                }
4290
4291                if (info->intercept == x86_intercept_outs ||
4292                    info->intercept == x86_intercept_ins)
4293                        exit_info |= SVM_IOIO_STR_MASK;
4294
4295                if (info->rep_prefix)
4296                        exit_info |= SVM_IOIO_REP_MASK;
4297
4298                bytes = min(bytes, 4u);
4299
4300                exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4301
4302                exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4303
4304                vmcb->control.exit_info_1 = exit_info;
4305                vmcb->control.exit_info_2 = info->next_rip;
4306
4307                break;
4308        }
4309        default:
4310                break;
4311        }
4312
4313        vmcb->control.next_rip  = info->next_rip;
4314        vmcb->control.exit_code = icpt_info.exit_code;
4315        vmexit = nested_svm_exit_handled(svm);
4316
4317        ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4318                                           : X86EMUL_CONTINUE;
4319
4320out:
4321        return ret;
4322}
4323
4324static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
4325{
4326        local_irq_enable();
4327}
4328
4329static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4330{
4331}
4332
4333static struct kvm_x86_ops svm_x86_ops = {
4334        .cpu_has_kvm_support = has_svm,
4335        .disabled_by_bios = is_disabled,
4336        .hardware_setup = svm_hardware_setup,
4337        .hardware_unsetup = svm_hardware_unsetup,
4338        .check_processor_compatibility = svm_check_processor_compat,
4339        .hardware_enable = svm_hardware_enable,
4340        .hardware_disable = svm_hardware_disable,
4341        .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4342
4343        .vcpu_create = svm_create_vcpu,
4344        .vcpu_free = svm_free_vcpu,
4345        .vcpu_reset = svm_vcpu_reset,
4346
4347        .prepare_guest_switch = svm_prepare_guest_switch,
4348        .vcpu_load = svm_vcpu_load,
4349        .vcpu_put = svm_vcpu_put,
4350
4351        .update_db_bp_intercept = update_db_bp_intercept,
4352        .get_msr = svm_get_msr,
4353        .set_msr = svm_set_msr,
4354        .get_segment_base = svm_get_segment_base,
4355        .get_segment = svm_get_segment,
4356        .set_segment = svm_set_segment,
4357        .get_cpl = svm_get_cpl,
4358        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4359        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4360        .decache_cr3 = svm_decache_cr3,
4361        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
4362        .set_cr0 = svm_set_cr0,
4363        .set_cr3 = svm_set_cr3,
4364        .set_cr4 = svm_set_cr4,
4365        .set_efer = svm_set_efer,
4366        .get_idt = svm_get_idt,
4367        .set_idt = svm_set_idt,
4368        .get_gdt = svm_get_gdt,
4369        .set_gdt = svm_set_gdt,
4370        .get_dr6 = svm_get_dr6,
4371        .set_dr6 = svm_set_dr6,
4372        .set_dr7 = svm_set_dr7,
4373        .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4374        .cache_reg = svm_cache_reg,
4375        .get_rflags = svm_get_rflags,
4376        .set_rflags = svm_set_rflags,
4377        .fpu_deactivate = svm_fpu_deactivate,
4378
4379        .tlb_flush = svm_flush_tlb,
4380
4381        .run = svm_vcpu_run,
4382        .handle_exit = handle_exit,
4383        .skip_emulated_instruction = skip_emulated_instruction,
4384        .set_interrupt_shadow = svm_set_interrupt_shadow,
4385        .get_interrupt_shadow = svm_get_interrupt_shadow,
4386        .patch_hypercall = svm_patch_hypercall,
4387        .set_irq = svm_set_irq,
4388        .set_nmi = svm_inject_nmi,
4389        .queue_exception = svm_queue_exception,
4390        .cancel_injection = svm_cancel_injection,
4391        .interrupt_allowed = svm_interrupt_allowed,
4392        .nmi_allowed = svm_nmi_allowed,
4393        .get_nmi_mask = svm_get_nmi_mask,
4394        .set_nmi_mask = svm_set_nmi_mask,
4395        .enable_nmi_window = enable_nmi_window,
4396        .enable_irq_window = enable_irq_window,
4397        .update_cr8_intercept = update_cr8_intercept,
4398        .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
4399        .vm_has_apicv = svm_vm_has_apicv,
4400        .load_eoi_exitmap = svm_load_eoi_exitmap,
4401        .sync_pir_to_irr = svm_sync_pir_to_irr,
4402
4403        .set_tss_addr = svm_set_tss_addr,
4404        .get_tdp_level = get_npt_level,
4405        .get_mt_mask = svm_get_mt_mask,
4406
4407        .get_exit_info = svm_get_exit_info,
4408
4409        .get_lpage_level = svm_get_lpage_level,
4410
4411        .cpuid_update = svm_cpuid_update,
4412
4413        .rdtscp_supported = svm_rdtscp_supported,
4414        .invpcid_supported = svm_invpcid_supported,
4415        .mpx_supported = svm_mpx_supported,
4416        .xsaves_supported = svm_xsaves_supported,
4417
4418        .set_supported_cpuid = svm_set_supported_cpuid,
4419
4420        .has_wbinvd_exit = svm_has_wbinvd_exit,
4421
4422        .set_tsc_khz = svm_set_tsc_khz,
4423        .read_tsc_offset = svm_read_tsc_offset,
4424        .write_tsc_offset = svm_write_tsc_offset,
4425        .adjust_tsc_offset = svm_adjust_tsc_offset,
4426        .compute_tsc_offset = svm_compute_tsc_offset,
4427        .read_l1_tsc = svm_read_l1_tsc,
4428
4429        .set_tdp_cr3 = set_tdp_cr3,
4430
4431        .check_intercept = svm_check_intercept,
4432        .handle_external_intr = svm_handle_external_intr,
4433
4434        .sched_in = svm_sched_in,
4435};
4436
4437static int __init svm_init(void)
4438{
4439        return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
4440                        __alignof__(struct vcpu_svm), THIS_MODULE);
4441}
4442
4443static void __exit svm_exit(void)
4444{
4445        kvm_exit();
4446}
4447
4448module_init(svm_init)
4449module_exit(svm_exit)
4450