linux/arch/x86/kvm/svm.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * AMD SVM support
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   8 *
   9 * Authors:
  10 *   Yaniv Kamay  <yaniv@qumranet.com>
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17#include <linux/kvm_host.h>
  18
  19#include "irq.h"
  20#include "mmu.h"
  21#include "kvm_cache_regs.h"
  22#include "x86.h"
  23#include "cpuid.h"
  24#include "pmu.h"
  25
  26#include <linux/module.h>
  27#include <linux/mod_devicetable.h>
  28#include <linux/kernel.h>
  29#include <linux/vmalloc.h>
  30#include <linux/highmem.h>
  31#include <linux/sched.h>
  32#include <linux/trace_events.h>
  33#include <linux/slab.h>
  34
  35#include <asm/perf_event.h>
  36#include <asm/tlbflush.h>
  37#include <asm/desc.h>
  38#include <asm/debugreg.h>
  39#include <asm/kvm_para.h>
  40
  41#include <asm/virtext.h>
  42#include "trace.h"
  43
  44#define __ex(x) __kvm_handle_fault_on_reboot(x)
  45
  46MODULE_AUTHOR("Qumranet");
  47MODULE_LICENSE("GPL");
  48
  49static const struct x86_cpu_id svm_cpu_id[] = {
  50        X86_FEATURE_MATCH(X86_FEATURE_SVM),
  51        {}
  52};
  53MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  54
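/*
 * Note: the SVM I/O permission map is 12 KB and the MSR permission map is
 * 8 KB; both must be physically contiguous, hence the order-2 (16 KB) and
 * order-1 (8 KB) page allocations used below.
 */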
  55#define IOPM_ALLOC_ORDER 2
  56#define MSRPM_ALLOC_ORDER 1
  57
  58#define SEG_TYPE_LDT 2
  59#define SEG_TYPE_BUSY_TSS16 3
  60
  61#define SVM_FEATURE_NPT            (1 <<  0)
  62#define SVM_FEATURE_LBRV           (1 <<  1)
  63#define SVM_FEATURE_SVML           (1 <<  2)
  64#define SVM_FEATURE_NRIP           (1 <<  3)
  65#define SVM_FEATURE_TSC_RATE       (1 <<  4)
  66#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
  67#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
  68#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  69#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  70
  71#define NESTED_EXIT_HOST        0       /* Exit handled on host level */
  72#define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
  73#define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
  74
  75#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
  76
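/*
 * MSR_AMD64_TSC_RATIO holds an 8.32 fixed-point multiplier: bits 39:32 are
 * the integer part, bits 31:0 the fraction, and bits 63:40 are reserved
 * (TSC_RATIO_RSVD).  A value of 0x0100000000 therefore means "multiply the
 * host TSC by 1.0" (see TSC_RATIO_DEFAULT below).
 */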
  77#define TSC_RATIO_RSVD          0xffffff0000000000ULL
  78#define TSC_RATIO_MIN           0x0000000000000001ULL
  79#define TSC_RATIO_MAX           0x000000ffffffffffULL
  80
  81static bool erratum_383_found __read_mostly;
  82
  83static const u32 host_save_user_msrs[] = {
  84#ifdef CONFIG_X86_64
  85        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
  86        MSR_FS_BASE,
  87#endif
  88        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
  89};
  90
  91#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
  92
  93struct kvm_vcpu;
  94
  95struct nested_state {
  96        struct vmcb *hsave;
  97        u64 hsave_msr;
  98        u64 vm_cr_msr;
  99        u64 vmcb;
 100
 101        /* These are the merged vectors */
 102        u32 *msrpm;
 103
 104        /* gpa pointers to the real vectors */
 105        u64 vmcb_msrpm;
 106        u64 vmcb_iopm;
 107
 108        /* A VMEXIT is required but not yet emulated */
 109        bool exit_required;
 110
 111        /* cache for intercepts of the guest */
 112        u32 intercept_cr;
 113        u32 intercept_dr;
 114        u32 intercept_exceptions;
 115        u64 intercept;
 116
 117        /* Nested Paging related state */
 118        u64 nested_cr3;
 119};
 120
 121#define MSRPM_OFFSETS   16
 122static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 123
 124/*
 125 * Set osvw_len to higher value when updated Revision Guides
 126 * are published and we know what the new status bits are
 127 */
 128static uint64_t osvw_len = 4, osvw_status;
 129
 130struct vcpu_svm {
 131        struct kvm_vcpu vcpu;
 132        struct vmcb *vmcb;
 133        unsigned long vmcb_pa;
 134        struct svm_cpu_data *svm_data;
 135        uint64_t asid_generation;
 136        uint64_t sysenter_esp;
 137        uint64_t sysenter_eip;
 138
 139        u64 next_rip;
 140
 141        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 142        struct {
 143                u16 fs;
 144                u16 gs;
 145                u16 ldt;
 146                u64 gs_base;
 147        } host;
 148
 149        u32 *msrpm;
 150
 151        ulong nmi_iret_rip;
 152
 153        struct nested_state nested;
 154
 155        bool nmi_singlestep;
 156
 157        unsigned int3_injected;
 158        unsigned long int3_rip;
 159        u32 apf_reason;
 160
 161        u64  tsc_ratio;
 162};
 163
 164static DEFINE_PER_CPU(u64, current_tsc_ratio);
 165#define TSC_RATIO_DEFAULT       0x0100000000ULL
 166
 167#define MSR_INVALID                     0xffffffffU
 168
 169static const struct svm_direct_access_msrs {
 170        u32 index;   /* Index of the MSR */
 171        bool always; /* True if intercept is always on */
 172} direct_access_msrs[] = {
 173        { .index = MSR_STAR,                            .always = true  },
 174        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
 175#ifdef CONFIG_X86_64
 176        { .index = MSR_GS_BASE,                         .always = true  },
 177        { .index = MSR_FS_BASE,                         .always = true  },
 178        { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
 179        { .index = MSR_LSTAR,                           .always = true  },
 180        { .index = MSR_CSTAR,                           .always = true  },
 181        { .index = MSR_SYSCALL_MASK,                    .always = true  },
 182#endif
 183        { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 184        { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 185        { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 186        { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 187        { .index = MSR_INVALID,                         .always = false },
 188};
 189
 190/* enable NPT for AMD64 and X86 with PAE */
 191#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 192static bool npt_enabled = true;
 193#else
 194static bool npt_enabled;
 195#endif
 196
 197/* allow nested paging (virtualized MMU) for all guests */
 198static int npt = true;
 199module_param(npt, int, S_IRUGO);
 200
 201/* allow nested virtualization in KVM/SVM */
 202static int nested = true;
 203module_param(nested, int, S_IRUGO);
 204
 205static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 206static void svm_complete_interrupts(struct vcpu_svm *svm);
 207
 208static int nested_svm_exit_handled(struct vcpu_svm *svm);
 209static int nested_svm_intercept(struct vcpu_svm *svm);
 210static int nested_svm_vmexit(struct vcpu_svm *svm);
 211static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 212                                      bool has_error_code, u32 error_code);
 213static u64 __scale_tsc(u64 ratio, u64 tsc);
 214
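/*
 * VMCB "clean" bits: each value below names a group of VMCB fields.  When
 * the corresponding bit is set in vmcb->control.clean, the CPU may assume
 * that group has not changed since the last VMRUN and skip reloading it.
 * Software must therefore clear the bit (mark_dirty) whenever it touches
 * any field in that group; mark_all_dirty() and mark_all_clean() below
 * manage the whole mask.
 */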
 215enum {
 216        VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 217                            pause filter count */
 218        VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 219        VMCB_ASID,       /* ASID */
 220        VMCB_INTR,       /* int_ctl, int_vector */
 221        VMCB_NPT,        /* npt_en, nCR3, gPAT */
 222        VMCB_CR,         /* CR0, CR3, CR4, EFER */
 223        VMCB_DR,         /* DR6, DR7 */
 224        VMCB_DT,         /* GDT, IDT */
 225        VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 226        VMCB_CR2,        /* CR2 only */
 227        VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
 228        VMCB_DIRTY_MAX,
 229};
 230
 231/* TPR and CR2 are always written before VMRUN */
 232#define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
 233
 234static inline void mark_all_dirty(struct vmcb *vmcb)
 235{
 236        vmcb->control.clean = 0;
 237}
 238
 239static inline void mark_all_clean(struct vmcb *vmcb)
 240{
 241        vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
 242                               & ~VMCB_ALWAYS_DIRTY_MASK;
 243}
 244
 245static inline void mark_dirty(struct vmcb *vmcb, int bit)
 246{
 247        vmcb->control.clean &= ~(1 << bit);
 248}
 249
 250static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 251{
 252        return container_of(vcpu, struct vcpu_svm, vcpu);
 253}
 254
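/*
 * While a nested guest is running, svm->vmcb holds the intercepts of the
 * nested hypervisor merged with KVM's own.  KVM's intercepts live in the
 * host-save VMCB (nested.hsave), which is what get_host_vmcb() returns in
 * guest mode; the set_ and clr_ helpers below always edit that copy and
 * then recompute the merged set here.
 */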
 255static void recalc_intercepts(struct vcpu_svm *svm)
 256{
 257        struct vmcb_control_area *c, *h;
 258        struct nested_state *g;
 259
 260        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 261
 262        if (!is_guest_mode(&svm->vcpu))
 263                return;
 264
 265        c = &svm->vmcb->control;
 266        h = &svm->nested.hsave->control;
 267        g = &svm->nested;
 268
 269        c->intercept_cr = h->intercept_cr | g->intercept_cr;
 270        c->intercept_dr = h->intercept_dr | g->intercept_dr;
 271        c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 272        c->intercept = h->intercept | g->intercept;
 273}
 274
 275static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
 276{
 277        if (is_guest_mode(&svm->vcpu))
 278                return svm->nested.hsave;
 279        else
 280                return svm->vmcb;
 281}
 282
 283static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
 284{
 285        struct vmcb *vmcb = get_host_vmcb(svm);
 286
 287        vmcb->control.intercept_cr |= (1U << bit);
 288
 289        recalc_intercepts(svm);
 290}
 291
 292static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
 293{
 294        struct vmcb *vmcb = get_host_vmcb(svm);
 295
 296        vmcb->control.intercept_cr &= ~(1U << bit);
 297
 298        recalc_intercepts(svm);
 299}
 300
 301static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 302{
 303        struct vmcb *vmcb = get_host_vmcb(svm);
 304
 305        return vmcb->control.intercept_cr & (1U << bit);
 306}
 307
 308static inline void set_dr_intercepts(struct vcpu_svm *svm)
 309{
 310        struct vmcb *vmcb = get_host_vmcb(svm);
 311
 312        vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
 313                | (1 << INTERCEPT_DR1_READ)
 314                | (1 << INTERCEPT_DR2_READ)
 315                | (1 << INTERCEPT_DR3_READ)
 316                | (1 << INTERCEPT_DR4_READ)
 317                | (1 << INTERCEPT_DR5_READ)
 318                | (1 << INTERCEPT_DR6_READ)
 319                | (1 << INTERCEPT_DR7_READ)
 320                | (1 << INTERCEPT_DR0_WRITE)
 321                | (1 << INTERCEPT_DR1_WRITE)
 322                | (1 << INTERCEPT_DR2_WRITE)
 323                | (1 << INTERCEPT_DR3_WRITE)
 324                | (1 << INTERCEPT_DR4_WRITE)
 325                | (1 << INTERCEPT_DR5_WRITE)
 326                | (1 << INTERCEPT_DR6_WRITE)
 327                | (1 << INTERCEPT_DR7_WRITE);
 328
 329        recalc_intercepts(svm);
 330}
 331
 332static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 333{
 334        struct vmcb *vmcb = get_host_vmcb(svm);
 335
 336        vmcb->control.intercept_dr = 0;
 337
 338        recalc_intercepts(svm);
 339}
 340
 341static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
 342{
 343        struct vmcb *vmcb = get_host_vmcb(svm);
 344
 345        vmcb->control.intercept_exceptions |= (1U << bit);
 346
 347        recalc_intercepts(svm);
 348}
 349
 350static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 351{
 352        struct vmcb *vmcb = get_host_vmcb(svm);
 353
 354        vmcb->control.intercept_exceptions &= ~(1U << bit);
 355
 356        recalc_intercepts(svm);
 357}
 358
 359static inline void set_intercept(struct vcpu_svm *svm, int bit)
 360{
 361        struct vmcb *vmcb = get_host_vmcb(svm);
 362
 363        vmcb->control.intercept |= (1ULL << bit);
 364
 365        recalc_intercepts(svm);
 366}
 367
 368static inline void clr_intercept(struct vcpu_svm *svm, int bit)
 369{
 370        struct vmcb *vmcb = get_host_vmcb(svm);
 371
 372        vmcb->control.intercept &= ~(1ULL << bit);
 373
 374        recalc_intercepts(svm);
 375}
 376
 377static inline void enable_gif(struct vcpu_svm *svm)
 378{
 379        svm->vcpu.arch.hflags |= HF_GIF_MASK;
 380}
 381
 382static inline void disable_gif(struct vcpu_svm *svm)
 383{
 384        svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
 385}
 386
 387static inline bool gif_set(struct vcpu_svm *svm)
 388{
 389        return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 390}
 391
 392static unsigned long iopm_base;
 393
 394struct kvm_ldttss_desc {
 395        u16 limit0;
 396        u16 base0;
 397        unsigned base1:8, type:5, dpl:2, p:1;
 398        unsigned limit1:4, zero0:3, g:1, base2:8;
 399        u32 base3;
 400        u32 zero1;
 401} __attribute__((packed));
 402
 403struct svm_cpu_data {
 404        int cpu;
 405
 406        u64 asid_generation;
 407        u32 max_asid;
 408        u32 next_asid;
 409        struct kvm_ldttss_desc *tss_desc;
 410
 411        struct page *save_area;
 412};
 413
 414static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 415
 416struct svm_init_data {
 417        int cpu;
 418        int r;
 419};
 420
 421static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 422
 423#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 424#define MSRS_RANGE_SIZE 2048
 425#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 426
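/*
 * The MSR permission map is made up of 2 KB ranges, one per entry in
 * msrpm_ranges[].  Each range covers MSRS_IN_RANGE (8192) MSRs using two
 * bits per MSR: an even "intercept read" bit and an odd "intercept write"
 * bit.  This helper returns the offset, in u32 units, of the word holding
 * the permission bits for @msr, or MSR_INVALID if the MSR is not covered
 * by the map at all.
 */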
 427static u32 svm_msrpm_offset(u32 msr)
 428{
 429        u32 offset;
 430        int i;
 431
 432        for (i = 0; i < NUM_MSR_MAPS; i++) {
 433                if (msr < msrpm_ranges[i] ||
 434                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 435                        continue;
 436
 437                offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 438                offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 439
 440                /* Now we have the u8 offset - but need the u32 offset */
 441                return offset / 4;
 442        }
 443
 444        /* MSR not in any range */
 445        return MSR_INVALID;
 446}
 447
 448#define MAX_INST_SIZE 15
 449
 450static inline void clgi(void)
 451{
 452        asm volatile (__ex(SVM_CLGI));
 453}
 454
 455static inline void stgi(void)
 456{
 457        asm volatile (__ex(SVM_STGI));
 458}
 459
 460static inline void invlpga(unsigned long addr, u32 asid)
 461{
 462        asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 463}
 464
 465static int get_npt_level(void)
 466{
 467#ifdef CONFIG_X86_64
 468        return PT64_ROOT_LEVEL;
 469#else
 470        return PT32E_ROOT_LEVEL;
 471#endif
 472}
 473
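/*
 * Note: EFER.SVME is forced on in the VMCB copy of EFER so the guest state
 * stays valid for VMRUN; the EFER value the guest actually sees is tracked
 * separately in vcpu->arch.efer.
 */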
 474static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 475{
 476        vcpu->arch.efer = efer;
 477        if (!npt_enabled && !(efer & EFER_LMA))
 478                efer &= ~EFER_LME;
 479
 480        to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
 481        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 482}
 483
 484static int is_external_interrupt(u32 info)
 485{
 486        info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 487        return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 488}
 489
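/*
 * The VMCB tracks only a single interrupt-shadow flag, so SVM cannot tell
 * an STI shadow apart from a MOV-SS shadow; report both to common code
 * when the flag is set, and fold both back into the one bit when setting.
 */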
 490static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 491{
 492        struct vcpu_svm *svm = to_svm(vcpu);
 493        u32 ret = 0;
 494
 495        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 496                ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 497        return ret;
 498}
 499
 500static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 501{
 502        struct vcpu_svm *svm = to_svm(vcpu);
 503
 504        if (mask == 0)
 505                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 506        else
 507                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 508
 509}
 510
 511static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 512{
 513        struct vcpu_svm *svm = to_svm(vcpu);
 514
 515        if (svm->vmcb->control.next_rip != 0) {
 516                WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
 517                svm->next_rip = svm->vmcb->control.next_rip;
 518        }
 519
 520        if (!svm->next_rip) {
 521                if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
 522                                EMULATE_DONE)
 523                        printk(KERN_DEBUG "%s: NOP\n", __func__);
 524                return;
 525        }
 526        if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
 527                printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
 528                       __func__, kvm_rip_read(vcpu), svm->next_rip);
 529
 530        kvm_rip_write(vcpu, svm->next_rip);
 531        svm_set_interrupt_shadow(vcpu, 0);
 532}
 533
 534static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 535                                bool has_error_code, u32 error_code,
 536                                bool reinject)
 537{
 538        struct vcpu_svm *svm = to_svm(vcpu);
 539
 540        /*
 541         * If we are within a nested VM we'd better #VMEXIT and let the guest
 542         * handle the exception
 543         */
 544        if (!reinject &&
 545            nested_svm_check_exception(svm, nr, has_error_code, error_code))
 546                return;
 547
 548        if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 549                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 550
 551                /*
 552                 * For guest debugging where we have to reinject #BP if some
 553                 * INT3 is guest-owned:
 554                 * Emulate nRIP by moving RIP forward. Will fail if injection
 555                 * raises a fault that is not intercepted. Still better than
 556                 * failing in all cases.
 557                 */
 558                skip_emulated_instruction(&svm->vcpu);
 559                rip = kvm_rip_read(&svm->vcpu);
 560                svm->int3_rip = rip + svm->vmcb->save.cs.base;
 561                svm->int3_injected = rip - old_rip;
 562        }
 563
 564        svm->vmcb->control.event_inj = nr
 565                | SVM_EVTINJ_VALID
 566                | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 567                | SVM_EVTINJ_TYPE_EXEPT;
 568        svm->vmcb->control.event_inj_err = error_code;
 569}
 570
 571static void svm_init_erratum_383(void)
 572{
 573        u32 low, high;
 574        int err;
 575        u64 val;
 576
 577        if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 578                return;
 579
 580        /* Use _safe variants to not break nested virtualization */
 581        val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 582        if (err)
 583                return;
 584
 585        val |= (1ULL << 47);
 586
 587        low  = lower_32_bits(val);
 588        high = upper_32_bits(val);
 589
 590        native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 591
 592        erratum_383_found = true;
 593}
 594
 595static void svm_init_osvw(struct kvm_vcpu *vcpu)
 596{
 597        /*
 598         * Guests should see errata 400 and 415 as fixed (assuming that
 599         * HLT and IO instructions are intercepted).
 600         */
 601        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 602        vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 603
 604        /*
 605         * By increasing VCPU's osvw.length to 3 we are telling the guest that
 606         * all osvw.status bits inside that length, including bit 0 (which is
 607         * reserved for erratum 298), are valid. However, if host processor's
 608         * osvw_len is 0 then osvw_status[0] carries no information. We need to
 609         * be conservative here and therefore we tell the guest that erratum 298
 610         * is present (because we really don't know).
 611         */
 612        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 613                vcpu->arch.osvw.status |= 1;
 614}
 615
 616static int has_svm(void)
 617{
 618        const char *msg;
 619
 620        if (!cpu_has_svm(&msg)) {
 621                printk(KERN_INFO "has_svm: %s\n", msg);
 622                return 0;
 623        }
 624
 625        return 1;
 626}
 627
 628static void svm_hardware_disable(void)
 629{
 630        /* Make sure we clean up behind us */
 631        if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
 632                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 633
 634        cpu_svm_disable();
 635
 636        amd_pmu_disable_virt();
 637}
 638
 639static int svm_hardware_enable(void)
 640{
 641
 642        struct svm_cpu_data *sd;
 643        uint64_t efer;
 644        struct desc_ptr gdt_descr;
 645        struct desc_struct *gdt;
 646        int me = raw_smp_processor_id();
 647
 648        rdmsrl(MSR_EFER, efer);
 649        if (efer & EFER_SVME)
 650                return -EBUSY;
 651
 652        if (!has_svm()) {
 653                pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 654                return -EINVAL;
 655        }
 656        sd = per_cpu(svm_data, me);
 657        if (!sd) {
 658                pr_err("%s: svm_data is NULL on %d\n", __func__, me);
 659                return -EINVAL;
 660        }
 661
 662        sd->asid_generation = 1;
 663        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 664        sd->next_asid = sd->max_asid + 1;
 665
 666        native_store_gdt(&gdt_descr);
 667        gdt = (struct desc_struct *)gdt_descr.address;
 668        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 669
 670        wrmsrl(MSR_EFER, efer | EFER_SVME);
 671
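        /*
         * MSR_VM_HSAVE_PA tells the CPU where to spill host state across
         * VMRUN/#VMEXIT; point it at this CPU's dedicated save_area page.
         */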
 672        wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 673
 674        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 675                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 676                __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
 677        }
 678
 679
 680        /*
 681         * Get OSVW bits.
 682         *
 683         * Note that it is possible to have a system with mixed processor
 684         * revisions and therefore different OSVW bits. If bits are not the same
 685         * on different processors then choose the worst case (i.e. if erratum
 686         * is present on one processor and not on another then assume that the
 687         * erratum is present everywhere).
 688         */
 689        if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 690                uint64_t len, status = 0;
 691                int err;
 692
 693                len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 694                if (!err)
 695                        status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 696                                                      &err);
 697
 698                if (err)
 699                        osvw_status = osvw_len = 0;
 700                else {
 701                        if (len < osvw_len)
 702                                osvw_len = len;
 703                        osvw_status |= status;
 704                        osvw_status &= (1ULL << osvw_len) - 1;
 705                }
 706        } else
 707                osvw_status = osvw_len = 0;
 708
 709        svm_init_erratum_383();
 710
 711        amd_pmu_enable_virt();
 712
 713        return 0;
 714}
 715
 716static void svm_cpu_uninit(int cpu)
 717{
 718        struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
 719
 720        if (!sd)
 721                return;
 722
 723        per_cpu(svm_data, raw_smp_processor_id()) = NULL;
 724        __free_page(sd->save_area);
 725        kfree(sd);
 726}
 727
 728static int svm_cpu_init(int cpu)
 729{
 730        struct svm_cpu_data *sd;
 731        int r;
 732
 733        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 734        if (!sd)
 735                return -ENOMEM;
 736        sd->cpu = cpu;
 737        sd->save_area = alloc_page(GFP_KERNEL);
 738        r = -ENOMEM;
 739        if (!sd->save_area)
 740                goto err_1;
 741
 742        per_cpu(svm_data, cpu) = sd;
 743
 744        return 0;
 745
 746err_1:
 747        kfree(sd);
 748        return r;
 749
 750}
 751
 752static bool valid_msr_intercept(u32 index)
 753{
 754        int i;
 755
 756        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 757                if (direct_access_msrs[i].index == index)
 758                        return true;
 759
 760        return false;
 761}
 762
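/*
 * Each u32 of the permission map holds the bits for 16 MSRs (two bits per
 * MSR, read bit first).  A set bit means "intercept", so passing read=1 or
 * write=1 clears the corresponding bit and gives the guest direct access.
 */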
 763static void set_msr_interception(u32 *msrpm, unsigned msr,
 764                                 int read, int write)
 765{
 766        u8 bit_read, bit_write;
 767        unsigned long tmp;
 768        u32 offset;
 769
 770        /*
 771         * If this warning triggers extend the direct_access_msrs list at the
 772         * beginning of the file
 773         */
 774        WARN_ON(!valid_msr_intercept(msr));
 775
 776        offset    = svm_msrpm_offset(msr);
 777        bit_read  = 2 * (msr & 0x0f);
 778        bit_write = 2 * (msr & 0x0f) + 1;
 779        tmp       = msrpm[offset];
 780
 781        BUG_ON(offset == MSR_INVALID);
 782
 783        read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 784        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 785
 786        msrpm[offset] = tmp;
 787}
 788
 789static void svm_vcpu_init_msrpm(u32 *msrpm)
 790{
 791        int i;
 792
 793        memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 794
 795        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 796                if (!direct_access_msrs[i].always)
 797                        continue;
 798
 799                set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
 800        }
 801}
 802
 803static void add_msr_offset(u32 offset)
 804{
 805        int i;
 806
 807        for (i = 0; i < MSRPM_OFFSETS; ++i) {
 808
 809                /* Offset already in list? */
 810                if (msrpm_offsets[i] == offset)
 811                        return;
 812
 813                /* Slot used by another offset? */
 814                if (msrpm_offsets[i] != MSR_INVALID)
 815                        continue;
 816
 817                /* Add offset to list */
 818                msrpm_offsets[i] = offset;
 819
 820                return;
 821        }
 822
 823        /*
 824         * If this BUG triggers the msrpm_offsets table has an overflow. Just
 825         * increase MSRPM_OFFSETS in this case.
 826         */
 827        BUG();
 828}
 829
 830static void init_msrpm_offsets(void)
 831{
 832        int i;
 833
 834        memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 835
 836        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 837                u32 offset;
 838
 839                offset = svm_msrpm_offset(direct_access_msrs[i].index);
 840                BUG_ON(offset == MSR_INVALID);
 841
 842                add_msr_offset(offset);
 843        }
 844}
 845
 846static void svm_enable_lbrv(struct vcpu_svm *svm)
 847{
 848        u32 *msrpm = svm->msrpm;
 849
 850        svm->vmcb->control.lbr_ctl = 1;
 851        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 852        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 853        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 854        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 855}
 856
 857static void svm_disable_lbrv(struct vcpu_svm *svm)
 858{
 859        u32 *msrpm = svm->msrpm;
 860
 861        svm->vmcb->control.lbr_ctl = 0;
 862        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 863        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 864        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 865        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 866}
 867
 868#define MTRR_TYPE_UC_MINUS      7
 869#define MTRR2PROTVAL_INVALID 0xff
 870
 871static u8 mtrr2protval[8];
 872
 873static u8 fallback_mtrr_type(int mtrr)
 874{
 875        /*
 876         * WT and WP aren't always available in the host PAT.  Treat
 877         * them as UC and UC- respectively.  Everything else should be
 878         * there.
 879         */
 880        switch (mtrr)
 881        {
 882        case MTRR_TYPE_WRTHROUGH:
 883                return MTRR_TYPE_UNCACHABLE;
 884        case MTRR_TYPE_WRPROT:
 885                return MTRR_TYPE_UC_MINUS;
 886        default:
 887                BUG();
 888        }
 889}
 890
 891static void build_mtrr2protval(void)
 892{
 893        int i;
 894        u64 pat;
 895
 896        for (i = 0; i < 8; i++)
 897                mtrr2protval[i] = MTRR2PROTVAL_INVALID;
 898
 899        /* Ignore the invalid MTRR types.  */
 900        mtrr2protval[2] = 0;
 901        mtrr2protval[3] = 0;
 902
 903        /*
 904         * Use host PAT value to figure out the mapping from guest MTRR
 905         * values to nested page table PAT/PCD/PWT values.  We do not
 906         * want to change the host PAT value every time we enter the
 907         * guest.
 908         */
 909        rdmsrl(MSR_IA32_CR_PAT, pat);
 910        for (i = 0; i < 8; i++) {
 911                u8 mtrr = pat >> (8 * i);
 912
 913                if (mtrr2protval[mtrr] == MTRR2PROTVAL_INVALID)
 914                        mtrr2protval[mtrr] = __cm_idx2pte(i);
 915        }
 916
 917        for (i = 0; i < 8; i++) {
 918                if (mtrr2protval[i] == MTRR2PROTVAL_INVALID) {
 919                        u8 fallback = fallback_mtrr_type(i);
 920                        mtrr2protval[i] = mtrr2protval[fallback];
 921                        BUG_ON(mtrr2protval[i] == MTRR2PROTVAL_INVALID);
 922                }
 923        }
 924}
 925
 926static __init int svm_hardware_setup(void)
 927{
 928        int cpu;
 929        struct page *iopm_pages;
 930        void *iopm_va;
 931        int r;
 932
 933        iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
 934
 935        if (!iopm_pages)
 936                return -ENOMEM;
 937
 938        iopm_va = page_address(iopm_pages);
 939        memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
 940        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 941
 942        init_msrpm_offsets();
 943
 944        if (boot_cpu_has(X86_FEATURE_NX))
 945                kvm_enable_efer_bits(EFER_NX);
 946
 947        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
 948                kvm_enable_efer_bits(EFER_FFXSR);
 949
 950        if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 951                u64 max;
 952
 953                kvm_has_tsc_control = true;
 954
 955                /*
 956                 * Make sure the user can only configure tsc_khz values that
 957                 * fit into a signed integer.
  958                 * A min value is not calculated because it will always
  959                 * be 1 on all machines, and a value of 0 is used to disable
  960                 * TSC scaling for the vcpu.
 961                 */
 962                max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
 963
 964                kvm_max_guest_tsc_khz = max;
 965        }
 966
 967        if (nested) {
 968                printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
 969                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 970        }
 971
 972        for_each_possible_cpu(cpu) {
 973                r = svm_cpu_init(cpu);
 974                if (r)
 975                        goto err;
 976        }
 977
 978        if (!boot_cpu_has(X86_FEATURE_NPT))
 979                npt_enabled = false;
 980
 981        if (npt_enabled && !npt) {
 982                printk(KERN_INFO "kvm: Nested Paging disabled\n");
 983                npt_enabled = false;
 984        }
 985
 986        if (npt_enabled) {
 987                printk(KERN_INFO "kvm: Nested Paging enabled\n");
 988                kvm_enable_tdp();
 989        } else
 990                kvm_disable_tdp();
 991
 992        build_mtrr2protval();
 993        return 0;
 994
 995err:
 996        __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
 997        iopm_base = 0;
 998        return r;
 999}
1000
1001static __exit void svm_hardware_unsetup(void)
1002{
1003        int cpu;
1004
1005        for_each_possible_cpu(cpu)
1006                svm_cpu_uninit(cpu);
1007
1008        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
1009        iopm_base = 0;
1010}
1011
1012static void init_seg(struct vmcb_seg *seg)
1013{
1014        seg->selector = 0;
1015        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1016                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1017        seg->limit = 0xffff;
1018        seg->base = 0;
1019}
1020
1021static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1022{
1023        seg->selector = 0;
1024        seg->attrib = SVM_SELECTOR_P_MASK | type;
1025        seg->limit = 0xffff;
1026        seg->base = 0;
1027}
1028
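/*
 * Multiply @tsc by the 8.32 fixed-point @ratio without losing the high
 * bits of the full product: handle the integer part of the ratio directly
 * and split the 64-bit TSC into 32-bit halves for the fractional part,
 * shifting the low half's partial product back down by 32.
 */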
1029static u64 __scale_tsc(u64 ratio, u64 tsc)
1030{
1031        u64 mult, frac, _tsc;
1032
1033        mult  = ratio >> 32;
1034        frac  = ratio & ((1ULL << 32) - 1);
1035
1036        _tsc  = tsc;
1037        _tsc *= mult;
1038        _tsc += (tsc >> 32) * frac;
1039        _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
1040
1041        return _tsc;
1042}
1043
1044static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
1045{
1046        struct vcpu_svm *svm = to_svm(vcpu);
1047        u64 _tsc = tsc;
1048
1049        if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
1050                _tsc = __scale_tsc(svm->tsc_ratio, tsc);
1051
1052        return _tsc;
1053}
1054
1055static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1056{
1057        struct vcpu_svm *svm = to_svm(vcpu);
1058        u64 ratio;
1059        u64 khz;
1060
1061        /* Guest TSC same frequency as host TSC? */
1062        if (!scale) {
1063                svm->tsc_ratio = TSC_RATIO_DEFAULT;
1064                return;
1065        }
1066
1067        /* TSC scaling supported? */
1068        if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1069                if (user_tsc_khz > tsc_khz) {
1070                        vcpu->arch.tsc_catchup = 1;
1071                        vcpu->arch.tsc_always_catchup = 1;
1072                } else
1073                        WARN(1, "user requested TSC rate below hardware speed\n");
1074                return;
1075        }
1076
1077        khz = user_tsc_khz;
1078
 1079        /* TSC scaling required - calculate ratio */
1080        ratio = khz << 32;
1081        do_div(ratio, tsc_khz);
1082
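        /*
         * ratio now holds user_tsc_khz / tsc_khz in the MSR's 8.32
         * fixed-point format; reject zero and anything whose integer part
         * would not fit in 8 bits (i.e. scaling the TSC up by 256x or more).
         */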
1083        if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
1084                WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
1085                                user_tsc_khz);
1086                return;
1087        }
1088        svm->tsc_ratio             = ratio;
1089}
1090
1091static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1092{
1093        struct vcpu_svm *svm = to_svm(vcpu);
1094
1095        return svm->vmcb->control.tsc_offset;
1096}
1097
1098static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1099{
1100        struct vcpu_svm *svm = to_svm(vcpu);
1101        u64 g_tsc_offset = 0;
1102
1103        if (is_guest_mode(vcpu)) {
1104                g_tsc_offset = svm->vmcb->control.tsc_offset -
1105                               svm->nested.hsave->control.tsc_offset;
1106                svm->nested.hsave->control.tsc_offset = offset;
1107        } else
1108                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1109                                           svm->vmcb->control.tsc_offset,
1110                                           offset);
1111
1112        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1113
1114        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1115}
1116
1117static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
1118{
1119        struct vcpu_svm *svm = to_svm(vcpu);
1120
1121        if (host) {
1122                if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
1123                        WARN_ON(adjustment < 0);
1124                adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
1125        }
1126
1127        svm->vmcb->control.tsc_offset += adjustment;
1128        if (is_guest_mode(vcpu))
1129                svm->nested.hsave->control.tsc_offset += adjustment;
1130        else
1131                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1132                                     svm->vmcb->control.tsc_offset - adjustment,
1133                                     svm->vmcb->control.tsc_offset);
1134
1135        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1136}
1137
1138static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1139{
1140        u64 tsc;
1141
1142        tsc = svm_scale_tsc(vcpu, native_read_tsc());
1143
1144        return target_tsc - tsc;
1145}
1146
1147static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat)
1148{
1149        struct kvm_vcpu *vcpu = &svm->vcpu;
1150
1151        /* Unlike Intel, AMD takes the guest's CR0.CD into account.
1152         *
1153         * AMD does not have IPAT.  To emulate it for the case of guests
1154         * with no assigned devices, just set everything to WB.  If guests
1155         * have assigned devices, however, we cannot force WB for RAM
1156         * pages only, so use the guest PAT directly.
1157         */
1158        if (!kvm_arch_has_assigned_device(vcpu->kvm))
1159                *g_pat = 0x0606060606060606;
1160        else
1161                *g_pat = vcpu->arch.pat;
1162}
1163
1164static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
1165{
1166        u8 mtrr;
1167
1168        /*
1169         * 1. MMIO: trust guest MTRR, so same as item 3.
1170         * 2. No passthrough: always map as WB, and force guest PAT to WB as well
1171         * 3. Passthrough: can't guarantee the result, try to trust guest.
1172         */
1173        if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm))
1174                return 0;
1175
1176        mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
1177        return mtrr2protval[mtrr];
1178}
1179
1180static void init_vmcb(struct vcpu_svm *svm, bool init_event)
1181{
1182        struct vmcb_control_area *control = &svm->vmcb->control;
1183        struct vmcb_save_area *save = &svm->vmcb->save;
1184
1185        svm->vcpu.fpu_active = 1;
1186        svm->vcpu.arch.hflags = 0;
1187
1188        set_cr_intercept(svm, INTERCEPT_CR0_READ);
1189        set_cr_intercept(svm, INTERCEPT_CR3_READ);
1190        set_cr_intercept(svm, INTERCEPT_CR4_READ);
1191        set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1192        set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1193        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1194        set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1195
1196        set_dr_intercepts(svm);
1197
1198        set_exception_intercept(svm, PF_VECTOR);
1199        set_exception_intercept(svm, UD_VECTOR);
1200        set_exception_intercept(svm, MC_VECTOR);
1201
1202        set_intercept(svm, INTERCEPT_INTR);
1203        set_intercept(svm, INTERCEPT_NMI);
1204        set_intercept(svm, INTERCEPT_SMI);
1205        set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1206        set_intercept(svm, INTERCEPT_RDPMC);
1207        set_intercept(svm, INTERCEPT_CPUID);
1208        set_intercept(svm, INTERCEPT_INVD);
1209        set_intercept(svm, INTERCEPT_HLT);
1210        set_intercept(svm, INTERCEPT_INVLPG);
1211        set_intercept(svm, INTERCEPT_INVLPGA);
1212        set_intercept(svm, INTERCEPT_IOIO_PROT);
1213        set_intercept(svm, INTERCEPT_MSR_PROT);
1214        set_intercept(svm, INTERCEPT_TASK_SWITCH);
1215        set_intercept(svm, INTERCEPT_SHUTDOWN);
1216        set_intercept(svm, INTERCEPT_VMRUN);
1217        set_intercept(svm, INTERCEPT_VMMCALL);
1218        set_intercept(svm, INTERCEPT_VMLOAD);
1219        set_intercept(svm, INTERCEPT_VMSAVE);
1220        set_intercept(svm, INTERCEPT_STGI);
1221        set_intercept(svm, INTERCEPT_CLGI);
1222        set_intercept(svm, INTERCEPT_SKINIT);
1223        set_intercept(svm, INTERCEPT_WBINVD);
1224        set_intercept(svm, INTERCEPT_MONITOR);
1225        set_intercept(svm, INTERCEPT_MWAIT);
1226        set_intercept(svm, INTERCEPT_XSETBV);
1227
1228        control->iopm_base_pa = iopm_base;
1229        control->msrpm_base_pa = __pa(svm->msrpm);
1230        control->int_ctl = V_INTR_MASKING_MASK;
1231
1232        init_seg(&save->es);
1233        init_seg(&save->ss);
1234        init_seg(&save->ds);
1235        init_seg(&save->fs);
1236        init_seg(&save->gs);
1237
1238        save->cs.selector = 0xf000;
1239        save->cs.base = 0xffff0000;
1240        /* Executable/Readable Code Segment */
1241        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1242                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1243        save->cs.limit = 0xffff;
1244
1245        save->gdtr.limit = 0xffff;
1246        save->idtr.limit = 0xffff;
1247
1248        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1249        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1250
1251        if (!init_event)
1252                svm_set_efer(&svm->vcpu, 0);
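        /*
         * Architectural reset state: DR6 reset value, RFLAGS with only the
         * always-one bit set, and CS.base:RIP pointing at the reset vector
         * 0xfffffff0.
         */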
1253        save->dr6 = 0xffff0ff0;
1254        kvm_set_rflags(&svm->vcpu, 2);
1255        save->rip = 0x0000fff0;
1256        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1257
1258        /*
1259         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1260         * It also updates the guest-visible cr0 value.
1261         */
1262        (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1263
1264        save->cr4 = X86_CR4_PAE;
1265        /* rdx = ?? */
1266
1267        if (npt_enabled) {
1268                /* Setup VMCB for Nested Paging */
1269                control->nested_ctl = 1;
1270                clr_intercept(svm, INTERCEPT_INVLPG);
1271                clr_exception_intercept(svm, PF_VECTOR);
1272                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1273                clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1274                save->g_pat = svm->vcpu.arch.pat;
1275                svm_set_guest_pat(svm, &save->g_pat);
1276                save->cr3 = 0;
1277                save->cr4 = 0;
1278        }
1279        svm->asid_generation = 0;
1280
1281        svm->nested.vmcb = 0;
1282        svm->vcpu.arch.hflags = 0;
1283
1284        if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1285                control->pause_filter_count = 3000;
1286                set_intercept(svm, INTERCEPT_PAUSE);
1287        }
1288
1289        mark_all_dirty(svm->vmcb);
1290
1291        enable_gif(svm);
1292}
1293
1294static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1295{
1296        struct vcpu_svm *svm = to_svm(vcpu);
1297        u32 dummy;
1298        u32 eax = 1;
1299
1300        if (!init_event) {
1301                svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1302                                           MSR_IA32_APICBASE_ENABLE;
1303                if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1304                        svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1305        }
1306        init_vmcb(svm, init_event);
1307
1308        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1309        kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1310}
1311
1312static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1313{
1314        struct vcpu_svm *svm;
1315        struct page *page;
1316        struct page *msrpm_pages;
1317        struct page *hsave_page;
1318        struct page *nested_msrpm_pages;
1319        int err;
1320
1321        svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1322        if (!svm) {
1323                err = -ENOMEM;
1324                goto out;
1325        }
1326
1327        svm->tsc_ratio = TSC_RATIO_DEFAULT;
1328
1329        err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1330        if (err)
1331                goto free_svm;
1332
1333        err = -ENOMEM;
1334        page = alloc_page(GFP_KERNEL);
1335        if (!page)
1336                goto uninit;
1337
1338        msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1339        if (!msrpm_pages)
1340                goto free_page1;
1341
1342        nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1343        if (!nested_msrpm_pages)
1344                goto free_page2;
1345
1346        hsave_page = alloc_page(GFP_KERNEL);
1347        if (!hsave_page)
1348                goto free_page3;
1349
1350        svm->nested.hsave = page_address(hsave_page);
1351
1352        svm->msrpm = page_address(msrpm_pages);
1353        svm_vcpu_init_msrpm(svm->msrpm);
1354
1355        svm->nested.msrpm = page_address(nested_msrpm_pages);
1356        svm_vcpu_init_msrpm(svm->nested.msrpm);
1357
1358        svm->vmcb = page_address(page);
1359        clear_page(svm->vmcb);
1360        svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1361        svm->asid_generation = 0;
1362        init_vmcb(svm, false);
1363
1364        svm_init_osvw(&svm->vcpu);
1365
1366        return &svm->vcpu;
1367
1368free_page3:
1369        __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1370free_page2:
1371        __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1372free_page1:
1373        __free_page(page);
1374uninit:
1375        kvm_vcpu_uninit(&svm->vcpu);
1376free_svm:
1377        kmem_cache_free(kvm_vcpu_cache, svm);
1378out:
1379        return ERR_PTR(err);
1380}
1381
1382static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1383{
1384        struct vcpu_svm *svm = to_svm(vcpu);
1385
1386        __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1387        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1388        __free_page(virt_to_page(svm->nested.hsave));
1389        __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1390        kvm_vcpu_uninit(vcpu);
1391        kmem_cache_free(kvm_vcpu_cache, svm);
1392}
1393
1394static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1395{
1396        struct vcpu_svm *svm = to_svm(vcpu);
1397        int i;
1398
1399        if (unlikely(cpu != vcpu->cpu)) {
1400                svm->asid_generation = 0;
1401                mark_all_dirty(svm->vmcb);
1402        }
1403
1404#ifdef CONFIG_X86_64
1405        rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1406#endif
1407        savesegment(fs, svm->host.fs);
1408        savesegment(gs, svm->host.gs);
1409        svm->host.ldt = kvm_read_ldt();
1410
1411        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1412                rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1413
1414        if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
1415            svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1416                __this_cpu_write(current_tsc_ratio, svm->tsc_ratio);
1417                wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
1418        }
1419}
1420
1421static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1422{
1423        struct vcpu_svm *svm = to_svm(vcpu);
1424        int i;
1425
1426        ++vcpu->stat.host_state_reload;
1427        kvm_load_ldt(svm->host.ldt);
1428#ifdef CONFIG_X86_64
1429        loadsegment(fs, svm->host.fs);
1430        wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1431        load_gs_index(svm->host.gs);
1432#else
1433#ifdef CONFIG_X86_32_LAZY_GS
1434        loadsegment(gs, svm->host.gs);
1435#endif
1436#endif
1437        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1438                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1439}
1440
1441static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1442{
1443        return to_svm(vcpu)->vmcb->save.rflags;
1444}
1445
1446static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1447{
1448       /*
 1449        * Any change of EFLAGS.VM is accompanied by a reload of SS
1450        * (caused by either a task switch or an inter-privilege IRET),
1451        * so we do not need to update the CPL here.
1452        */
1453        to_svm(vcpu)->vmcb->save.rflags = rflags;
1454}
1455
1456static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1457{
1458        switch (reg) {
1459        case VCPU_EXREG_PDPTR:
1460                BUG_ON(!npt_enabled);
1461                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1462                break;
1463        default:
1464                BUG();
1465        }
1466}
1467
1468static void svm_set_vintr(struct vcpu_svm *svm)
1469{
1470        set_intercept(svm, INTERCEPT_VINTR);
1471}
1472
1473static void svm_clear_vintr(struct vcpu_svm *svm)
1474{
1475        clr_intercept(svm, INTERCEPT_VINTR);
1476}
1477
1478static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1479{
1480        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1481
1482        switch (seg) {
1483        case VCPU_SREG_CS: return &save->cs;
1484        case VCPU_SREG_DS: return &save->ds;
1485        case VCPU_SREG_ES: return &save->es;
1486        case VCPU_SREG_FS: return &save->fs;
1487        case VCPU_SREG_GS: return &save->gs;
1488        case VCPU_SREG_SS: return &save->ss;
1489        case VCPU_SREG_TR: return &save->tr;
1490        case VCPU_SREG_LDTR: return &save->ldtr;
1491        }
1492        BUG();
1493        return NULL;
1494}
1495
1496static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1497{
1498        struct vmcb_seg *s = svm_seg(vcpu, seg);
1499
1500        return s->base;
1501}
1502
1503static void svm_get_segment(struct kvm_vcpu *vcpu,
1504                            struct kvm_segment *var, int seg)
1505{
1506        struct vmcb_seg *s = svm_seg(vcpu, seg);
1507
1508        var->base = s->base;
1509        var->limit = s->limit;
1510        var->selector = s->selector;
1511        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1512        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1513        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1514        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1515        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1516        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1517        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1518
1519        /*
1520         * AMD CPUs circa 2014 track the G bit for all segments except CS.
1521         * However, the SVM spec states that the G bit is not observed by the
1522         * CPU, and some VMware virtual CPUs drop the G bit for all segments.
 1523         * So let's synthesize a legal G bit for all segments; this helps
 1524         * when running KVM nested. It also helps cross-vendor migration,
 1525         * because Intel's vmentry has a check on the 'G' bit.
1526         */
1527        var->g = s->limit > 0xfffff;
1528
1529        /*
1530         * AMD's VMCB does not have an explicit unusable field, so emulate it
1531         * for cross vendor migration purposes by "not present"
1532         */
1533        var->unusable = !var->present || (var->type == 0);
1534
1535        switch (seg) {
1536        case VCPU_SREG_TR:
1537                /*
1538                 * Work around a bug where the busy flag in the tr selector
1539                 * isn't exposed
1540                 */
1541                var->type |= 0x2;
1542                break;
1543        case VCPU_SREG_DS:
1544        case VCPU_SREG_ES:
1545        case VCPU_SREG_FS:
1546        case VCPU_SREG_GS:
1547                /*
 1548                 * The accessed bit must always be set in the segment
 1549                 * descriptor cache; although it can be cleared in the
 1550                 * descriptor itself, the cached copy always remains 1.
 1551                 * Since Intel checks this on VM entry, set it here to
 1552                 * support cross-vendor migration.
1553                 */
1554                if (!var->unusable)
1555                        var->type |= 0x1;
1556                break;
1557        case VCPU_SREG_SS:
1558                /*
1559                 * On AMD CPUs sometimes the DB bit in the segment
1560                 * descriptor is left as 1, although the whole segment has
1561                 * been made unusable. Clear it here to pass an Intel VMX
1562                 * entry check when cross vendor migrating.
1563                 */
1564                if (var->unusable)
1565                        var->db = 0;
1566                var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1567                break;
1568        }
1569}
1570
1571static int svm_get_cpl(struct kvm_vcpu *vcpu)
1572{
1573        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1574
1575        return save->cpl;
1576}
1577
1578static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1579{
1580        struct vcpu_svm *svm = to_svm(vcpu);
1581
1582        dt->size = svm->vmcb->save.idtr.limit;
1583        dt->address = svm->vmcb->save.idtr.base;
1584}
1585
1586static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1587{
1588        struct vcpu_svm *svm = to_svm(vcpu);
1589
1590        svm->vmcb->save.idtr.limit = dt->size;
 1591        svm->vmcb->save.idtr.base = dt->address;
1592        mark_dirty(svm->vmcb, VMCB_DT);
1593}
1594
1595static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1596{
1597        struct vcpu_svm *svm = to_svm(vcpu);
1598
1599        dt->size = svm->vmcb->save.gdtr.limit;
1600        dt->address = svm->vmcb->save.gdtr.base;
1601}
1602
1603static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1604{
1605        struct vcpu_svm *svm = to_svm(vcpu);
1606
1607        svm->vmcb->save.gdtr.limit = dt->size;
 1608        svm->vmcb->save.gdtr.base = dt->address;
1609        mark_dirty(svm->vmcb, VMCB_DT);
1610}
1611
1612static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1613{
1614}
1615
1616static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1617{
1618}
1619
1620static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1621{
1622}
1623
1624static void update_cr0_intercept(struct vcpu_svm *svm)
1625{
1626        ulong gcr0 = svm->vcpu.arch.cr0;
1627        u64 *hcr0 = &svm->vmcb->save.cr0;
1628
1629        if (!svm->vcpu.fpu_active)
1630                *hcr0 |= SVM_CR0_SELECTIVE_MASK;
1631        else
1632                *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1633                        | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1634
1635        mark_dirty(svm->vmcb, VMCB_CR);
1636
1637        if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1638                clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1639                clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1640        } else {
1641                set_cr_intercept(svm, INTERCEPT_CR0_READ);
1642                set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1643        }
1644}
1645
1646static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1647{
1648        struct vcpu_svm *svm = to_svm(vcpu);
1649
1650#ifdef CONFIG_X86_64
1651        if (vcpu->arch.efer & EFER_LME) {
1652                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1653                        vcpu->arch.efer |= EFER_LMA;
1654                        svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1655                }
1656
1657                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1658                        vcpu->arch.efer &= ~EFER_LMA;
1659                        svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1660                }
1661        }
1662#endif
1663        vcpu->arch.cr0 = cr0;
1664
1665        if (!npt_enabled)
1666                cr0 |= X86_CR0_PG | X86_CR0_WP;
1667
1668        if (!vcpu->fpu_active)
1669                cr0 |= X86_CR0_TS;
1670        /*
1671         * re-enable caching here because the QEMU bios
1672         * does not do it - this results in some delay at
1673         * reboot
1674         */
1675        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1676                cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1677        svm->vmcb->save.cr0 = cr0;
1678        mark_dirty(svm->vmcb, VMCB_CR);
1679        update_cr0_intercept(svm);
1680}
1681
1682static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1683{
1684        unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1685        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1686
1687        if (cr4 & X86_CR4_VMXE)
1688                return 1;
1689
1690        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1691                svm_flush_tlb(vcpu);
1692
1693        vcpu->arch.cr4 = cr4;
1694        if (!npt_enabled)
1695                cr4 |= X86_CR4_PAE;
1696        cr4 |= host_cr4_mce;
1697        to_svm(vcpu)->vmcb->save.cr4 = cr4;
1698        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1699        return 0;
1700}
1701
1702static void svm_set_segment(struct kvm_vcpu *vcpu,
1703                            struct kvm_segment *var, int seg)
1704{
1705        struct vcpu_svm *svm = to_svm(vcpu);
1706        struct vmcb_seg *s = svm_seg(vcpu, seg);
1707
1708        s->base = var->base;
1709        s->limit = var->limit;
1710        s->selector = var->selector;
1711        if (var->unusable)
1712                s->attrib = 0;
1713        else {
1714                s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1715                s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1716                s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1717                s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
1718                s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1719                s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1720                s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1721                s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1722        }
1723
1724        /*
1725         * This is always accurate, except if SYSRET returned to a segment
1726         * with SS.DPL != 3.  Intel does not have this quirk, and always
1727         * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1728         * would entail passing the CPL to userspace and back.
1729         */
1730        if (seg == VCPU_SREG_SS)
1731                svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1732
1733        mark_dirty(svm->vmcb, VMCB_SEG);
1734}
1735
1736static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
1737{
1738        struct vcpu_svm *svm = to_svm(vcpu);
1739
1740        clr_exception_intercept(svm, DB_VECTOR);
1741        clr_exception_intercept(svm, BP_VECTOR);
1742
1743        if (svm->nmi_singlestep)
1744                set_exception_intercept(svm, DB_VECTOR);
1745
1746        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1747                if (vcpu->guest_debug &
1748                    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1749                        set_exception_intercept(svm, DB_VECTOR);
1750                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1751                        set_exception_intercept(svm, BP_VECTOR);
1752        } else
1753                vcpu->guest_debug = 0;
1754}
1755
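/*
 * Hand out a fresh ASID from the per-CPU pool.  When the pool is
 * exhausted, bump the generation counter, restart at ASID 1 and ask the
 * hardware to flush all ASIDs on the next VMRUN so stale translations
 * from the previous generation cannot be reused.
 */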
1756static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1757{
1758        if (sd->next_asid > sd->max_asid) {
1759                ++sd->asid_generation;
1760                sd->next_asid = 1;
1761                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1762        }
1763
1764        svm->asid_generation = sd->asid_generation;
1765        svm->vmcb->control.asid = sd->next_asid++;
1766
1767        mark_dirty(svm->vmcb, VMCB_ASID);
1768}
1769
1770static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
1771{
1772        return to_svm(vcpu)->vmcb->save.dr6;
1773}
1774
1775static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1776{
1777        struct vcpu_svm *svm = to_svm(vcpu);
1778
1779        svm->vmcb->save.dr6 = value;
1780        mark_dirty(svm->vmcb, VMCB_DR);
1781}
1782
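/*
 * Counterpart of the fast path in dr_interception(): once the guest has
 * touched the debug registers with intercepts disabled, read the live
 * DR0-DR3 values back into vcpu->arch and re-arm the DR intercepts so
 * that later accesses exit again.
 */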
1783static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1784{
1785        struct vcpu_svm *svm = to_svm(vcpu);
1786
1787        get_debugreg(vcpu->arch.db[0], 0);
1788        get_debugreg(vcpu->arch.db[1], 1);
1789        get_debugreg(vcpu->arch.db[2], 2);
1790        get_debugreg(vcpu->arch.db[3], 3);
1791        vcpu->arch.dr6 = svm_get_dr6(vcpu);
1792        vcpu->arch.dr7 = svm->vmcb->save.dr7;
1793
1794        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1795        set_dr_intercepts(svm);
1796}
1797
1798static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1799{
1800        struct vcpu_svm *svm = to_svm(vcpu);
1801
1802        svm->vmcb->save.dr7 = value;
1803        mark_dirty(svm->vmcb, VMCB_DR);
1804}
1805
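/*
 * A non-zero apf_reason means the intercepted #PF was really an async
 * page fault notification rather than an ordinary guest fault: the
 * PAGE_NOT_PRESENT / PAGE_READY cases are routed to the async-PF
 * wait/wake machinery, everything else goes to the MMU as usual.
 */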
1806static int pf_interception(struct vcpu_svm *svm)
1807{
1808        u64 fault_address = svm->vmcb->control.exit_info_2;
1809        u32 error_code;
1810        int r = 1;
1811
1812        switch (svm->apf_reason) {
1813        default:
1814                error_code = svm->vmcb->control.exit_info_1;
1815
1816                trace_kvm_page_fault(fault_address, error_code);
1817                if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1818                        kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1819                r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1820                        svm->vmcb->control.insn_bytes,
1821                        svm->vmcb->control.insn_len);
1822                break;
1823        case KVM_PV_REASON_PAGE_NOT_PRESENT:
1824                svm->apf_reason = 0;
1825                local_irq_disable();
1826                kvm_async_pf_task_wait(fault_address);
1827                local_irq_enable();
1828                break;
1829        case KVM_PV_REASON_PAGE_READY:
1830                svm->apf_reason = 0;
1831                local_irq_disable();
1832                kvm_async_pf_task_wake(fault_address);
1833                local_irq_enable();
1834                break;
1835        }
1836        return r;
1837}
1838
1839static int db_interception(struct vcpu_svm *svm)
1840{
1841        struct kvm_run *kvm_run = svm->vcpu.run;
1842
1843        if (!(svm->vcpu.guest_debug &
1844              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1845                !svm->nmi_singlestep) {
1846                kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1847                return 1;
1848        }
1849
1850        if (svm->nmi_singlestep) {
1851                svm->nmi_singlestep = false;
1852                if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1853                        svm->vmcb->save.rflags &=
1854                                ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1855                update_db_bp_intercept(&svm->vcpu);
1856        }
1857
1858        if (svm->vcpu.guest_debug &
1859            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1860                kvm_run->exit_reason = KVM_EXIT_DEBUG;
1861                kvm_run->debug.arch.pc =
1862                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1863                kvm_run->debug.arch.exception = DB_VECTOR;
1864                return 0;
1865        }
1866
1867        return 1;
1868}
1869
1870static int bp_interception(struct vcpu_svm *svm)
1871{
1872        struct kvm_run *kvm_run = svm->vcpu.run;
1873
1874        kvm_run->exit_reason = KVM_EXIT_DEBUG;
1875        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1876        kvm_run->debug.arch.exception = BP_VECTOR;
1877        return 0;
1878}
1879
1880static int ud_interception(struct vcpu_svm *svm)
1881{
1882        int er;
1883
1884        er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1885        if (er != EMULATE_DONE)
1886                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1887        return 1;
1888}
1889
1890static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1891{
1892        struct vcpu_svm *svm = to_svm(vcpu);
1893
1894        clr_exception_intercept(svm, NM_VECTOR);
1895
1896        svm->vcpu.fpu_active = 1;
1897        update_cr0_intercept(svm);
1898}
1899
1900static int nm_interception(struct vcpu_svm *svm)
1901{
1902        svm_fpu_activate(&svm->vcpu);
1903        return 1;
1904}
1905
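/*
 * Check whether a pending machine check carries the MC0 status signature
 * of AMD erratum 383.  If it does, clean up the MCi_STATUS and MCG_STATUS
 * registers by hand and flush the TLB to evict the multi-match entries,
 * so the host survives and only the offending guest is killed.
 */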
1906static bool is_erratum_383(void)
1907{
1908        int err, i;
1909        u64 value;
1910
1911        if (!erratum_383_found)
1912                return false;
1913
1914        value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1915        if (err)
1916                return false;
1917
1918        /* Bit 62 may or may not be set for this mce */
1919        value &= ~(1ULL << 62);
1920
1921        if (value != 0xb600000000010015ULL)
1922                return false;
1923
1924        /* Clear MCi_STATUS registers */
1925        for (i = 0; i < 6; ++i)
1926                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1927
1928        value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1929        if (!err) {
1930                u32 low, high;
1931
1932                value &= ~(1ULL << 2);
1933                low    = lower_32_bits(value);
1934                high   = upper_32_bits(value);
1935
1936                native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1937        }
1938
1939        /* Flush tlb to evict multi-match entries */
1940        __flush_tlb_all();
1941
1942        return true;
1943}
1944
1945static void svm_handle_mce(struct vcpu_svm *svm)
1946{
1947        if (is_erratum_383()) {
1948                /*
1949                 * Erratum 383 triggered. Guest state is corrupt so kill the
1950                 * guest.
1951                 */
1952                pr_err("KVM: Guest triggered AMD Erratum 383\n");
1953
1954                kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1955
1956                return;
1957        }
1958
1959        /*
1960         * On an #MC intercept the MCE handler is not called automatically in
1961         * the host. So do it by hand here.
1962         */
1963        asm volatile (
1964                "int $0x12\n");
1965        /* not sure if we ever come back to this point */
1966
1967        return;
1968}
1969
1970static int mc_interception(struct vcpu_svm *svm)
1971{
1972        return 1;
1973}
1974
1975static int shutdown_interception(struct vcpu_svm *svm)
1976{
1977        struct kvm_run *kvm_run = svm->vcpu.run;
1978
1979        /*
1980         * VMCB is undefined after a SHUTDOWN intercept
1981         * so reinitialize it.
1982         */
1983        clear_page(svm->vmcb);
1984        init_vmcb(svm, false);
1985
1986        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1987        return 0;
1988}
1989
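/*
 * For IN/OUT intercepts exit_info_1 packs the decoded operands: the
 * SVM_IOIO_TYPE and SVM_IOIO_STR bits flag IN and string operations, the
 * access width sits in the SVM_IOIO_SIZE field and the port number in
 * bits 16-31.  exit_info_2 holds the rip of the instruction following the
 * IN/OUT, which becomes next_rip for the fast port-I/O path.
 */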
1990static int io_interception(struct vcpu_svm *svm)
1991{
1992        struct kvm_vcpu *vcpu = &svm->vcpu;
1993        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1994        int size, in, string;
1995        unsigned port;
1996
1997        ++svm->vcpu.stat.io_exits;
1998        string = (io_info & SVM_IOIO_STR_MASK) != 0;
1999        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2000        if (string || in)
2001                return emulate_instruction(vcpu, 0) == EMULATE_DONE;
2002
2003        port = io_info >> 16;
2004        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2005        svm->next_rip = svm->vmcb->control.exit_info_2;
2006        skip_emulated_instruction(&svm->vcpu);
2007
2008        return kvm_fast_pio_out(vcpu, size, port);
2009}
2010
2011static int nmi_interception(struct vcpu_svm *svm)
2012{
2013        return 1;
2014}
2015
2016static int intr_interception(struct vcpu_svm *svm)
2017{
2018        ++svm->vcpu.stat.irq_exits;
2019        return 1;
2020}
2021
2022static int nop_on_interception(struct vcpu_svm *svm)
2023{
2024        return 1;
2025}
2026
2027static int halt_interception(struct vcpu_svm *svm)
2028{
2029        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
2030        return kvm_emulate_halt(&svm->vcpu);
2031}
2032
2033static int vmmcall_interception(struct vcpu_svm *svm)
2034{
2035        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2036        kvm_emulate_hypercall(&svm->vcpu);
2037        return 1;
2038}
2039
2040static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
2041{
2042        struct vcpu_svm *svm = to_svm(vcpu);
2043
2044        return svm->nested.nested_cr3;
2045}
2046
2047static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
2048{
2049        struct vcpu_svm *svm = to_svm(vcpu);
2050        u64 cr3 = svm->nested.nested_cr3;
2051        u64 pdpte;
2052        int ret;
2053
2054        ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
2055                                       offset_in_page(cr3) + index * 8, 8);
2056        if (ret)
2057                return 0;
2058        return pdpte;
2059}
2060
2061static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
2062                                   unsigned long root)
2063{
2064        struct vcpu_svm *svm = to_svm(vcpu);
2065
2066        svm->vmcb->control.nested_cr3 = root;
2067        mark_dirty(svm->vmcb, VMCB_NPT);
2068        svm_flush_tlb(vcpu);
2069}
2070
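/*
 * Reflect a fault hit while handling L2 memory back to the L1 hypervisor
 * as a nested-page-fault #vmexit: the low 32 bits of exit_info_1 carry
 * the #PF-style error code, the high bits carry NPF-specific flags (see
 * the TODO below) and exit_info_2 carries the faulting address.
 */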
2071static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2072                                       struct x86_exception *fault)
2073{
2074        struct vcpu_svm *svm = to_svm(vcpu);
2075
2076        if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
2077                /*
2078                 * TODO: track the cause of the nested page fault, and
2079                 * correctly fill in the high bits of exit_info_1.
2080                 */
2081                svm->vmcb->control.exit_code = SVM_EXIT_NPF;
2082                svm->vmcb->control.exit_code_hi = 0;
2083                svm->vmcb->control.exit_info_1 = (1ULL << 32);
2084                svm->vmcb->control.exit_info_2 = fault->address;
2085        }
2086
2087        svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
2088        svm->vmcb->control.exit_info_1 |= fault->error_code;
2089
2090        /*
2091         * The present bit is always zero for page structure faults on real
2092         * hardware.
2093         */
2094        if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
2095                svm->vmcb->control.exit_info_1 &= ~1;
2096
2097        nested_svm_vmexit(svm);
2098}
2099
2100static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2101{
2102        WARN_ON(mmu_is_nested(vcpu));
2103        kvm_init_shadow_mmu(vcpu);
2104        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
2105        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
2106        vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
2107        vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
2108        vcpu->arch.mmu.shadow_root_level = get_npt_level();
2109        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
2110}
2111
2112static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2113{
2114        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
2115}
2116
2117static int nested_svm_check_permissions(struct vcpu_svm *svm)
2118{
2119        if (!(svm->vcpu.arch.efer & EFER_SVME)
2120            || !is_paging(&svm->vcpu)) {
2121                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2122                return 1;
2123        }
2124
2125        if (svm->vmcb->save.cpl) {
2126                kvm_inject_gp(&svm->vcpu, 0);
2127                return 1;
2128        }
2129
2130        return 0;
2131}
2132
2133static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2134                                      bool has_error_code, u32 error_code)
2135{
2136        int vmexit;
2137
2138        if (!is_guest_mode(&svm->vcpu))
2139                return 0;
2140
2141        svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
2142        svm->vmcb->control.exit_code_hi = 0;
2143        svm->vmcb->control.exit_info_1 = error_code;
2144        svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
2145
2146        vmexit = nested_svm_intercept(svm);
2147        if (vmexit == NESTED_EXIT_DONE)
2148                svm->nested.exit_required = true;
2149
2150        return vmexit;
2151}
2152
2153/* This function returns true if it is safe to enable the irq window */
2154static inline bool nested_svm_intr(struct vcpu_svm *svm)
2155{
2156        if (!is_guest_mode(&svm->vcpu))
2157                return true;
2158
2159        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2160                return true;
2161
2162        if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
2163                return false;
2164
2165        /*
2166         * if vmexit was already requested (by intercepted exception
2167         * for instance) do not overwrite it with "external interrupt"
2168         * vmexit.
2169         */
2170        if (svm->nested.exit_required)
2171                return false;
2172
2173        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2174        svm->vmcb->control.exit_info_1 = 0;
2175        svm->vmcb->control.exit_info_2 = 0;
2176
2177        if (svm->nested.intercept & 1ULL) {
2178                /*
2179                 * The #vmexit can't be emulated here directly because this
2180                 * code path runs with irqs and preemption disabled. A
2181                 * #vmexit emulation might sleep. Only signal request for
2182                 * the #vmexit here.
2183                 */
2184                svm->nested.exit_required = true;
2185                trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2186                return false;
2187        }
2188
2189        return true;
2190}
2191
2192/* This function returns true if it is safe to enable the nmi window */
2193static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2194{
2195        if (!is_guest_mode(&svm->vcpu))
2196                return true;
2197
2198        if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2199                return true;
2200
2201        svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2202        svm->nested.exit_required = true;
2203
2204        return false;
2205}
2206
2207static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2208{
2209        struct page *page;
2210
2211        might_sleep();
2212
2213        page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
2214        if (is_error_page(page))
2215                goto error;
2216
2217        *_page = page;
2218
2219        return kmap(page);
2220
2221error:
2222        kvm_inject_gp(&svm->vcpu, 0);
2223
2224        return NULL;
2225}
2226
2227static void nested_svm_unmap(struct page *page)
2228{
2229        kunmap(page);
2230        kvm_release_page_dirty(page);
2231}
2232
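/*
 * Consult L1's I/O permission bitmap to decide who owns an IOIO exit.
 * The bitmap has one bit per port and a multi-byte access checks the bits
 * of every port it touches: e.g. a 1-byte access to port 0x3f8 tests bit
 * 0 of the byte at vmcb_iopm + 0x7f (mask 0x01), while a 4-byte access
 * with start_bit 6 spills over and reads two bitmap bytes.
 */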
2233static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2234{
2235        unsigned port, size, iopm_len;
2236        u16 val, mask;
2237        u8 start_bit;
2238        u64 gpa;
2239
2240        if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2241                return NESTED_EXIT_HOST;
2242
2243        port = svm->vmcb->control.exit_info_1 >> 16;
2244        size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
2245                SVM_IOIO_SIZE_SHIFT;
2246        gpa  = svm->nested.vmcb_iopm + (port / 8);
2247        start_bit = port % 8;
2248        iopm_len = (start_bit + size > 8) ? 2 : 1;
2249        mask = (0xf >> (4 - size)) << start_bit;
2250        val = 0;
2251
2252        if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
2253                return NESTED_EXIT_DONE;
2254
2255        return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2256}
2257
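/*
 * Decide who owns an MSR intercept by looking the MSR up in L1's MSR
 * permission bitmap.  Each MSR occupies two adjacent bits (read, write)
 * and svm_msrpm_offset() returns the 32-bit word holding the bits for a
 * group of 16 MSRs; e.g. a read of MSR 0xC0000082 tests bit 4
 * (2 * (0x82 & 0xf) + 0) of that word.
 */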
2258static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2259{
2260        u32 offset, msr, value;
2261        int write, mask;
2262
2263        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2264                return NESTED_EXIT_HOST;
2265
2266        msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2267        offset = svm_msrpm_offset(msr);
2268        write  = svm->vmcb->control.exit_info_1 & 1;
2269        mask   = 1 << ((2 * (msr & 0xf)) + write);
2270
2271        if (offset == MSR_INVALID)
2272                return NESTED_EXIT_DONE;
2273
2274        /* Offset is in 32-bit units but we need it in 8-bit (byte) units */
2275        offset *= 4;
2276
2277        if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
2278                return NESTED_EXIT_DONE;
2279
2280        return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2281}
2282
2283static int nested_svm_exit_special(struct vcpu_svm *svm)
2284{
2285        u32 exit_code = svm->vmcb->control.exit_code;
2286
2287        switch (exit_code) {
2288        case SVM_EXIT_INTR:
2289        case SVM_EXIT_NMI:
2290        case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2291                return NESTED_EXIT_HOST;
2292        case SVM_EXIT_NPF:
2293                /* For now we always handle NPFs in the host when using NPT */
2294                if (npt_enabled)
2295                        return NESTED_EXIT_HOST;
2296                break;
2297        case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2298                /* When we're shadowing, trap PFs, but not async PF */
2299                if (!npt_enabled && svm->apf_reason == 0)
2300                        return NESTED_EXIT_HOST;
2301                break;
2302        case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2303                nm_interception(svm);
2304                break;
2305        default:
2306                break;
2307        }
2308
2309        return NESTED_EXIT_CONTINUE;
2310}
2311
2312/*
2313 * Returns NESTED_EXIT_DONE if this #vmexit must be reflected to the L1 guest.
2314 */
2315static int nested_svm_intercept(struct vcpu_svm *svm)
2316{
2317        u32 exit_code = svm->vmcb->control.exit_code;
2318        int vmexit = NESTED_EXIT_HOST;
2319
2320        switch (exit_code) {
2321        case SVM_EXIT_MSR:
2322                vmexit = nested_svm_exit_handled_msr(svm);
2323                break;
2324        case SVM_EXIT_IOIO:
2325                vmexit = nested_svm_intercept_ioio(svm);
2326                break;
2327        case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2328                u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2329                if (svm->nested.intercept_cr & bit)
2330                        vmexit = NESTED_EXIT_DONE;
2331                break;
2332        }
2333        case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2334                u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2335                if (svm->nested.intercept_dr & bit)
2336                        vmexit = NESTED_EXIT_DONE;
2337                break;
2338        }
2339        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2340                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2341                if (svm->nested.intercept_exceptions & excp_bits)
2342                        vmexit = NESTED_EXIT_DONE;
2343                /* an async page fault always causes a vmexit */
2344                else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2345                         svm->apf_reason != 0)
2346                        vmexit = NESTED_EXIT_DONE;
2347                break;
2348        }
2349        case SVM_EXIT_ERR: {
2350                vmexit = NESTED_EXIT_DONE;
2351                break;
2352        }
2353        default: {
2354                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2355                if (svm->nested.intercept & exit_bits)
2356                        vmexit = NESTED_EXIT_DONE;
2357        }
2358        }
2359
2360        return vmexit;
2361}
2362
2363static int nested_svm_exit_handled(struct vcpu_svm *svm)
2364{
2365        int vmexit;
2366
2367        vmexit = nested_svm_intercept(svm);
2368
2369        if (vmexit == NESTED_EXIT_DONE)
2370                nested_svm_vmexit(svm);
2371
2372        return vmexit;
2373}
2374
2375static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2376{
2377        struct vmcb_control_area *dst  = &dst_vmcb->control;
2378        struct vmcb_control_area *from = &from_vmcb->control;
2379
2380        dst->intercept_cr         = from->intercept_cr;
2381        dst->intercept_dr         = from->intercept_dr;
2382        dst->intercept_exceptions = from->intercept_exceptions;
2383        dst->intercept            = from->intercept;
2384        dst->iopm_base_pa         = from->iopm_base_pa;
2385        dst->msrpm_base_pa        = from->msrpm_base_pa;
2386        dst->tsc_offset           = from->tsc_offset;
2387        dst->asid                 = from->asid;
2388        dst->tlb_ctl              = from->tlb_ctl;
2389        dst->int_ctl              = from->int_ctl;
2390        dst->int_vector           = from->int_vector;
2391        dst->int_state            = from->int_state;
2392        dst->exit_code            = from->exit_code;
2393        dst->exit_code_hi         = from->exit_code_hi;
2394        dst->exit_info_1          = from->exit_info_1;
2395        dst->exit_info_2          = from->exit_info_2;
2396        dst->exit_int_info        = from->exit_int_info;
2397        dst->exit_int_info_err    = from->exit_int_info_err;
2398        dst->nested_ctl           = from->nested_ctl;
2399        dst->event_inj            = from->event_inj;
2400        dst->event_inj_err        = from->event_inj_err;
2401        dst->nested_cr3           = from->nested_cr3;
2402        dst->lbr_ctl              = from->lbr_ctl;
2403}
2404
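/*
 * Emulate #VMEXIT from L2 to L1: copy the current (L2) register and
 * control state back into the guest's nested vmcb, restore L1's state
 * from the host-save area, leave guest mode and clear GIF so that L1
 * resumes after its VMRUN with interrupts held off until it runs STGI.
 */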
2405static int nested_svm_vmexit(struct vcpu_svm *svm)
2406{
2407        struct vmcb *nested_vmcb;
2408        struct vmcb *hsave = svm->nested.hsave;
2409        struct vmcb *vmcb = svm->vmcb;
2410        struct page *page;
2411
2412        trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2413                                       vmcb->control.exit_info_1,
2414                                       vmcb->control.exit_info_2,
2415                                       vmcb->control.exit_int_info,
2416                                       vmcb->control.exit_int_info_err,
2417                                       KVM_ISA_SVM);
2418
2419        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2420        if (!nested_vmcb)
2421                return 1;
2422
2423        /* Exit Guest-Mode */
2424        leave_guest_mode(&svm->vcpu);
2425        svm->nested.vmcb = 0;
2426
2427        /* Give the current vmcb to the guest */
2428        disable_gif(svm);
2429
2430        nested_vmcb->save.es     = vmcb->save.es;
2431        nested_vmcb->save.cs     = vmcb->save.cs;
2432        nested_vmcb->save.ss     = vmcb->save.ss;
2433        nested_vmcb->save.ds     = vmcb->save.ds;
2434        nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2435        nested_vmcb->save.idtr   = vmcb->save.idtr;
2436        nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2437        nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2438        nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2439        nested_vmcb->save.cr2    = vmcb->save.cr2;
2440        nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2441        nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2442        nested_vmcb->save.rip    = vmcb->save.rip;
2443        nested_vmcb->save.rsp    = vmcb->save.rsp;
2444        nested_vmcb->save.rax    = vmcb->save.rax;
2445        nested_vmcb->save.dr7    = vmcb->save.dr7;
2446        nested_vmcb->save.dr6    = vmcb->save.dr6;
2447        nested_vmcb->save.cpl    = vmcb->save.cpl;
2448
2449        nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2450        nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2451        nested_vmcb->control.int_state         = vmcb->control.int_state;
2452        nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2453        nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2454        nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2455        nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2456        nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2457        nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2458        nested_vmcb->control.next_rip          = vmcb->control.next_rip;
2459
2460        /*
2461         * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2462         * to make sure that we do not lose injected events. So check event_inj
2463         * here and copy it to exit_int_info if it is valid.
2464         * Exit_int_info and event_inj can't both be valid because the case
2465         * below only happens on a VMRUN instruction intercept which has
2466         * no valid exit_int_info set.
2467         */
2468        if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2469                struct vmcb_control_area *nc = &nested_vmcb->control;
2470
2471                nc->exit_int_info     = vmcb->control.event_inj;
2472                nc->exit_int_info_err = vmcb->control.event_inj_err;
2473        }
2474
2475        nested_vmcb->control.tlb_ctl           = 0;
2476        nested_vmcb->control.event_inj         = 0;
2477        nested_vmcb->control.event_inj_err     = 0;
2478
2479        /* We always set V_INTR_MASKING and remember the old value in hflags */
2480        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2481                nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2482
2483        /* Restore the original control entries */
2484        copy_vmcb_control_area(vmcb, hsave);
2485
2486        kvm_clear_exception_queue(&svm->vcpu);
2487        kvm_clear_interrupt_queue(&svm->vcpu);
2488
2489        svm->nested.nested_cr3 = 0;
2490
2491        /* Restore selected save entries */
2492        svm->vmcb->save.es = hsave->save.es;
2493        svm->vmcb->save.cs = hsave->save.cs;
2494        svm->vmcb->save.ss = hsave->save.ss;
2495        svm->vmcb->save.ds = hsave->save.ds;
2496        svm->vmcb->save.gdtr = hsave->save.gdtr;
2497        svm->vmcb->save.idtr = hsave->save.idtr;
2498        kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2499        svm_set_efer(&svm->vcpu, hsave->save.efer);
2500        svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2501        svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2502        if (npt_enabled) {
2503                svm->vmcb->save.cr3 = hsave->save.cr3;
2504                svm->vcpu.arch.cr3 = hsave->save.cr3;
2505        } else {
2506                (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2507        }
2508        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2509        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2510        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2511        svm->vmcb->save.dr7 = 0;
2512        svm->vmcb->save.cpl = 0;
2513        svm->vmcb->control.exit_int_info = 0;
2514
2515        mark_all_dirty(svm->vmcb);
2516
2517        nested_svm_unmap(page);
2518
2519        nested_svm_uninit_mmu_context(&svm->vcpu);
2520        kvm_mmu_reset_context(&svm->vcpu);
2521        kvm_mmu_load(&svm->vcpu);
2522
2523        return 0;
2524}
2525
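/*
 * The merged bitmap is simply the OR of L0's and L1's MSR permission
 * bitmaps, so an MSR access exits whenever either hypervisor wants to
 * intercept it.  msrpm_offsets[] lists only the 32-bit words in which
 * KVM's own bitmap can contain zero bits, which keeps the merge cheap.
 */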
2526static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2527{
2528        /*
2529         * This function merges the msr permission bitmaps of kvm and the
2530         * nested vmcb. It is optimized in that it only merges the parts where
2531         * the kvm msr permission bitmap may contain zero bits
2532         */
2533        int i;
2534
2535        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2536                return true;
2537
2538        for (i = 0; i < MSRPM_OFFSETS; i++) {
2539                u32 value, p;
2540                u64 offset;
2541
2542                if (msrpm_offsets[i] == 0xffffffff)
2543                        break;
2544
2545                p      = msrpm_offsets[i];
2546                offset = svm->nested.vmcb_msrpm + (p * 4);
2547
2548                if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
2549                        return false;
2550
2551                svm->nested.msrpm[p] = svm->msrpm[p] | value;
2552        }
2553
2554        svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2555
2556        return true;
2557}
2558
2559static bool nested_vmcb_checks(struct vmcb *vmcb)
2560{
2561        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2562                return false;
2563
2564        if (vmcb->control.asid == 0)
2565                return false;
2566
2567        if (vmcb->control.nested_ctl && !npt_enabled)
2568                return false;
2569
2570        return true;
2571}
2572
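/*
 * Emulate VMRUN: map L1's vmcb (guest-physical address in RAX), sanity
 * check it, stash L1's current state in the host-save area, load the L2
 * state and control fields into the hardware vmcb, merge the intercept
 * bitmaps and enter guest mode with GIF set.
 */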
2573static bool nested_svm_vmrun(struct vcpu_svm *svm)
2574{
2575        struct vmcb *nested_vmcb;
2576        struct vmcb *hsave = svm->nested.hsave;
2577        struct vmcb *vmcb = svm->vmcb;
2578        struct page *page;
2579        u64 vmcb_gpa;
2580
2581        vmcb_gpa = svm->vmcb->save.rax;
2582
2583        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2584        if (!nested_vmcb)
2585                return false;
2586
2587        if (!nested_vmcb_checks(nested_vmcb)) {
2588                nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2589                nested_vmcb->control.exit_code_hi = 0;
2590                nested_vmcb->control.exit_info_1  = 0;
2591                nested_vmcb->control.exit_info_2  = 0;
2592
2593                nested_svm_unmap(page);
2594
2595                return false;
2596        }
2597
2598        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2599                               nested_vmcb->save.rip,
2600                               nested_vmcb->control.int_ctl,
2601                               nested_vmcb->control.event_inj,
2602                               nested_vmcb->control.nested_ctl);
2603
2604        trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2605                                    nested_vmcb->control.intercept_cr >> 16,
2606                                    nested_vmcb->control.intercept_exceptions,
2607                                    nested_vmcb->control.intercept);
2608
2609        /* Clear internal status */
2610        kvm_clear_exception_queue(&svm->vcpu);
2611        kvm_clear_interrupt_queue(&svm->vcpu);
2612
2613        /*
2614         * Save the old vmcb, so we don't need to pick what we save, but can
2615         * restore everything when a VMEXIT occurs
2616         */
2617        hsave->save.es     = vmcb->save.es;
2618        hsave->save.cs     = vmcb->save.cs;
2619        hsave->save.ss     = vmcb->save.ss;
2620        hsave->save.ds     = vmcb->save.ds;
2621        hsave->save.gdtr   = vmcb->save.gdtr;
2622        hsave->save.idtr   = vmcb->save.idtr;
2623        hsave->save.efer   = svm->vcpu.arch.efer;
2624        hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2625        hsave->save.cr4    = svm->vcpu.arch.cr4;
2626        hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2627        hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2628        hsave->save.rsp    = vmcb->save.rsp;
2629        hsave->save.rax    = vmcb->save.rax;
2630        if (npt_enabled)
2631                hsave->save.cr3    = vmcb->save.cr3;
2632        else
2633                hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2634
2635        copy_vmcb_control_area(hsave, vmcb);
2636
2637        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2638                svm->vcpu.arch.hflags |= HF_HIF_MASK;
2639        else
2640                svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2641
2642        if (nested_vmcb->control.nested_ctl) {
2643                kvm_mmu_unload(&svm->vcpu);
2644                svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2645                nested_svm_init_mmu_context(&svm->vcpu);
2646        }
2647
2648        /* Load the nested guest state */
2649        svm->vmcb->save.es = nested_vmcb->save.es;
2650        svm->vmcb->save.cs = nested_vmcb->save.cs;
2651        svm->vmcb->save.ss = nested_vmcb->save.ss;
2652        svm->vmcb->save.ds = nested_vmcb->save.ds;
2653        svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2654        svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2655        kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2656        svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2657        svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2658        svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2659        if (npt_enabled) {
2660                svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2661                svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2662        } else
2663                (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2664
2665        /* Guest paging mode is active - reset mmu */
2666        kvm_mmu_reset_context(&svm->vcpu);
2667
2668        svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2669        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2670        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2671        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2672
2673        /* In case we don't even reach vcpu_run, the fields are not updated */
2674        svm->vmcb->save.rax = nested_vmcb->save.rax;
2675        svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2676        svm->vmcb->save.rip = nested_vmcb->save.rip;
2677        svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2678        svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2679        svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2680
2681        svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2682        svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2683
2684        /* cache intercepts */
2685        svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2686        svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2687        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2688        svm->nested.intercept            = nested_vmcb->control.intercept;
2689
2690        svm_flush_tlb(&svm->vcpu);
2691        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2692        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2693                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2694        else
2695                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2696
2697        if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2698                /* We only want the cr8 intercept bits of the guest */
2699                clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2700                clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2701        }
2702
2703        /* We don't want to see VMMCALLs from a nested guest */
2704        clr_intercept(svm, INTERCEPT_VMMCALL);
2705
2706        svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2707        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2708        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2709        svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2710        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2711        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2712
2713        nested_svm_unmap(page);
2714
2715        /* Enter Guest-Mode */
2716        enter_guest_mode(&svm->vcpu);
2717
2718        /*
2719         * Merge guest and host intercepts - must be called with vcpu in
2720         * guest-mode to take effect here
2721         */
2722        recalc_intercepts(svm);
2723
2724        svm->nested.vmcb = vmcb_gpa;
2725
2726        enable_gif(svm);
2727
2728        mark_all_dirty(svm->vmcb);
2729
2730        return true;
2731}
2732
2733static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2734{
2735        to_vmcb->save.fs = from_vmcb->save.fs;
2736        to_vmcb->save.gs = from_vmcb->save.gs;
2737        to_vmcb->save.tr = from_vmcb->save.tr;
2738        to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2739        to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2740        to_vmcb->save.star = from_vmcb->save.star;
2741        to_vmcb->save.lstar = from_vmcb->save.lstar;
2742        to_vmcb->save.cstar = from_vmcb->save.cstar;
2743        to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2744        to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2745        to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2746        to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2747}
2748
2749static int vmload_interception(struct vcpu_svm *svm)
2750{
2751        struct vmcb *nested_vmcb;
2752        struct page *page;
2753
2754        if (nested_svm_check_permissions(svm))
2755                return 1;
2756
2757        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2758        if (!nested_vmcb)
2759                return 1;
2760
2761        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2762        skip_emulated_instruction(&svm->vcpu);
2763
2764        nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2765        nested_svm_unmap(page);
2766
2767        return 1;
2768}
2769
2770static int vmsave_interception(struct vcpu_svm *svm)
2771{
2772        struct vmcb *nested_vmcb;
2773        struct page *page;
2774
2775        if (nested_svm_check_permissions(svm))
2776                return 1;
2777
2778        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2779        if (!nested_vmcb)
2780                return 1;
2781
2782        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2783        skip_emulated_instruction(&svm->vcpu);
2784
2785        nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2786        nested_svm_unmap(page);
2787
2788        return 1;
2789}
2790
2791static int vmrun_interception(struct vcpu_svm *svm)
2792{
2793        if (nested_svm_check_permissions(svm))
2794                return 1;
2795
2796        /* Save rip after vmrun instruction */
2797        kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2798
2799        if (!nested_svm_vmrun(svm))
2800                return 1;
2801
2802        if (!nested_svm_vmrun_msrpm(svm))
2803                goto failed;
2804
2805        return 1;
2806
2807failed:
2808
2809        svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
2810        svm->vmcb->control.exit_code_hi = 0;
2811        svm->vmcb->control.exit_info_1  = 0;
2812        svm->vmcb->control.exit_info_2  = 0;
2813
2814        nested_svm_vmexit(svm);
2815
2816        return 1;
2817}
2818
2819static int stgi_interception(struct vcpu_svm *svm)
2820{
2821        if (nested_svm_check_permissions(svm))
2822                return 1;
2823
2824        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2825        skip_emulated_instruction(&svm->vcpu);
2826        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2827
2828        enable_gif(svm);
2829
2830        return 1;
2831}
2832
2833static int clgi_interception(struct vcpu_svm *svm)
2834{
2835        if (nested_svm_check_permissions(svm))
2836                return 1;
2837
2838        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2839        skip_emulated_instruction(&svm->vcpu);
2840
2841        disable_gif(svm);
2842
2843        /* After a CLGI no interrupts should come */
2844        svm_clear_vintr(svm);
2845        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2846
2847        mark_dirty(svm->vmcb, VMCB_INTR);
2848
2849        return 1;
2850}
2851
2852static int invlpga_interception(struct vcpu_svm *svm)
2853{
2854        struct kvm_vcpu *vcpu = &svm->vcpu;
2855
2856        trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
2857                          kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2858
2859        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2860        kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2861
2862        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2863        skip_emulated_instruction(&svm->vcpu);
2864        return 1;
2865}
2866
2867static int skinit_interception(struct vcpu_svm *svm)
2868{
2869        trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2870
2871        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2872        return 1;
2873}
2874
2875static int wbinvd_interception(struct vcpu_svm *svm)
2876{
2877        kvm_emulate_wbinvd(&svm->vcpu);
2878        return 1;
2879}
2880
2881static int xsetbv_interception(struct vcpu_svm *svm)
2882{
2883        u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2884        u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2885
2886        if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2887                svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2888                skip_emulated_instruction(&svm->vcpu);
2889        }
2890
2891        return 1;
2892}
2893
2894static int task_switch_interception(struct vcpu_svm *svm)
2895{
2896        u16 tss_selector;
2897        int reason;
2898        int int_type = svm->vmcb->control.exit_int_info &
2899                SVM_EXITINTINFO_TYPE_MASK;
2900        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2901        uint32_t type =
2902                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2903        uint32_t idt_v =
2904                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2905        bool has_error_code = false;
2906        u32 error_code = 0;
2907
2908        tss_selector = (u16)svm->vmcb->control.exit_info_1;
2909
2910        if (svm->vmcb->control.exit_info_2 &
2911            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2912                reason = TASK_SWITCH_IRET;
2913        else if (svm->vmcb->control.exit_info_2 &
2914                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2915                reason = TASK_SWITCH_JMP;
2916        else if (idt_v)
2917                reason = TASK_SWITCH_GATE;
2918        else
2919                reason = TASK_SWITCH_CALL;
2920
2921        if (reason == TASK_SWITCH_GATE) {
2922                switch (type) {
2923                case SVM_EXITINTINFO_TYPE_NMI:
2924                        svm->vcpu.arch.nmi_injected = false;
2925                        break;
2926                case SVM_EXITINTINFO_TYPE_EXEPT:
2927                        if (svm->vmcb->control.exit_info_2 &
2928                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2929                                has_error_code = true;
2930                                error_code =
2931                                        (u32)svm->vmcb->control.exit_info_2;
2932                        }
2933                        kvm_clear_exception_queue(&svm->vcpu);
2934                        break;
2935                case SVM_EXITINTINFO_TYPE_INTR:
2936                        kvm_clear_interrupt_queue(&svm->vcpu);
2937                        break;
2938                default:
2939                        break;
2940                }
2941        }
2942
2943        if (reason != TASK_SWITCH_GATE ||
2944            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2945            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2946             (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2947                skip_emulated_instruction(&svm->vcpu);
2948
2949        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2950                int_vec = -1;
2951
2952        if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2953                                has_error_code, error_code) == EMULATE_FAIL) {
2954                svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2955                svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2956                svm->vcpu.run->internal.ndata = 0;
2957                return 0;
2958        }
2959        return 1;
2960}
2961
2962static int cpuid_interception(struct vcpu_svm *svm)
2963{
2964        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2965        kvm_emulate_cpuid(&svm->vcpu);
2966        return 1;
2967}
2968
2969static int iret_interception(struct vcpu_svm *svm)
2970{
2971        ++svm->vcpu.stat.nmi_window_exits;
2972        clr_intercept(svm, INTERCEPT_IRET);
2973        svm->vcpu.arch.hflags |= HF_IRET_MASK;
2974        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2975        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2976        return 1;
2977}
2978
2979static int invlpg_interception(struct vcpu_svm *svm)
2980{
2981        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2982                return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2983
2984        kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2985        skip_emulated_instruction(&svm->vcpu);
2986        return 1;
2987}
2988
2989static int emulate_on_interception(struct vcpu_svm *svm)
2990{
2991        return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2992}
2993
2994static int rdpmc_interception(struct vcpu_svm *svm)
2995{
2996        int err;
2997
2998        if (!static_cpu_has(X86_FEATURE_NRIPS))
2999                return emulate_on_interception(svm);
3000
3001        err = kvm_rdpmc(&svm->vcpu);
3002        kvm_complete_insn_gp(&svm->vcpu, err);
3003
3004        return 1;
3005}
3006
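/*
 * With the selective CR0 write intercept, L1 only wants a CR0_SEL_WRITE
 * #vmexit when a mov-to-CR0 changes bits outside SVM_CR0_SELECTIVE_MASK.
 * Compare old and new values with those bits masked off and reflect the
 * exit to L1 only if something else actually changed.
 */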
3007static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
3008                                            unsigned long val)
3009{
3010        unsigned long cr0 = svm->vcpu.arch.cr0;
3011        bool ret = false;
3012        u64 intercept;
3013
3014        intercept = svm->nested.intercept;
3015
3016        if (!is_guest_mode(&svm->vcpu) ||
3017            (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
3018                return false;
3019
3020        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
3021        val &= ~SVM_CR0_SELECTIVE_MASK;
3022
3023        if (cr0 ^ val) {
3024                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
3025                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
3026        }
3027
3028        return ret;
3029}
3030
3031#define CR_VALID (1ULL << 63)
3032
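/*
 * With decode assists the hardware tells us which CR and which GPR were
 * involved: bit 63 (CR_VALID) of exit_info_1 says the information is
 * present and SVM_EXITINFO_REG_MASK selects the GPR, while the exit code
 * itself encodes the CR number and direction (writes are offset by 16
 * from the corresponding reads).
 */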
3033static int cr_interception(struct vcpu_svm *svm)
3034{
3035        int reg, cr;
3036        unsigned long val;
3037        int err;
3038
3039        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3040                return emulate_on_interception(svm);
3041
3042        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
3043                return emulate_on_interception(svm);
3044
3045        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3046        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
3047                cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
3048        else
3049                cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
3050
3051        err = 0;
3052        if (cr >= 16) { /* mov to cr */
3053                cr -= 16;
3054                val = kvm_register_read(&svm->vcpu, reg);
3055                switch (cr) {
3056                case 0:
3057                        if (!check_selective_cr0_intercepted(svm, val))
3058                                err = kvm_set_cr0(&svm->vcpu, val);
3059                        else
3060                                return 1;
3061
3062                        break;
3063                case 3:
3064                        err = kvm_set_cr3(&svm->vcpu, val);
3065                        break;
3066                case 4:
3067                        err = kvm_set_cr4(&svm->vcpu, val);
3068                        break;
3069                case 8:
3070                        err = kvm_set_cr8(&svm->vcpu, val);
3071                        break;
3072                default:
3073                        WARN(1, "unhandled write to CR%d", cr);
3074                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3075                        return 1;
3076                }
3077        } else { /* mov from cr */
3078                switch (cr) {
3079                case 0:
3080                        val = kvm_read_cr0(&svm->vcpu);
3081                        break;
3082                case 2:
3083                        val = svm->vcpu.arch.cr2;
3084                        break;
3085                case 3:
3086                        val = kvm_read_cr3(&svm->vcpu);
3087                        break;
3088                case 4:
3089                        val = kvm_read_cr4(&svm->vcpu);
3090                        break;
3091                case 8:
3092                        val = kvm_get_cr8(&svm->vcpu);
3093                        break;
3094                default:
3095                        WARN(1, "unhandled read from CR%d", cr);
3096                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3097                        return 1;
3098                }
3099                kvm_register_write(&svm->vcpu, reg, val);
3100        }
3101        kvm_complete_insn_gp(&svm->vcpu, err);
3102
3103        return 1;
3104}
3105
3106static int dr_interception(struct vcpu_svm *svm)
3107{
3108        int reg, dr;
3109        unsigned long val;
3110
3111        if (svm->vcpu.guest_debug == 0) {
3112                /*
3113                 * No more DR vmexits; force a reload of the debug registers
3114                 * and reenter on this instruction.  The next vmexit will
3115                 * retrieve the full state of the debug registers.
3116                 */
3117                clr_dr_intercepts(svm);
3118                svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
3119                return 1;
3120        }
3121
3122        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
3123                return emulate_on_interception(svm);
3124
3125        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3126        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
3127
3128        if (dr >= 16) { /* mov to DRn */
3129                if (!kvm_require_dr(&svm->vcpu, dr - 16))
3130                        return 1;
3131                val = kvm_register_read(&svm->vcpu, reg);
3132                kvm_set_dr(&svm->vcpu, dr - 16, val);
3133        } else {
3134                if (!kvm_require_dr(&svm->vcpu, dr))
3135                        return 1;
3136                kvm_get_dr(&svm->vcpu, dr, &val);
3137                kvm_register_write(&svm->vcpu, reg, val);
3138        }
3139
3140        skip_emulated_instruction(&svm->vcpu);
3141
3142        return 1;
3143}
3144
3145static int cr8_write_interception(struct vcpu_svm *svm)
3146{
3147        struct kvm_run *kvm_run = svm->vcpu.run;
3148        int r;
3149
3150        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3151        /* instruction emulation calls kvm_set_cr8() */
3152        r = cr_interception(svm);
3153        if (irqchip_in_kernel(svm->vcpu.kvm))
3154                return r;
3155        if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3156                return r;
3157        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3158        return 0;
3159}
3160
3161static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3162{
3163        struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3164        return vmcb->control.tsc_offset +
3165                svm_scale_tsc(vcpu, host_tsc);
3166}
3167
3168static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3169{
3170        struct vcpu_svm *svm = to_svm(vcpu);
3171
3172        switch (msr_info->index) {
3173        case MSR_IA32_TSC: {
3174                msr_info->data = svm->vmcb->control.tsc_offset +
3175                        svm_scale_tsc(vcpu, native_read_tsc());
3176
3177                break;
3178        }
3179        case MSR_STAR:
3180                msr_info->data = svm->vmcb->save.star;
3181                break;
3182#ifdef CONFIG_X86_64
3183        case MSR_LSTAR:
3184                msr_info->data = svm->vmcb->save.lstar;
3185                break;
3186        case MSR_CSTAR:
3187                msr_info->data = svm->vmcb->save.cstar;
3188                break;
3189        case MSR_KERNEL_GS_BASE:
3190                msr_info->data = svm->vmcb->save.kernel_gs_base;
3191                break;
3192        case MSR_SYSCALL_MASK:
3193                msr_info->data = svm->vmcb->save.sfmask;
3194                break;
3195#endif
3196        case MSR_IA32_SYSENTER_CS:
3197                msr_info->data = svm->vmcb->save.sysenter_cs;
3198                break;
3199        case MSR_IA32_SYSENTER_EIP:
3200                msr_info->data = svm->sysenter_eip;
3201                break;
3202        case MSR_IA32_SYSENTER_ESP:
3203                msr_info->data = svm->sysenter_esp;
3204                break;
3205        /*
3206         * Nobody will change the following 5 values in the VMCB so we can
3207         * safely return them on rdmsr. They will always be 0 until LBRV is
3208         * implemented.
3209         */
3210        case MSR_IA32_DEBUGCTLMSR:
3211                msr_info->data = svm->vmcb->save.dbgctl;
3212                break;
3213        case MSR_IA32_LASTBRANCHFROMIP:
3214                msr_info->data = svm->vmcb->save.br_from;
3215                break;
3216        case MSR_IA32_LASTBRANCHTOIP:
3217                msr_info->data = svm->vmcb->save.br_to;
3218                break;
3219        case MSR_IA32_LASTINTFROMIP:
3220                msr_info->data = svm->vmcb->save.last_excp_from;
3221                break;
3222        case MSR_IA32_LASTINTTOIP:
3223                msr_info->data = svm->vmcb->save.last_excp_to;
3224                break;
3225        case MSR_VM_HSAVE_PA:
3226                msr_info->data = svm->nested.hsave_msr;
3227                break;
3228        case MSR_VM_CR:
3229                msr_info->data = svm->nested.vm_cr_msr;
3230                break;
3231        case MSR_IA32_UCODE_REV:
3232                msr_info->data = 0x01000065;
3233                break;
3234        default:
3235                return kvm_get_msr_common(vcpu, msr_info);
3236        }
3237        return 0;
3238}
3239
3240static int rdmsr_interception(struct vcpu_svm *svm)
3241{
3242        u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3243        struct msr_data msr_info;
3244
3245        msr_info.index = ecx;
3246        msr_info.host_initiated = false;
3247        if (svm_get_msr(&svm->vcpu, &msr_info)) {
3248                trace_kvm_msr_read_ex(ecx);
3249                kvm_inject_gp(&svm->vcpu, 0);
3250        } else {
3251                trace_kvm_msr_read(ecx, msr_info.data);
3252
3253                kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
3254                                   msr_info.data & 0xffffffff);
3255                kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
3256                                   msr_info.data >> 32);
3257                svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3258                skip_emulated_instruction(&svm->vcpu);
3259        }
3260        return 1;
3261}
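
Editor's note: RDMSR returns its 64-bit result split across EDX:EAX, which is why rdmsr_interception() writes data & 0xffffffff into RAX and data >> 32 into RDX, and wrmsr_interception() further down recombines the two halves via kvm_read_edx_eax(). A small standalone illustration of the split and its inverse:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
	uint64_t data = 0x0123456789abcdefULL;

	/* RDMSR emulation: low half goes to EAX, high half to EDX. */
	uint32_t eax = (uint32_t)(data & 0xffffffff);
	uint32_t edx = (uint32_t)(data >> 32);

	/* WRMSR emulation: recombine EDX:EAX into one 64-bit value. */
	uint64_t combined = ((uint64_t)edx << 32) | eax;

	assert(combined == data);
	printf("eax=%#x edx=%#x -> %#llx\n", eax, edx,
	       (unsigned long long)combined);
	return 0;
}
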
3262
3263static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3264{
3265        struct vcpu_svm *svm = to_svm(vcpu);
3266        int svm_dis, chg_mask;
3267
3268        if (data & ~SVM_VM_CR_VALID_MASK)
3269                return 1;
3270
3271        chg_mask = SVM_VM_CR_VALID_MASK;
3272
3273        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3274                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3275
3276        svm->nested.vm_cr_msr &= ~chg_mask;
3277        svm->nested.vm_cr_msr |= (data & chg_mask);
3278
3279        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3280
3281        /* check for svm_disable while efer.svme is set */
3282        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3283                return 1;
3284
3285        return 0;
3286}
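
Editor's note: svm_set_vm_cr() uses a change mask: only bits inside chg_mask may be modified, and once SVM_DIS is set the LOCK and DIS bits drop out of the mask so they can no longer be cleared by the guest. Here is the read-modify-write pattern in isolation, with purely illustrative bit positions that are not the real VM_CR layout:

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit layout only -- not the real VM_CR definition. */
#define VM_CR_VALID_MASK	0x1fULL
#define VM_CR_SVM_LOCK_MASK	(1ULL << 3)
#define VM_CR_SVM_DIS_MASK	(1ULL << 4)

static uint64_t vm_cr_write(uint64_t cur, uint64_t data)
{
	uint64_t chg_mask = VM_CR_VALID_MASK;

	/* Once SVM_DIS is set, the LOCK and DIS bits become read-only. */
	if (cur & VM_CR_SVM_DIS_MASK)
		chg_mask &= ~(VM_CR_SVM_LOCK_MASK | VM_CR_SVM_DIS_MASK);

	cur &= ~chg_mask;		/* clear every writable bit	*/
	cur |= (data & chg_mask);	/* ...and refill it from data	*/
	return cur;
}

int main(void)
{
	uint64_t cr = VM_CR_SVM_DIS_MASK;	/* DIS already set */

	/* Attempt to clear DIS: it is masked out, so the bit survives. */
	printf("%#llx\n", (unsigned long long)vm_cr_write(cr, 0));
	return 0;
}
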
3287
3288static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3289{
3290        struct vcpu_svm *svm = to_svm(vcpu);
3291
3292        u32 ecx = msr->index;
3293        u64 data = msr->data;
3294        switch (ecx) {
3295        case MSR_IA32_TSC:
3296                kvm_write_tsc(vcpu, msr);
3297                break;
3298        case MSR_STAR:
3299                svm->vmcb->save.star = data;
3300                break;
3301#ifdef CONFIG_X86_64
3302        case MSR_LSTAR:
3303                svm->vmcb->save.lstar = data;
3304                break;
3305        case MSR_CSTAR:
3306                svm->vmcb->save.cstar = data;
3307                break;
3308        case MSR_KERNEL_GS_BASE:
3309                svm->vmcb->save.kernel_gs_base = data;
3310                break;
3311        case MSR_SYSCALL_MASK:
3312                svm->vmcb->save.sfmask = data;
3313                break;
3314#endif
3315        case MSR_IA32_SYSENTER_CS:
3316                svm->vmcb->save.sysenter_cs = data;
3317                break;
3318        case MSR_IA32_SYSENTER_EIP:
3319                svm->sysenter_eip = data;
3320                svm->vmcb->save.sysenter_eip = data;
3321                break;
3322        case MSR_IA32_SYSENTER_ESP:
3323                svm->sysenter_esp = data;
3324                svm->vmcb->save.sysenter_esp = data;
3325                break;
3326        case MSR_IA32_DEBUGCTLMSR:
3327                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3328                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3329                                    __func__, data);
3330                        break;
3331                }
3332                if (data & DEBUGCTL_RESERVED_BITS)
3333                        return 1;
3334
3335                svm->vmcb->save.dbgctl = data;
3336                mark_dirty(svm->vmcb, VMCB_LBR);
3337                if (data & (1ULL << 0)) /* DEBUGCTL bit 0: LBR enable */
3338                        svm_enable_lbrv(svm);
3339                else
3340                        svm_disable_lbrv(svm);
3341                break;
3342        case MSR_VM_HSAVE_PA:
3343                svm->nested.hsave_msr = data;
3344                break;
3345        case MSR_VM_CR:
3346                return svm_set_vm_cr(vcpu, data);
3347        case MSR_VM_IGNNE:
3348                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3349                break;
3350        case MSR_IA32_CR_PAT:
3351                if (npt_enabled) {
3352                        if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3353                                return 1;
3354                        vcpu->arch.pat = data;
3355                        svm_set_guest_pat(svm, &svm->vmcb->save.g_pat);
3356                        mark_dirty(svm->vmcb, VMCB_NPT);
3357                        break;
3358                }
3359                /* fall through */
3360        default:
3361                return kvm_set_msr_common(vcpu, msr);
3362        }
3363        return 0;
3364}
3365
3366static int wrmsr_interception(struct vcpu_svm *svm)
3367{
3368        struct msr_data msr;
3369        u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3370        u64 data = kvm_read_edx_eax(&svm->vcpu);
3371
3372        msr.data = data;
3373        msr.index = ecx;
3374        msr.host_initiated = false;
3375
3376        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3377        if (kvm_set_msr(&svm->vcpu, &msr)) {
3378                trace_kvm_msr_write_ex(ecx, data);
3379                kvm_inject_gp(&svm->vcpu, 0);
3380        } else {
3381                trace_kvm_msr_write(ecx, data);
3382                skip_emulated_instruction(&svm->vcpu);
3383        }
3384        return 1;
3385}
3386
3387static int msr_interception(struct vcpu_svm *svm)
3388{
3389        if (svm->vmcb->control.exit_info_1)
3390                return wrmsr_interception(svm);
3391        else
3392                return rdmsr_interception(svm);
3393}
3394
3395static int interrupt_window_interception(struct vcpu_svm *svm)
3396{
3397        struct kvm_run *kvm_run = svm->vcpu.run;
3398
3399        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3400        svm_clear_vintr(svm);
3401        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3402        mark_dirty(svm->vmcb, VMCB_INTR);
3403        ++svm->vcpu.stat.irq_window_exits;
3404        /*
3405         * If user space is waiting to inject interrupts, exit as soon as
3406         * possible.
3407         */
3408        if (!irqchip_in_kernel(svm->vcpu.kvm) &&
3409            kvm_run->request_interrupt_window &&
3410            !kvm_cpu_has_interrupt(&svm->vcpu)) {
3411                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3412                return 0;
3413        }
3414
3415        return 1;
3416}
3417
3418static int pause_interception(struct vcpu_svm *svm)
3419{
3420        kvm_vcpu_on_spin(&(svm->vcpu));
3421        return 1;
3422}
3423
3424static int nop_interception(struct vcpu_svm *svm)
3425{
3426        skip_emulated_instruction(&(svm->vcpu));
3427        return 1;
3428}
3429
3430static int monitor_interception(struct vcpu_svm *svm)
3431{
3432        printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3433        return nop_interception(svm);
3434}
3435
3436static int mwait_interception(struct vcpu_svm *svm)
3437{
3438        printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3439        return nop_interception(svm);
3440}
3441
3442static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3443        [SVM_EXIT_READ_CR0]                     = cr_interception,
3444        [SVM_EXIT_READ_CR3]                     = cr_interception,
3445        [SVM_EXIT_READ_CR4]                     = cr_interception,
3446        [SVM_EXIT_READ_CR8]                     = cr_interception,
3447        [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3448        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3449        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3450        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3451        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3452        [SVM_EXIT_READ_DR0]                     = dr_interception,
3453        [SVM_EXIT_READ_DR1]                     = dr_interception,
3454        [SVM_EXIT_READ_DR2]                     = dr_interception,
3455        [SVM_EXIT_READ_DR3]                     = dr_interception,
3456        [SVM_EXIT_READ_DR4]                     = dr_interception,
3457        [SVM_EXIT_READ_DR5]                     = dr_interception,
3458        [SVM_EXIT_READ_DR6]                     = dr_interception,
3459        [SVM_EXIT_READ_DR7]                     = dr_interception,
3460        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3461        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3462        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3463        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3464        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3465        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3466        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3467        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3468        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3469        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3470        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3471        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3472        [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
3473        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3474        [SVM_EXIT_INTR]                         = intr_interception,
3475        [SVM_EXIT_NMI]                          = nmi_interception,
3476        [SVM_EXIT_SMI]                          = nop_on_interception,
3477        [SVM_EXIT_INIT]                         = nop_on_interception,
3478        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3479        [SVM_EXIT_RDPMC]                        = rdpmc_interception,
3480        [SVM_EXIT_CPUID]                        = cpuid_interception,
3481        [SVM_EXIT_IRET]                         = iret_interception,
3482        [SVM_EXIT_INVD]                         = emulate_on_interception,
3483        [SVM_EXIT_PAUSE]                        = pause_interception,
3484        [SVM_EXIT_HLT]                          = halt_interception,
3485        [SVM_EXIT_INVLPG]                       = invlpg_interception,
3486        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3487        [SVM_EXIT_IOIO]                         = io_interception,
3488        [SVM_EXIT_MSR]                          = msr_interception,
3489        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3490        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3491        [SVM_EXIT_VMRUN]                        = vmrun_interception,
3492        [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
3493        [SVM_EXIT_VMLOAD]                       = vmload_interception,
3494        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3495        [SVM_EXIT_STGI]                         = stgi_interception,
3496        [SVM_EXIT_CLGI]                         = clgi_interception,
3497        [SVM_EXIT_SKINIT]                       = skinit_interception,
3498        [SVM_EXIT_WBINVD]                       = wbinvd_interception,
3499        [SVM_EXIT_MONITOR]                      = monitor_interception,
3500        [SVM_EXIT_MWAIT]                        = mwait_interception,
3501        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
3502        [SVM_EXIT_NPF]                          = pf_interception,
3503        [SVM_EXIT_RSM]                          = emulate_on_interception,
3504};
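
Editor's note: svm_exit_handlers[] is a sparse array indexed directly by the hardware exit code, with NULL entries for anything unhandled, which is why handle_exit() below checks both the array bound and the entry before dispatching. A minimal sketch of the same table-dispatch pattern, using toy exit codes rather than the real SVM numbering:

#include <stdio.h>

struct vcpu { int id; };

static int handle_hlt(struct vcpu *v) { printf("vcpu %d: hlt\n", v->id); return 1; }
static int handle_io(struct vcpu *v)  { printf("vcpu %d: io\n",  v->id); return 1; }

/* Toy exit codes -- illustrative only. */
enum { EXIT_HLT = 3, EXIT_IO = 7, EXIT_MAX = 16 };

static int (*const exit_handlers[EXIT_MAX])(struct vcpu *) = {
	[EXIT_HLT] = handle_hlt,
	[EXIT_IO]  = handle_io,
};

static int dispatch(struct vcpu *v, unsigned int exit_code)
{
	/* Same guard as handle_exit(): bounds check, then NULL check. */
	if (exit_code >= EXIT_MAX || !exit_handlers[exit_code]) {
		fprintf(stderr, "unexpected exit reason 0x%x\n", exit_code);
		return -1;
	}
	return exit_handlers[exit_code](v);
}

int main(void)
{
	struct vcpu v = { .id = 0 };

	dispatch(&v, EXIT_HLT);
	dispatch(&v, 5);	/* hits the NULL-entry guard */
	return 0;
}
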
3505
3506static void dump_vmcb(struct kvm_vcpu *vcpu)
3507{
3508        struct vcpu_svm *svm = to_svm(vcpu);
3509        struct vmcb_control_area *control = &svm->vmcb->control;
3510        struct vmcb_save_area *save = &svm->vmcb->save;
3511
3512        pr_err("VMCB Control Area:\n");
3513        pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3514        pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3515        pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3516        pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3517        pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3518        pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3519        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3520        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3521        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3522        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3523        pr_err("%-20s%d\n", "asid:", control->asid);
3524        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3525        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3526        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3527        pr_err("%-20s%08x\n", "int_state:", control->int_state);
3528        pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3529        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3530        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3531        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3532        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3533        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3534        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3535        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3536        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3537        pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3538        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3539        pr_err("VMCB State Save Area:\n");
3540        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3541               "es:",
3542               save->es.selector, save->es.attrib,
3543               save->es.limit, save->es.base);
3544        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3545               "cs:",
3546               save->cs.selector, save->cs.attrib,
3547               save->cs.limit, save->cs.base);
3548        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3549               "ss:",
3550               save->ss.selector, save->ss.attrib,
3551               save->ss.limit, save->ss.base);
3552        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3553               "ds:",
3554               save->ds.selector, save->ds.attrib,
3555               save->ds.limit, save->ds.base);
3556        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3557               "fs:",
3558               save->fs.selector, save->fs.attrib,
3559               save->fs.limit, save->fs.base);
3560        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3561               "gs:",
3562               save->gs.selector, save->gs.attrib,
3563               save->gs.limit, save->gs.base);
3564        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3565               "gdtr:",
3566               save->gdtr.selector, save->gdtr.attrib,
3567               save->gdtr.limit, save->gdtr.base);
3568        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3569               "ldtr:",
3570               save->ldtr.selector, save->ldtr.attrib,
3571               save->ldtr.limit, save->ldtr.base);
3572        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3573               "idtr:",
3574               save->idtr.selector, save->idtr.attrib,
3575               save->idtr.limit, save->idtr.base);
3576        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3577               "tr:",
3578               save->tr.selector, save->tr.attrib,
3579               save->tr.limit, save->tr.base);
3580        pr_err("cpl:            %d                efer:         %016llx\n",
3581                save->cpl, save->efer);
3582        pr_err("%-15s %016llx %-13s %016llx\n",
3583               "cr0:", save->cr0, "cr2:", save->cr2);
3584        pr_err("%-15s %016llx %-13s %016llx\n",
3585               "cr3:", save->cr3, "cr4:", save->cr4);
3586        pr_err("%-15s %016llx %-13s %016llx\n",
3587               "dr6:", save->dr6, "dr7:", save->dr7);
3588        pr_err("%-15s %016llx %-13s %016llx\n",
3589               "rip:", save->rip, "rflags:", save->rflags);
3590        pr_err("%-15s %016llx %-13s %016llx\n",
3591               "rsp:", save->rsp, "rax:", save->rax);
3592        pr_err("%-15s %016llx %-13s %016llx\n",
3593               "star:", save->star, "lstar:", save->lstar);
3594        pr_err("%-15s %016llx %-13s %016llx\n",
3595               "cstar:", save->cstar, "sfmask:", save->sfmask);
3596        pr_err("%-15s %016llx %-13s %016llx\n",
3597               "kernel_gs_base:", save->kernel_gs_base,
3598               "sysenter_cs:", save->sysenter_cs);
3599        pr_err("%-15s %016llx %-13s %016llx\n",
3600               "sysenter_esp:", save->sysenter_esp,
3601               "sysenter_eip:", save->sysenter_eip);
3602        pr_err("%-15s %016llx %-13s %016llx\n",
3603               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3604        pr_err("%-15s %016llx %-13s %016llx\n",
3605               "br_from:", save->br_from, "br_to:", save->br_to);
3606        pr_err("%-15s %016llx %-13s %016llx\n",
3607               "excp_from:", save->last_excp_from,
3608               "excp_to:", save->last_excp_to);
3609}
3610
3611static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3612{
3613        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3614
3615        *info1 = control->exit_info_1;
3616        *info2 = control->exit_info_2;
3617}
3618
3619static int handle_exit(struct kvm_vcpu *vcpu)
3620{
3621        struct vcpu_svm *svm = to_svm(vcpu);
3622        struct kvm_run *kvm_run = vcpu->run;
3623        u32 exit_code = svm->vmcb->control.exit_code;
3624
3625        if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3626                vcpu->arch.cr0 = svm->vmcb->save.cr0;
3627        if (npt_enabled)
3628                vcpu->arch.cr3 = svm->vmcb->save.cr3;
3629
3630        if (unlikely(svm->nested.exit_required)) {
3631                nested_svm_vmexit(svm);
3632                svm->nested.exit_required = false;
3633
3634                return 1;
3635        }
3636
3637        if (is_guest_mode(vcpu)) {
3638                int vmexit;
3639
3640                trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
3641                                        svm->vmcb->control.exit_info_1,
3642                                        svm->vmcb->control.exit_info_2,
3643                                        svm->vmcb->control.exit_int_info,
3644                                        svm->vmcb->control.exit_int_info_err,
3645                                        KVM_ISA_SVM);
3646
3647                vmexit = nested_svm_exit_special(svm);
3648
3649                if (vmexit == NESTED_EXIT_CONTINUE)
3650                        vmexit = nested_svm_exit_handled(svm);
3651
3652                if (vmexit == NESTED_EXIT_DONE)
3653                        return 1;
3654        }
3655
3656        svm_complete_interrupts(svm);
3657
3658        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3659                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3660                kvm_run->fail_entry.hardware_entry_failure_reason
3661                        = svm->vmcb->control.exit_code;
3662                pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
3663                dump_vmcb(vcpu);
3664                return 0;
3665        }
3666
3667        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3668            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3669            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3670            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3671                printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3672                       "exit_code 0x%x\n",
3673                       __func__, svm->vmcb->control.exit_int_info,
3674                       exit_code);
3675
3676        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3677            || !svm_exit_handlers[exit_code]) {
3678                WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
3679                kvm_queue_exception(vcpu, UD_VECTOR);
3680                return 1;
3681        }
3682
3683        return svm_exit_handlers[exit_code](svm);
3684}
3685
3686static void reload_tss(struct kvm_vcpu *vcpu)
3687{
3688        int cpu = raw_smp_processor_id();
3689
3690        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3691        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3692        load_TR_desc();
3693}
3694
3695static void pre_svm_run(struct vcpu_svm *svm)
3696{
3697        int cpu = raw_smp_processor_id();
3698
3699        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3700
3701        /* FIXME: handle wraparound of asid_generation */
3702        if (svm->asid_generation != sd->asid_generation)
3703                new_asid(svm, sd);
3704}
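
Editor's note: pre_svm_run() compares the vCPU's cached asid_generation with the per-CPU value; a mismatch means this CPU has since started a new ASID generation (for example after running out of ASIDs), so the vCPU must take a fresh ASID before VMRUN. The following is a self-contained sketch of that generation scheme under illustrative structures; it only mirrors the idea and is not the kernel's svm_cpu_data or new_asid() implementation:

#include <stdio.h>

/* Illustrative stand-ins for the per-CPU and per-vCPU ASID state. */
struct cpu_data  { unsigned long asid_generation; unsigned int max_asid, next_asid; };
struct vcpu_asid { unsigned long asid_generation; unsigned int asid; };

static void new_asid(struct vcpu_asid *v, struct cpu_data *cd)
{
	if (cd->next_asid > cd->max_asid) {
		/* Out of ASIDs: start a new generation and reuse from 1
		 * (a real implementation would also flush the TLB here). */
		++cd->asid_generation;
		cd->next_asid = 1;
	}
	v->asid_generation = cd->asid_generation;
	v->asid = cd->next_asid++;
}

static void pre_run(struct vcpu_asid *v, struct cpu_data *cd)
{
	/* Same test as pre_svm_run(): a stale generation means a new ASID. */
	if (v->asid_generation != cd->asid_generation)
		new_asid(v, cd);
}

int main(void)
{
	struct cpu_data cd = { .asid_generation = 1, .max_asid = 2, .next_asid = 1 };
	struct vcpu_asid a = { 0 }, b = { 0 }, c = { 0 };

	pre_run(&a, &cd); pre_run(&b, &cd); pre_run(&c, &cd);
	printf("a: gen %lu asid %u\n", a.asid_generation, a.asid);
	printf("c: gen %lu asid %u\n", c.asid_generation, c.asid);
	return 0;
}
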
3705
3706static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3707{
3708        struct vcpu_svm *svm = to_svm(vcpu);
3709
3710        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3711        vcpu->arch.hflags |= HF_NMI_MASK;
3712        set_intercept(svm, INTERCEPT_IRET);
3713        ++vcpu->stat.nmi_injections;
3714}
3715
3716static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3717{
3718        struct vmcb_control_area *control;
3719
3720        control = &svm->vmcb->control;
3721        control->int_vector = irq;
3722        control->int_ctl &= ~V_INTR_PRIO_MASK;
3723        control->int_ctl |= V_IRQ_MASK |
3724                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3725        mark_dirty(svm->vmcb, VMCB_INTR);
3726}
3727
3728static void svm_set_irq(struct kvm_vcpu *vcpu)
3729{
3730        struct vcpu_svm *svm = to_svm(vcpu);
3731
3732        BUG_ON(!(gif_set(svm)));
3733
3734        trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3735        ++vcpu->stat.irq_injections;
3736
3737        svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3738                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3739}
3740
3741static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3742{
3743        struct vcpu_svm *svm = to_svm(vcpu);
3744
3745        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3746                return;
3747
3748        clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3749
3750        if (irr == -1)
3751                return;
3752
3753        if (tpr >= irr)
3754                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3755}
3756
3757static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3758{
3759        return;
3760}
3761
3762static int svm_vm_has_apicv(struct kvm *kvm)
3763{
3764        return 0;
3765}
3766
3767static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
3768{
3769        return;
3770}
3771
3772static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
3773{
3774        return;
3775}
3776
3777static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3778{
3779        struct vcpu_svm *svm = to_svm(vcpu);
3780        struct vmcb *vmcb = svm->vmcb;
3781        int ret;
3782        ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
3783              !(svm->vcpu.arch.hflags & HF_NMI_MASK);
3784        ret = ret && gif_set(svm) && nested_svm_nmi(svm);
3785
3786        return ret;
3787}
3788
3789static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3790{
3791        struct vcpu_svm *svm = to_svm(vcpu);
3792
3793        return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
3794}
3795
3796static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3797{
3798        struct vcpu_svm *svm = to_svm(vcpu);
3799
3800        if (masked) {
3801                svm->vcpu.arch.hflags |= HF_NMI_MASK;
3802                set_intercept(svm, INTERCEPT_IRET);
3803        } else {
3804                svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3805                clr_intercept(svm, INTERCEPT_IRET);
3806        }
3807}
3808
3809static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3810{
3811        struct vcpu_svm *svm = to_svm(vcpu);
3812        struct vmcb *vmcb = svm->vmcb;
3813        int ret;
3814
3815        if (!gif_set(svm) ||
3816             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
3817                return 0;
3818
3819        ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3820
3821        if (is_guest_mode(vcpu))
3822                return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3823
3824        return ret;
3825}
3826
3827static void enable_irq_window(struct kvm_vcpu *vcpu)
3828{
3829        struct vcpu_svm *svm = to_svm(vcpu);
3830
3831        /*
3832         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3833         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3834         * get that intercept, this function will be called again and we
3835         * will then get the VINTR intercept.
3836         */
3837        if (gif_set(svm) && nested_svm_intr(svm)) {
3838                svm_set_vintr(svm);
3839                svm_inject_irq(svm, 0x0);
3840        }
3841}
3842
3843static void enable_nmi_window(struct kvm_vcpu *vcpu)
3844{
3845        struct vcpu_svm *svm = to_svm(vcpu);
3846
3847        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3848            == HF_NMI_MASK)
3849                return; /* IRET will cause a vm exit */
3850
3851        /*
3852         * Something prevents the NMI from being injected.  Single step over
3853         * the possible problem (IRET, exception injection, or interrupt shadow).
3854         */
3855        svm->nmi_singlestep = true;
3856        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3857        update_db_bp_intercept(vcpu);
3858}
3859
3860static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3861{
3862        return 0;
3863}
3864
3865static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3866{
3867        struct vcpu_svm *svm = to_svm(vcpu);
3868
3869        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3870                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3871        else
3872                svm->asid_generation--;
3873}
3874
3875static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
3876{
3877}
3878
3879static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3880{
3881        struct vcpu_svm *svm = to_svm(vcpu);
3882
3883        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3884                return;
3885
3886        if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3887                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3888                kvm_set_cr8(vcpu, cr8);
3889        }
3890}
3891
3892static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3893{
3894        struct vcpu_svm *svm = to_svm(vcpu);
3895        u64 cr8;
3896
3897        if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3898                return;
3899
3900        cr8 = kvm_get_cr8(vcpu);
3901        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3902        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3903}
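
Editor's note: sync_lapic_to_cr8() copies the low TPR bits of CR8 into the V_TPR field of int_ctl, and sync_cr8_to_lapic() above reads them back; both are plain read-modify-write operations on a small bitfield. The same logic as a standalone snippet, assuming V_TPR occupies the low four bits of int_ctl (the mask value here is an assumption, not a quote of the header):

#include <stdint.h>
#include <stdio.h>

#define V_TPR_MASK 0x0f	/* assumed: low 4 bits of int_ctl hold the virtual TPR */

static uint32_t lapic_to_cr8(uint32_t int_ctl, uint64_t cr8)
{
	int_ctl &= ~V_TPR_MASK;		/* clear the old V_TPR value	*/
	int_ctl |= cr8 & V_TPR_MASK;	/* insert the new task priority	*/
	return int_ctl;
}

static uint64_t cr8_from_int_ctl(uint32_t int_ctl)
{
	return int_ctl & V_TPR_MASK;
}

int main(void)
{
	uint32_t int_ctl = 0x01000000;	/* unrelated control bits already set */

	int_ctl = lapic_to_cr8(int_ctl, 0x9);
	printf("int_ctl=%#x cr8=%#llx\n", int_ctl,
	       (unsigned long long)cr8_from_int_ctl(int_ctl));
	return 0;
}
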
3904
3905static void svm_complete_interrupts(struct vcpu_svm *svm)
3906{
3907        u8 vector;
3908        int type;
3909        u32 exitintinfo = svm->vmcb->control.exit_int_info;
3910        unsigned int3_injected = svm->int3_injected;
3911
3912        svm->int3_injected = 0;
3913
3914        /*
3915         * If we've made progress since setting HF_IRET_MASK, we've
3916         * executed an IRET and can allow NMI injection.
3917         */
3918        if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3919            && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3920                svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3921                kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3922        }
3923
3924        svm->vcpu.arch.nmi_injected = false;
3925        kvm_clear_exception_queue(&svm->vcpu);
3926        kvm_clear_interrupt_queue(&svm->vcpu);
3927
3928        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3929                return;
3930
3931        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3932
3933        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3934        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3935
3936        switch (type) {
3937        case SVM_EXITINTINFO_TYPE_NMI:
3938                svm->vcpu.arch.nmi_injected = true;
3939                break;
3940        case SVM_EXITINTINFO_TYPE_EXEPT:
3941                /*
3942                 * In case of software exceptions, do not reinject the vector,
3943                 * but re-execute the instruction instead. Rewind RIP first
3944                 * if we emulated INT3 before.
3945                 */
3946                if (kvm_exception_is_soft(vector)) {
3947                        if (vector == BP_VECTOR && int3_injected &&
3948                            kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
3949                                kvm_rip_write(&svm->vcpu,
3950                                              kvm_rip_read(&svm->vcpu) -
3951                                              int3_injected);
3952                        break;
3953                }
3954                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3955                        u32 err = svm->vmcb->control.exit_int_info_err;
3956                        kvm_requeue_exception_e(&svm->vcpu, vector, err);
3957
3958                } else
3959                        kvm_requeue_exception(&svm->vcpu, vector);
3960                break;
3961        case SVM_EXITINTINFO_TYPE_INTR:
3962                kvm_queue_interrupt(&svm->vcpu, vector, false);
3963                break;
3964        default:
3965                break;
3966        }
3967}
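
Editor's note: svm_complete_interrupts() unpacks EXITINTINFO into a vector and an event type before deciding whether to requeue the interrupted event. The decode below uses bit positions assumed from the SVM EXITINTINFO format (vector in bits 7:0, type in bits 10:8, error-code-valid in bit 11, valid in bit 31) rather than the kernel's macro values:

#include <stdint.h>
#include <stdio.h>

/* Assumed EXITINTINFO field layout (see the SVM spec). */
#define EXITINTINFO_VEC_MASK	0xffu
#define EXITINTINFO_TYPE_MASK	0x700u
#define EXITINTINFO_TYPE_INTR	(0u << 8)
#define EXITINTINFO_TYPE_NMI	(2u << 8)
#define EXITINTINFO_TYPE_EXEPT	(3u << 8)
#define EXITINTINFO_VALID_ERR	(1u << 11)
#define EXITINTINFO_VALID	(1u << 31)

static void decode(uint32_t exitintinfo)
{
	unsigned int vector = exitintinfo & EXITINTINFO_VEC_MASK;
	uint32_t type = exitintinfo & EXITINTINFO_TYPE_MASK;

	if (!(exitintinfo & EXITINTINFO_VALID)) {
		puts("no event was being delivered");
		return;
	}

	switch (type) {
	case EXITINTINFO_TYPE_NMI:
		puts("NMI");
		break;
	case EXITINTINFO_TYPE_EXEPT:
		printf("exception %u%s\n", vector,
		       (exitintinfo & EXITINTINFO_VALID_ERR) ?
		       " (with error code)" : "");
		break;
	case EXITINTINFO_TYPE_INTR:
		printf("external interrupt %u\n", vector);
		break;
	default:
		printf("other event type %#x\n", type >> 8);
		break;
	}
}

int main(void)
{
	decode(EXITINTINFO_VALID | EXITINTINFO_TYPE_EXEPT |
	       EXITINTINFO_VALID_ERR | 14);	/* #PF with error code */
	decode(EXITINTINFO_VALID | EXITINTINFO_TYPE_INTR | 0x20);
	return 0;
}
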
3968
3969static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3970{
3971        struct vcpu_svm *svm = to_svm(vcpu);
3972        struct vmcb_control_area *control = &svm->vmcb->control;
3973
3974        control->exit_int_info = control->event_inj;
3975        control->exit_int_info_err = control->event_inj_err;
3976        control->event_inj = 0;
3977        svm_complete_interrupts(svm);
3978}
3979
3980static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3981{
3982        struct vcpu_svm *svm = to_svm(vcpu);
3983
3984        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3985        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3986        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3987
3988        /*
3989         * A vmexit emulation is required before the vcpu can be executed
3990         * again.
3991         */
3992        if (unlikely(svm->nested.exit_required))
3993                return;
3994
3995        pre_svm_run(svm);
3996
3997        sync_lapic_to_cr8(vcpu);
3998
3999        svm->vmcb->save.cr2 = vcpu->arch.cr2;
4000
4001        clgi();
4002
4003        local_irq_enable();
4004
4005        asm volatile (
4006                "push %%" _ASM_BP "; \n\t"
4007                "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
4008                "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
4009                "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
4010                "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
4011                "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
4012                "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
4013#ifdef CONFIG_X86_64
4014                "mov %c[r8](%[svm]),  %%r8  \n\t"
4015                "mov %c[r9](%[svm]),  %%r9  \n\t"
4016                "mov %c[r10](%[svm]), %%r10 \n\t"
4017                "mov %c[r11](%[svm]), %%r11 \n\t"
4018                "mov %c[r12](%[svm]), %%r12 \n\t"
4019                "mov %c[r13](%[svm]), %%r13 \n\t"
4020                "mov %c[r14](%[svm]), %%r14 \n\t"
4021                "mov %c[r15](%[svm]), %%r15 \n\t"
4022#endif
4023
4024                /* Enter guest mode */
4025                "push %%" _ASM_AX " \n\t"
4026                "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
4027                __ex(SVM_VMLOAD) "\n\t"
4028                __ex(SVM_VMRUN) "\n\t"
4029                __ex(SVM_VMSAVE) "\n\t"
4030                "pop %%" _ASM_AX " \n\t"
4031
4032                /* Save guest registers, load host registers */
4033                "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
4034                "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
4035                "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
4036                "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
4037                "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
4038                "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
4039#ifdef CONFIG_X86_64
4040                "mov %%r8,  %c[r8](%[svm]) \n\t"
4041                "mov %%r9,  %c[r9](%[svm]) \n\t"
4042                "mov %%r10, %c[r10](%[svm]) \n\t"
4043                "mov %%r11, %c[r11](%[svm]) \n\t"
4044                "mov %%r12, %c[r12](%[svm]) \n\t"
4045                "mov %%r13, %c[r13](%[svm]) \n\t"
4046                "mov %%r14, %c[r14](%[svm]) \n\t"
4047                "mov %%r15, %c[r15](%[svm]) \n\t"
4048#endif
4049                "pop %%" _ASM_BP
4050                :
4051                : [svm]"a"(svm),
4052                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
4053                  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
4054                  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
4055                  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
4056                  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
4057                  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
4058                  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
4059#ifdef CONFIG_X86_64
4060                  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
4061                  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
4062                  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
4063                  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
4064                  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
4065                  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
4066                  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
4067                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
4068#endif
4069                : "cc", "memory"
4070#ifdef CONFIG_X86_64
4071                , "rbx", "rcx", "rdx", "rsi", "rdi"
4072                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
4073#else
4074                , "ebx", "ecx", "edx", "esi", "edi"
4075#endif
4076                );
4077
4078#ifdef CONFIG_X86_64
4079        wrmsrl(MSR_GS_BASE, svm->host.gs_base);
4080#else
4081        loadsegment(fs, svm->host.fs);
4082#ifndef CONFIG_X86_32_LAZY_GS
4083        loadsegment(gs, svm->host.gs);
4084#endif
4085#endif
4086
4087        reload_tss(vcpu);
4088
4089        local_irq_disable();
4090
4091        vcpu->arch.cr2 = svm->vmcb->save.cr2;
4092        vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4093        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4094        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4095
4096        trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
4097
4098        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4099                kvm_before_handle_nmi(&svm->vcpu);
4100
4101        stgi();
4102
4103        /* Any pending NMI will happen here */
4104
4105        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4106                kvm_after_handle_nmi(&svm->vcpu);
4107
4108        sync_cr8_to_lapic(vcpu);
4109
4110        svm->next_rip = 0;
4111
4112        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4113
4114        /* if exit due to PF check for async PF */
4115        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4116                svm->apf_reason = kvm_read_and_reset_pf_reason();
4117
4118        if (npt_enabled) {
4119                vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
4120                vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
4121        }
4122
4123        /*
4124         * We need to handle MC intercepts here before the vcpu has a chance to
4125         * change the physical cpu
4126         */
4127        if (unlikely(svm->vmcb->control.exit_code ==
4128                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
4129                svm_handle_mce(svm);
4130
4131        mark_all_clean(svm->vmcb);
4132}
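
Editor's note: the VMRUN asm block above addresses each guest register slot as a compile-time immediate offset from the vcpu_svm pointer, which is what the [rbx]"i"(offsetof(...)) operands and the %c[rbx](%[svm]) addressing express. Here is a tiny userspace analogue of accessing struct fields through a base pointer plus an offsetof() constant; the register-file struct is purely illustrative:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

enum { REG_RAX, REG_RBX, REG_RCX, NR_REGS };	/* illustrative register file */

struct vcpu_regs {
	unsigned long regs[NR_REGS];
};

/* Access a field given only a base pointer and a compile-time offset,
 * analogous to how the asm operands address the guest register slots. */
static unsigned long load_at(void *base, size_t offset)
{
	unsigned long val;

	memcpy(&val, (char *)base + offset, sizeof(val));
	return val;
}

int main(void)
{
	struct vcpu_regs v = { .regs = { 0x1111, 0x2222, 0x3333 } };
	size_t off_rbx = offsetof(struct vcpu_regs, regs[REG_RBX]);

	printf("rbx slot at offset %zu holds %#lx\n",
	       off_rbx, load_at(&v, off_rbx));
	return 0;
}
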
4133
4134static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4135{
4136        struct vcpu_svm *svm = to_svm(vcpu);
4137
4138        svm->vmcb->save.cr3 = root;
4139        mark_dirty(svm->vmcb, VMCB_CR);
4140        svm_flush_tlb(vcpu);
4141}
4142
4143static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4144{
4145        struct vcpu_svm *svm = to_svm(vcpu);
4146
4147        svm->vmcb->control.nested_cr3 = root;
4148        mark_dirty(svm->vmcb, VMCB_NPT);
4149
4150        /* Also sync guest cr3 here in case we live migrate */
4151        svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
4152        mark_dirty(svm->vmcb, VMCB_CR);
4153
4154        svm_flush_tlb(vcpu);
4155}
4156
4157static int is_disabled(void)
4158{
4159        u64 vm_cr;
4160
4161        rdmsrl(MSR_VM_CR, vm_cr);
4162        if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4163                return 1;
4164
4165        return 0;
4166}
4167
4168static void
4169svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4170{
4171        /*
4172         * Patch in the VMMCALL instruction:
4173         */
4174        hypercall[0] = 0x0f;
4175        hypercall[1] = 0x01;
4176        hypercall[2] = 0xd9;
4177}
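
Editor's note: svm_patch_hypercall() rewrites the guest's hypercall site with the three-byte VMMCALL encoding 0f 01 d9 (the VMX backend patches in VMCALL, 0f 01 c1, instead). A trivial sketch of the patching step, writing into a local buffer rather than guest memory:

#include <stdio.h>

static void patch_vmmcall(unsigned char *insn)
{
	/* VMMCALL: 0f 01 d9 */
	insn[0] = 0x0f;
	insn[1] = 0x01;
	insn[2] = 0xd9;
}

int main(void)
{
	unsigned char buf[3] = { 0x90, 0x90, 0x90 };	/* placeholder NOPs */

	patch_vmmcall(buf);
	printf("%02x %02x %02x\n", buf[0], buf[1], buf[2]);
	return 0;
}
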
4178
4179static void svm_check_processor_compat(void *rtn)
4180{
4181        *(int *)rtn = 0;
4182}
4183
4184static bool svm_cpu_has_accelerated_tpr(void)
4185{
4186        return false;
4187}
4188
4189static bool svm_has_high_real_mode_segbase(void)
4190{
4191        return true;
4192}
4193
4194static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4195{
4196}
4197
4198static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4199{
4200        switch (func) {
4201        case 0x80000001:
4202                if (nested)
4203                        entry->ecx |= (1 << 2); /* Set SVM bit */
4204                break;
4205        case 0x8000000A:
4206                entry->eax = 1; /* SVM revision 1 */
4207                entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
4208                                   ASID emulation to nested SVM */
4209                entry->ecx = 0; /* Reserved */
4210                entry->edx = 0; /* By default, do not support any
4211                                   additional features */
4212
4213                /* Support next_rip if host supports it */
4214                if (boot_cpu_has(X86_FEATURE_NRIPS))
4215                        entry->edx |= SVM_FEATURE_NRIP;
4216
4217                /* Support NPT for the guest if enabled */
4218                if (npt_enabled)
4219                        entry->edx |= SVM_FEATURE_NPT;
4220
4221                break;
4222        }
4223}
4224
4225static int svm_get_lpage_level(void)
4226{
4227        return PT_PDPE_LEVEL;
4228}
4229
4230static bool svm_rdtscp_supported(void)
4231{
4232        return false;
4233}
4234
4235static bool svm_invpcid_supported(void)
4236{
4237        return false;
4238}
4239
4240static bool svm_mpx_supported(void)
4241{
4242        return false;
4243}
4244
4245static bool svm_xsaves_supported(void)
4246{
4247        return false;
4248}
4249
4250static bool svm_has_wbinvd_exit(void)
4251{
4252        return true;
4253}
4254
4255static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4256{
4257        struct vcpu_svm *svm = to_svm(vcpu);
4258
4259        set_exception_intercept(svm, NM_VECTOR);
4260        update_cr0_intercept(svm);
4261}
4262
4263#define PRE_EX(exit)  { .exit_code = (exit), \
4264                        .stage = X86_ICPT_PRE_EXCEPT, }
4265#define POST_EX(exit) { .exit_code = (exit), \
4266                        .stage = X86_ICPT_POST_EXCEPT, }
4267#define POST_MEM(exit) { .exit_code = (exit), \
4268                        .stage = X86_ICPT_POST_MEMACCESS, }
4269
4270static const struct __x86_intercept {
4271        u32 exit_code;
4272        enum x86_intercept_stage stage;
4273} x86_intercept_map[] = {
4274        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4275        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4276        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4277        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4278        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4279        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4280        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4281        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4282        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4283        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4284        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4285        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4286        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4287        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4288        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4289        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4290        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4291        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4292        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4293        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4294        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4295        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4296        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4297        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4298        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4299        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4300        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4301        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4302        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4303        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4304        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4305        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4306        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4307        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4308        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4309        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4310        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4311        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4312        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4313        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4314        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4315        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4316        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4317        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4318        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4319        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4320};
4321
4322#undef PRE_EX
4323#undef POST_EX
4324#undef POST_MEM
4325
4326static int svm_check_intercept(struct kvm_vcpu *vcpu,
4327                               struct x86_instruction_info *info,
4328                               enum x86_intercept_stage stage)
4329{
4330        struct vcpu_svm *svm = to_svm(vcpu);
4331        int vmexit, ret = X86EMUL_CONTINUE;
4332        struct __x86_intercept icpt_info;
4333        struct vmcb *vmcb = svm->vmcb;
4334
4335        if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4336                goto out;
4337
4338        icpt_info = x86_intercept_map[info->intercept];
4339
4340        if (stage != icpt_info.stage)
4341                goto out;
4342
4343        switch (icpt_info.exit_code) {
4344        case SVM_EXIT_READ_CR0:
4345                if (info->intercept == x86_intercept_cr_read)
4346                        icpt_info.exit_code += info->modrm_reg;
4347                break;
4348        case SVM_EXIT_WRITE_CR0: {
4349                unsigned long cr0, val;
4350                u64 intercept;
4351
4352                if (info->intercept == x86_intercept_cr_write)
4353                        icpt_info.exit_code += info->modrm_reg;
4354
4355                if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4356                    info->intercept == x86_intercept_clts)
4357                        break;
4358
4359                intercept = svm->nested.intercept;
4360
4361                if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4362                        break;
4363
4364                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4365                val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4366
4367                if (info->intercept == x86_intercept_lmsw) {
4368                        cr0 &= 0xfUL;
4369                        val &= 0xfUL;
4370                        /* lmsw can't clear PE - catch this here */
4371                        if (cr0 & X86_CR0_PE)
4372                                val |= X86_CR0_PE;
4373                }
4374
4375                if (cr0 ^ val)
4376                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4377
4378                break;
4379        }
4380        case SVM_EXIT_READ_DR0:
4381        case SVM_EXIT_WRITE_DR0:
4382                icpt_info.exit_code += info->modrm_reg;
4383                break;
4384        case SVM_EXIT_MSR:
4385                if (info->intercept == x86_intercept_wrmsr)
4386                        vmcb->control.exit_info_1 = 1;
4387                else
4388                        vmcb->control.exit_info_1 = 0;
4389                break;
4390        case SVM_EXIT_PAUSE:
4391                /*
4392                 * We only get this intercept for NOP, but PAUSE is REP NOP,
4393                 * so check for the REP prefix here.
4394                 */
4395                if (info->rep_prefix != REPE_PREFIX)
4396                        goto out;
                    break;  /* don't fall into the IOIO handling below */
4397        case SVM_EXIT_IOIO: {
4398                u64 exit_info;
4399                u32 bytes;
4400
4401                if (info->intercept == x86_intercept_in ||
4402                    info->intercept == x86_intercept_ins) {
4403                        exit_info = ((info->src_val & 0xffff) << 16) |
4404                                SVM_IOIO_TYPE_MASK;
4405                        bytes = info->dst_bytes;
4406                } else {
4407                        exit_info = (info->dst_val & 0xffff) << 16;
4408                        bytes = info->src_bytes;
4409                }
4410
4411                if (info->intercept == x86_intercept_outs ||
4412                    info->intercept == x86_intercept_ins)
4413                        exit_info |= SVM_IOIO_STR_MASK;
4414
4415                if (info->rep_prefix)
4416                        exit_info |= SVM_IOIO_REP_MASK;
4417
4418                bytes = min(bytes, 4u);
4419
4420                exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4421
4422                exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4423
4424                vmcb->control.exit_info_1 = exit_info;
4425                vmcb->control.exit_info_2 = info->next_rip;
4426
4427                break;
4428        }
4429        default:
4430                break;
4431        }
4432
4433        /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4434        if (static_cpu_has(X86_FEATURE_NRIPS))
4435                vmcb->control.next_rip  = info->next_rip;
4436        vmcb->control.exit_code = icpt_info.exit_code;
4437        vmexit = nested_svm_exit_handled(svm);
4438
4439        ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4440                                           : X86EMUL_CONTINUE;
4441
4442out:
4443        return ret;
4444}
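
Editor's note: for emulated IN/OUT instructions, svm_check_intercept() synthesizes the EXITINFO1 word the hardware would have produced: the port number in bits 31:16 and low-order flags for direction, string and REP forms, and operand/address size. The sketch below builds such a word with locally defined shifts and masks assumed from the SVM IOIO exit-information format; they are not copied from the kernel's SVM_IOIO_* macros.

#include <stdint.h>
#include <stdio.h>

/* Assumed IOIO EXITINFO1 layout (see the SVM spec). */
#define IOIO_TYPE_IN	(1u << 0)	/* 1 = IN, 0 = OUT	*/
#define IOIO_STR	(1u << 2)	/* string instruction	*/
#define IOIO_REP	(1u << 3)	/* REP prefix		*/
#define IOIO_SIZE_SHIFT	4		/* 1/2/4-byte operand	*/
#define IOIO_ASIZE_SHIFT 6		/* 2/4/8-byte address	*/
#define IOIO_PORT_SHIFT	16

static uint32_t build_ioio_exitinfo(uint16_t port, int in, int string,
				    int rep, uint32_t bytes, uint32_t ad_bytes)
{
	uint32_t info = (uint32_t)port << IOIO_PORT_SHIFT;

	if (in)
		info |= IOIO_TYPE_IN;
	if (string)
		info |= IOIO_STR;
	if (rep)
		info |= IOIO_REP;
	info |= bytes << IOIO_SIZE_SHIFT;	/* 1->SZ8, 2->SZ16, 4->SZ32 */
	info |= ad_bytes << IOIO_ASIZE_SHIFT;	/* 2->A16, 4->A32, 8->A64   */
	return info;
}

int main(void)
{
	/* e.g. "rep insb" from port 0x60 with a 32-bit address size */
	printf("exit_info_1 = %#x\n",
	       build_ioio_exitinfo(0x60, 1, 1, 1, 1, 4));
	return 0;
}
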
4445
4446static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
4447{
4448        local_irq_enable();
4449}
4450
4451static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4452{
4453}
4454
4455static struct kvm_x86_ops svm_x86_ops = {
4456        .cpu_has_kvm_support = has_svm,
4457        .disabled_by_bios = is_disabled,
4458        .hardware_setup = svm_hardware_setup,
4459        .hardware_unsetup = svm_hardware_unsetup,
4460        .check_processor_compatibility = svm_check_processor_compat,
4461        .hardware_enable = svm_hardware_enable,
4462        .hardware_disable = svm_hardware_disable,
4463        .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4464        .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
4465
4466        .vcpu_create = svm_create_vcpu,
4467        .vcpu_free = svm_free_vcpu,
4468        .vcpu_reset = svm_vcpu_reset,
4469
4470        .prepare_guest_switch = svm_prepare_guest_switch,
4471        .vcpu_load = svm_vcpu_load,
4472        .vcpu_put = svm_vcpu_put,
4473
4474        .update_db_bp_intercept = update_db_bp_intercept,
4475        .get_msr = svm_get_msr,
4476        .set_msr = svm_set_msr,
4477        .get_segment_base = svm_get_segment_base,
4478        .get_segment = svm_get_segment,
4479        .set_segment = svm_set_segment,
4480        .get_cpl = svm_get_cpl,
4481        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4482        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4483        .decache_cr3 = svm_decache_cr3,
4484        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
4485        .set_cr0 = svm_set_cr0,
4486        .set_cr3 = svm_set_cr3,
4487        .set_cr4 = svm_set_cr4,
4488        .set_efer = svm_set_efer,
4489        .get_idt = svm_get_idt,
4490        .set_idt = svm_set_idt,
4491        .get_gdt = svm_get_gdt,
4492        .set_gdt = svm_set_gdt,
4493        .get_dr6 = svm_get_dr6,
4494        .set_dr6 = svm_set_dr6,
4495        .set_dr7 = svm_set_dr7,
4496        .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4497        .cache_reg = svm_cache_reg,
4498        .get_rflags = svm_get_rflags,
4499        .set_rflags = svm_set_rflags,
4500        .fpu_activate = svm_fpu_activate,
4501        .fpu_deactivate = svm_fpu_deactivate,
4502
4503        .tlb_flush = svm_flush_tlb,
4504
4505        .run = svm_vcpu_run,
4506        .handle_exit = handle_exit,
4507        .skip_emulated_instruction = skip_emulated_instruction,
4508        .set_interrupt_shadow = svm_set_interrupt_shadow,
4509        .get_interrupt_shadow = svm_get_interrupt_shadow,
4510        .patch_hypercall = svm_patch_hypercall,
4511        .set_irq = svm_set_irq,
4512        .set_nmi = svm_inject_nmi,
4513        .queue_exception = svm_queue_exception,
4514        .cancel_injection = svm_cancel_injection,
4515        .interrupt_allowed = svm_interrupt_allowed,
4516        .nmi_allowed = svm_nmi_allowed,
4517        .get_nmi_mask = svm_get_nmi_mask,
4518        .set_nmi_mask = svm_set_nmi_mask,
4519        .enable_nmi_window = enable_nmi_window,
4520        .enable_irq_window = enable_irq_window,
4521        .update_cr8_intercept = update_cr8_intercept,
4522        .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
4523        .vm_has_apicv = svm_vm_has_apicv,
4524        .load_eoi_exitmap = svm_load_eoi_exitmap,
4525        .sync_pir_to_irr = svm_sync_pir_to_irr,
4526
4527        .set_tss_addr = svm_set_tss_addr,
4528        .get_tdp_level = get_npt_level,
4529        .get_mt_mask = svm_get_mt_mask,
4530
4531        .get_exit_info = svm_get_exit_info,
4532
4533        .get_lpage_level = svm_get_lpage_level,
4534
4535        .cpuid_update = svm_cpuid_update,
4536
4537        .rdtscp_supported = svm_rdtscp_supported,
4538        .invpcid_supported = svm_invpcid_supported,
4539        .mpx_supported = svm_mpx_supported,
4540        .xsaves_supported = svm_xsaves_supported,
4541
4542        .set_supported_cpuid = svm_set_supported_cpuid,
4543
4544        .has_wbinvd_exit = svm_has_wbinvd_exit,
4545
4546        .set_tsc_khz = svm_set_tsc_khz,
4547        .read_tsc_offset = svm_read_tsc_offset,
4548        .write_tsc_offset = svm_write_tsc_offset,
4549        .adjust_tsc_offset = svm_adjust_tsc_offset,
4550        .compute_tsc_offset = svm_compute_tsc_offset,
4551        .read_l1_tsc = svm_read_l1_tsc,
4552
4553        .set_tdp_cr3 = set_tdp_cr3,
4554
4555        .check_intercept = svm_check_intercept,
4556        .handle_external_intr = svm_handle_external_intr,
4557
4558        .sched_in = svm_sched_in,
4559
4560        .pmu_ops = &amd_pmu_ops,
4561};
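
Editor's note: svm_x86_ops is the SVM backend's instance of the kvm_x86_ops interface, a table of function pointers that the architecture-neutral KVM core invokes without knowing whether VMX or SVM code sits behind it (svm_init() below hands the table to kvm_init()). The general pattern, reduced to a standalone example with made-up operation names:

#include <stdio.h>

/* A made-up, much smaller "ops" interface for illustration. */
struct hv_ops {
	const char *name;
	int  (*hardware_enable)(void);
	void (*run)(int vcpu_id);
};

static int demo_hardware_enable(void) { puts("enabling virtualization"); return 0; }
static void demo_run(int vcpu_id)     { printf("running vcpu %d\n", vcpu_id); }

/* Designated initializers, the same style as the svm_x86_ops table above. */
static struct hv_ops demo_ops = {
	.name            = "demo",
	.hardware_enable = demo_hardware_enable,
	.run             = demo_run,
};

/* The "core" only ever sees the interface, never the backend directly. */
static void core_init(struct hv_ops *ops)
{
	printf("backend: %s\n", ops->name);
	if (ops->hardware_enable() == 0)
		ops->run(0);
}

int main(void)
{
	core_init(&demo_ops);
	return 0;
}
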
4562
4563static int __init svm_init(void)
4564{
4565        return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
4566                        __alignof__(struct vcpu_svm), THIS_MODULE);
4567}
4568
4569static void __exit svm_exit(void)
4570{
4571        kvm_exit();
4572}
4573
4574module_init(svm_init)
4575module_exit(svm_exit)
4576