linux/arch/x86/kvm/svm.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * AMD SVM support
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   8 *
   9 * Authors:
  10 *   Yaniv Kamay  <yaniv@qumranet.com>
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17
  18#define pr_fmt(fmt) "SVM: " fmt
  19
  20#include <linux/kvm_host.h>
  21
  22#include "irq.h"
  23#include "mmu.h"
  24#include "kvm_cache_regs.h"
  25#include "x86.h"
  26#include "cpuid.h"
  27#include "pmu.h"
  28
  29#include <linux/module.h>
  30#include <linux/mod_devicetable.h>
  31#include <linux/kernel.h>
  32#include <linux/vmalloc.h>
  33#include <linux/highmem.h>
  34#include <linux/sched.h>
  35#include <linux/trace_events.h>
  36#include <linux/slab.h>
  37#include <linux/amd-iommu.h>
  38#include <linux/hashtable.h>
  39
  40#include <asm/apic.h>
  41#include <asm/perf_event.h>
  42#include <asm/tlbflush.h>
  43#include <asm/desc.h>
  44#include <asm/debugreg.h>
  45#include <asm/kvm_para.h>
  46#include <asm/irq_remapping.h>
  47
  48#include <asm/virtext.h>
  49#include "trace.h"
  50
  51#define __ex(x) __kvm_handle_fault_on_reboot(x)
  52
  53MODULE_AUTHOR("Qumranet");
  54MODULE_LICENSE("GPL");
  55
  56static const struct x86_cpu_id svm_cpu_id[] = {
  57        X86_FEATURE_MATCH(X86_FEATURE_SVM),
  58        {}
  59};
  60MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  61
  62#define IOPM_ALLOC_ORDER 2
  63#define MSRPM_ALLOC_ORDER 1
  64
  65#define SEG_TYPE_LDT 2
  66#define SEG_TYPE_BUSY_TSS16 3
  67
  68#define SVM_FEATURE_NPT            (1 <<  0)
  69#define SVM_FEATURE_LBRV           (1 <<  1)
  70#define SVM_FEATURE_SVML           (1 <<  2)
  71#define SVM_FEATURE_NRIP           (1 <<  3)
  72#define SVM_FEATURE_TSC_RATE       (1 <<  4)
  73#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
  74#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
  75#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  76#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  77
  78#define SVM_AVIC_DOORBELL       0xc001011b
  79
  80#define NESTED_EXIT_HOST        0       /* Exit handled on host level */
  81#define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
  82#define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
  83
  84#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
  85
  86#define TSC_RATIO_RSVD          0xffffff0000000000ULL
  87#define TSC_RATIO_MIN           0x0000000000000001ULL
  88#define TSC_RATIO_MAX           0x000000ffffffffffULL
  89
  90#define AVIC_HPA_MASK   ~((0xFFFULL << 52) | 0xFFF)
  91
  92/*
  93 * 0xff is broadcast, so the max index allowed for physical APIC ID
  94 * table is 0xfe.  APIC IDs above 0xff are reserved.
  95 */
  96#define AVIC_MAX_PHYSICAL_ID_COUNT      255
  97
  98#define AVIC_UNACCEL_ACCESS_WRITE_MASK          1
  99#define AVIC_UNACCEL_ACCESS_OFFSET_MASK         0xFF0
 100#define AVIC_UNACCEL_ACCESS_VECTOR_MASK         0xFFFFFFFF
 101
 102/* AVIC GATAG is encoded using VM and VCPU IDs */
 103#define AVIC_VCPU_ID_BITS               8
 104#define AVIC_VCPU_ID_MASK               ((1 << AVIC_VCPU_ID_BITS) - 1)
 105
 106#define AVIC_VM_ID_BITS                 24
 107#define AVIC_VM_ID_NR                   (1 << AVIC_VM_ID_BITS)
 108#define AVIC_VM_ID_MASK                 ((1 << AVIC_VM_ID_BITS) - 1)
 109
 110#define AVIC_GATAG(x, y)                (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
 111                                                (y & AVIC_VCPU_ID_MASK))
 112#define AVIC_GATAG_TO_VMID(x)           ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
 113#define AVIC_GATAG_TO_VCPUID(x)         (x & AVIC_VCPU_ID_MASK)
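    /*
     * A worked example of the encoding above: AVIC_GATAG(0x123, 0x4)
     * yields (0x123 << 8) | 0x4 = 0x12304, and the two decode macros
     * recover AVIC_GATAG_TO_VMID(0x12304) == 0x123 and
     * AVIC_GATAG_TO_VCPUID(0x12304) == 0x4.
     */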
 114
 115static bool erratum_383_found __read_mostly;
 116
 117static const u32 host_save_user_msrs[] = {
 118#ifdef CONFIG_X86_64
 119        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
 120        MSR_FS_BASE,
 121#endif
 122        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 123        MSR_TSC_AUX,
 124};
 125
 126#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
 127
 128struct kvm_vcpu;
 129
 130struct nested_state {
 131        struct vmcb *hsave;
 132        u64 hsave_msr;
 133        u64 vm_cr_msr;
 134        u64 vmcb;
 135
 136        /* These are the merged vectors */
 137        u32 *msrpm;
 138
 139        /* gpa pointers to the real vectors */
 140        u64 vmcb_msrpm;
 141        u64 vmcb_iopm;
 142
 143        /* A VMEXIT is required but not yet emulated */
 144        bool exit_required;
 145
 146        /* cache for intercepts of the guest */
 147        u32 intercept_cr;
 148        u32 intercept_dr;
 149        u32 intercept_exceptions;
 150        u64 intercept;
 151
 152        /* Nested Paging related state */
 153        u64 nested_cr3;
 154};
 155
 156#define MSRPM_OFFSETS   16
 157static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 158
 159/*
 160 * Set osvw_len to a higher value when updated Revision Guides
 161 * are published and we know what the new status bits are
 162 */
 163static uint64_t osvw_len = 4, osvw_status;
 164
 165struct vcpu_svm {
 166        struct kvm_vcpu vcpu;
 167        struct vmcb *vmcb;
 168        unsigned long vmcb_pa;
 169        struct svm_cpu_data *svm_data;
 170        uint64_t asid_generation;
 171        uint64_t sysenter_esp;
 172        uint64_t sysenter_eip;
 173        uint64_t tsc_aux;
 174
 175        u64 next_rip;
 176
 177        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 178        struct {
 179                u16 fs;
 180                u16 gs;
 181                u16 ldt;
 182                u64 gs_base;
 183        } host;
 184
 185        u32 *msrpm;
 186
 187        ulong nmi_iret_rip;
 188
 189        struct nested_state nested;
 190
 191        bool nmi_singlestep;
 192
 193        unsigned int3_injected;
 194        unsigned long int3_rip;
 195        u32 apf_reason;
 196
 197        /* cached guest cpuid flags for faster access */
 198        bool nrips_enabled      : 1;
 199
 200        u32 ldr_reg;
 201        struct page *avic_backing_page;
 202        u64 *avic_physical_id_cache;
 203        bool avic_is_running;
 204
 205        /*
 206         * Per-vcpu list of struct amd_svm_iommu_ir:
 207         * This is used mainly to store interrupt remapping information used
 208         * when updating the vcpu affinity. This avoids the need to scan for
 209         * IRTE and try to match ga_tag in the IOMMU driver.
 210         */
 211        struct list_head ir_list;
 212        spinlock_t ir_list_lock;
 213};
 214
 215/*
 216 * This is a wrapper of struct amd_iommu_ir_data.
 217 */
 218struct amd_svm_iommu_ir {
 219        struct list_head node;  /* Used by SVM for per-vcpu ir_list */
 220        void *data;             /* Storing pointer to struct amd_ir_data */
 221};
 222
 223#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK    (0xFF)
 224#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK                (1 << 31)
 225
 226#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK    (0xFFULL)
 227#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK        (0xFFFFFFFFFFULL << 12)
 228#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK          (1ULL << 62)
 229#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK               (1ULL << 63)
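    /*
     * Taken together, the masks above describe the layout of a 64-bit
     * physical APIC ID table entry: bits 0-7 hold the host physical
     * APIC ID, bits 12-51 the host physical address of the vAPIC
     * backing page, bit 62 the is-running flag and bit 63 the valid
     * flag.
     */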
 230
 231static DEFINE_PER_CPU(u64, current_tsc_ratio);
 232#define TSC_RATIO_DEFAULT       0x0100000000ULL
 233
 234#define MSR_INVALID                     0xffffffffU
 235
 236static const struct svm_direct_access_msrs {
 237        u32 index;   /* Index of the MSR */
 238        bool always; /* True if intercept is always on */
 239} direct_access_msrs[] = {
 240        { .index = MSR_STAR,                            .always = true  },
 241        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
 242#ifdef CONFIG_X86_64
 243        { .index = MSR_GS_BASE,                         .always = true  },
 244        { .index = MSR_FS_BASE,                         .always = true  },
 245        { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
 246        { .index = MSR_LSTAR,                           .always = true  },
 247        { .index = MSR_CSTAR,                           .always = true  },
 248        { .index = MSR_SYSCALL_MASK,                    .always = true  },
 249#endif
 250        { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 251        { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 252        { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 253        { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 254        { .index = MSR_INVALID,                         .always = false },
 255};
 256
 257/* enable NPT for AMD64 and X86 with PAE */
 258#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 259static bool npt_enabled = true;
 260#else
 261static bool npt_enabled;
 262#endif
 263
 264/* allow nested paging (virtualized MMU) for all guests */
 265static int npt = true;
 266module_param(npt, int, S_IRUGO);
 267
 268/* allow nested virtualization in KVM/SVM */
 269static int nested = true;
 270module_param(nested, int, S_IRUGO);
 271
 272/* enable / disable AVIC */
 273static int avic;
 274#ifdef CONFIG_X86_LOCAL_APIC
 275module_param(avic, int, S_IRUGO);
 276#endif
 277
 278/* AVIC VM ID bit masks and lock */
 279static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
 280static DEFINE_SPINLOCK(avic_vm_id_lock);
 281
 282static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 283static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 284static void svm_complete_interrupts(struct vcpu_svm *svm);
 285
 286static int nested_svm_exit_handled(struct vcpu_svm *svm);
 287static int nested_svm_intercept(struct vcpu_svm *svm);
 288static int nested_svm_vmexit(struct vcpu_svm *svm);
 289static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 290                                      bool has_error_code, u32 error_code);
 291
 292enum {
 293        VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 294                            pause filter count */
 295        VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 296        VMCB_ASID,       /* ASID */
 297        VMCB_INTR,       /* int_ctl, int_vector */
 298        VMCB_NPT,        /* npt_en, nCR3, gPAT */
 299        VMCB_CR,         /* CR0, CR3, CR4, EFER */
 300        VMCB_DR,         /* DR6, DR7 */
 301        VMCB_DT,         /* GDT, IDT */
 302        VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 303        VMCB_CR2,        /* CR2 only */
 304        VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
 305        VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
 306                          * AVIC PHYSICAL_TABLE pointer,
 307                          * AVIC LOGICAL_TABLE pointer
 308                          */
 309        VMCB_DIRTY_MAX,
 310};
 311
 312/* TPR and CR2 are always written before VMRUN */
 313#define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
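    /*
     * A set bit in vmcb->control.clean tells the CPU that the
     * corresponding group of VMCB fields is unchanged and may be used
     * from its internal cache, so every software update of such a
     * field is paired with mark_dirty(); e.g. a CR3 change in the save
     * area goes together with mark_dirty(vmcb, VMCB_CR).
     */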
 314
 315#define VMCB_AVIC_APIC_BAR_MASK         0xFFFFFFFFFF000ULL
 316
 317static inline void mark_all_dirty(struct vmcb *vmcb)
 318{
 319        vmcb->control.clean = 0;
 320}
 321
 322static inline void mark_all_clean(struct vmcb *vmcb)
 323{
 324        vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
 325                               & ~VMCB_ALWAYS_DIRTY_MASK;
 326}
 327
 328static inline void mark_dirty(struct vmcb *vmcb, int bit)
 329{
 330        vmcb->control.clean &= ~(1 << bit);
 331}
 332
 333static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 334{
 335        return container_of(vcpu, struct vcpu_svm, vcpu);
 336}
 337
 338static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
 339{
 340        svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
 341        mark_dirty(svm->vmcb, VMCB_AVIC);
 342}
 343
 344static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
 345{
 346        struct vcpu_svm *svm = to_svm(vcpu);
 347        u64 *entry = svm->avic_physical_id_cache;
 348
 349        if (!entry)
 350                return false;
 351
 352        return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
 353}
 354
 355static void recalc_intercepts(struct vcpu_svm *svm)
 356{
 357        struct vmcb_control_area *c, *h;
 358        struct nested_state *g;
 359
 360        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 361
 362        if (!is_guest_mode(&svm->vcpu))
 363                return;
 364
 365        c = &svm->vmcb->control;
 366        h = &svm->nested.hsave->control;
 367        g = &svm->nested;
 368
 369        c->intercept_cr = h->intercept_cr | g->intercept_cr;
 370        c->intercept_dr = h->intercept_dr | g->intercept_dr;
 371        c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 372        c->intercept = h->intercept | g->intercept;
 373}
 374
 375static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
 376{
 377        if (is_guest_mode(&svm->vcpu))
 378                return svm->nested.hsave;
 379        else
 380                return svm->vmcb;
 381}
 382
 383static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
 384{
 385        struct vmcb *vmcb = get_host_vmcb(svm);
 386
 387        vmcb->control.intercept_cr |= (1U << bit);
 388
 389        recalc_intercepts(svm);
 390}
 391
 392static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
 393{
 394        struct vmcb *vmcb = get_host_vmcb(svm);
 395
 396        vmcb->control.intercept_cr &= ~(1U << bit);
 397
 398        recalc_intercepts(svm);
 399}
 400
 401static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 402{
 403        struct vmcb *vmcb = get_host_vmcb(svm);
 404
 405        return vmcb->control.intercept_cr & (1U << bit);
 406}
 407
 408static inline void set_dr_intercepts(struct vcpu_svm *svm)
 409{
 410        struct vmcb *vmcb = get_host_vmcb(svm);
 411
 412        vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
 413                | (1 << INTERCEPT_DR1_READ)
 414                | (1 << INTERCEPT_DR2_READ)
 415                | (1 << INTERCEPT_DR3_READ)
 416                | (1 << INTERCEPT_DR4_READ)
 417                | (1 << INTERCEPT_DR5_READ)
 418                | (1 << INTERCEPT_DR6_READ)
 419                | (1 << INTERCEPT_DR7_READ)
 420                | (1 << INTERCEPT_DR0_WRITE)
 421                | (1 << INTERCEPT_DR1_WRITE)
 422                | (1 << INTERCEPT_DR2_WRITE)
 423                | (1 << INTERCEPT_DR3_WRITE)
 424                | (1 << INTERCEPT_DR4_WRITE)
 425                | (1 << INTERCEPT_DR5_WRITE)
 426                | (1 << INTERCEPT_DR6_WRITE)
 427                | (1 << INTERCEPT_DR7_WRITE);
 428
 429        recalc_intercepts(svm);
 430}
 431
 432static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 433{
 434        struct vmcb *vmcb = get_host_vmcb(svm);
 435
 436        vmcb->control.intercept_dr = 0;
 437
 438        recalc_intercepts(svm);
 439}
 440
 441static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
 442{
 443        struct vmcb *vmcb = get_host_vmcb(svm);
 444
 445        vmcb->control.intercept_exceptions |= (1U << bit);
 446
 447        recalc_intercepts(svm);
 448}
 449
 450static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 451{
 452        struct vmcb *vmcb = get_host_vmcb(svm);
 453
 454        vmcb->control.intercept_exceptions &= ~(1U << bit);
 455
 456        recalc_intercepts(svm);
 457}
 458
 459static inline void set_intercept(struct vcpu_svm *svm, int bit)
 460{
 461        struct vmcb *vmcb = get_host_vmcb(svm);
 462
 463        vmcb->control.intercept |= (1ULL << bit);
 464
 465        recalc_intercepts(svm);
 466}
 467
 468static inline void clr_intercept(struct vcpu_svm *svm, int bit)
 469{
 470        struct vmcb *vmcb = get_host_vmcb(svm);
 471
 472        vmcb->control.intercept &= ~(1ULL << bit);
 473
 474        recalc_intercepts(svm);
 475}
 476
 477static inline void enable_gif(struct vcpu_svm *svm)
 478{
 479        svm->vcpu.arch.hflags |= HF_GIF_MASK;
 480}
 481
 482static inline void disable_gif(struct vcpu_svm *svm)
 483{
 484        svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
 485}
 486
 487static inline bool gif_set(struct vcpu_svm *svm)
 488{
 489        return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 490}
 491
 492static unsigned long iopm_base;
 493
 494struct kvm_ldttss_desc {
 495        u16 limit0;
 496        u16 base0;
 497        unsigned base1:8, type:5, dpl:2, p:1;
 498        unsigned limit1:4, zero0:3, g:1, base2:8;
 499        u32 base3;
 500        u32 zero1;
 501} __attribute__((packed));
 502
 503struct svm_cpu_data {
 504        int cpu;
 505
 506        u64 asid_generation;
 507        u32 max_asid;
 508        u32 next_asid;
 509        struct kvm_ldttss_desc *tss_desc;
 510
 511        struct page *save_area;
 512};
 513
 514static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 515
 516struct svm_init_data {
 517        int cpu;
 518        int r;
 519};
 520
 521static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 522
 523#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 524#define MSRS_RANGE_SIZE 2048
 525#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 526
 527static u32 svm_msrpm_offset(u32 msr)
 528{
 529        u32 offset;
 530        int i;
 531
 532        for (i = 0; i < NUM_MSR_MAPS; i++) {
 533                if (msr < msrpm_ranges[i] ||
 534                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 535                        continue;
 536
 537                offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 538                offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 539
 540                /* Now we have the u8 offset - but need the u32 offset */
 541                return offset / 4;
 542        }
 543
 544        /* MSR not in any range */
 545        return MSR_INVALID;
 546}
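    /*
     * Worked example: MSR_STAR (0xc0000081) falls in the second range
     * (base 0xc0000000), so the byte offset is 0x81 / 4 = 32, plus one
     * preceding range of 2048 bytes = 2080, which divided by 4 gives
     * u32 offset 520 into the MSR permission map.
     */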
 547
 548#define MAX_INST_SIZE 15
 549
 550static inline void clgi(void)
 551{
 552        asm volatile (__ex(SVM_CLGI));
 553}
 554
 555static inline void stgi(void)
 556{
 557        asm volatile (__ex(SVM_STGI));
 558}
 559
 560static inline void invlpga(unsigned long addr, u32 asid)
 561{
 562        asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 563}
 564
 565static int get_npt_level(void)
 566{
 567#ifdef CONFIG_X86_64
 568        return PT64_ROOT_LEVEL;
 569#else
 570        return PT32E_ROOT_LEVEL;
 571#endif
 572}
 573
 574static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 575{
 576        vcpu->arch.efer = efer;
 577        if (!npt_enabled && !(efer & EFER_LMA))
 578                efer &= ~EFER_LME;
 579
 580        to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
 581        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 582}
 583
 584static int is_external_interrupt(u32 info)
 585{
 586        info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 587        return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 588}
 589
 590static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 591{
 592        struct vcpu_svm *svm = to_svm(vcpu);
 593        u32 ret = 0;
 594
 595        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 596                ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 597        return ret;
 598}
 599
 600static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 601{
 602        struct vcpu_svm *svm = to_svm(vcpu);
 603
 604        if (mask == 0)
 605                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 606        else
 607                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 608
 609}
 610
 611static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 612{
 613        struct vcpu_svm *svm = to_svm(vcpu);
 614
 615        if (svm->vmcb->control.next_rip != 0) {
 616                WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 617                svm->next_rip = svm->vmcb->control.next_rip;
 618        }
 619
 620        if (!svm->next_rip) {
 621                if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
 622                                EMULATE_DONE)
 623                        printk(KERN_DEBUG "%s: NOP\n", __func__);
 624                return;
 625        }
 626        if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
 627                printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
 628                       __func__, kvm_rip_read(vcpu), svm->next_rip);
 629
 630        kvm_rip_write(vcpu, svm->next_rip);
 631        svm_set_interrupt_shadow(vcpu, 0);
 632}
 633
 634static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 635                                bool has_error_code, u32 error_code,
 636                                bool reinject)
 637{
 638        struct vcpu_svm *svm = to_svm(vcpu);
 639
 640        /*
 641         * If we are within a nested VM we'd better #VMEXIT and let the guest
 642         * handle the exception
 643         */
 644        if (!reinject &&
 645            nested_svm_check_exception(svm, nr, has_error_code, error_code))
 646                return;
 647
 648        if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 649                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 650
 651                /*
 652                 * For guest debugging where we have to reinject #BP if some
 653                 * INT3 is guest-owned:
 654                 * Emulate nRIP by moving RIP forward. Will fail if injection
 655                 * raises a fault that is not intercepted. Still better than
 656                 * failing in all cases.
 657                 */
 658                skip_emulated_instruction(&svm->vcpu);
 659                rip = kvm_rip_read(&svm->vcpu);
 660                svm->int3_rip = rip + svm->vmcb->save.cs.base;
 661                svm->int3_injected = rip - old_rip;
 662        }
 663
 664        svm->vmcb->control.event_inj = nr
 665                | SVM_EVTINJ_VALID
 666                | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 667                | SVM_EVTINJ_TYPE_EXEPT;
 668        svm->vmcb->control.event_inj_err = error_code;
 669}
 670
 671static void svm_init_erratum_383(void)
 672{
 673        u32 low, high;
 674        int err;
 675        u64 val;
 676
 677        if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 678                return;
 679
 680        /* Use _safe variants to not break nested virtualization */
 681        val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 682        if (err)
 683                return;
 684
 685        val |= (1ULL << 47);
 686
 687        low  = lower_32_bits(val);
 688        high = upper_32_bits(val);
 689
 690        native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 691
 692        erratum_383_found = true;
 693}
 694
 695static void svm_init_osvw(struct kvm_vcpu *vcpu)
 696{
 697        /*
 698         * Guests should see errata 400 and 415 as fixed (assuming that
 699         * HLT and IO instructions are intercepted).
 700         */
 701        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 702        vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 703
 704        /*
 705         * By increasing VCPU's osvw.length to 3 we are telling the guest that
 706         * all osvw.status bits inside that length, including bit 0 (which is
 707         * reserved for erratum 298), are valid. However, if host processor's
 708         * osvw_len is 0 then osvw_status[0] carries no information. We need to
 709         * be conservative here and therefore we tell the guest that erratum 298
 710         * is present (because we really don't know).
 711         */
 712        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 713                vcpu->arch.osvw.status |= 1;
 714}
 715
 716static int has_svm(void)
 717{
 718        const char *msg;
 719
 720        if (!cpu_has_svm(&msg)) {
 721                printk(KERN_INFO "has_svm: %s\n", msg);
 722                return 0;
 723        }
 724
 725        return 1;
 726}
 727
 728static void svm_hardware_disable(void)
 729{
 730        /* Make sure we clean up behind us */
 731        if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
 732                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 733
 734        cpu_svm_disable();
 735
 736        amd_pmu_disable_virt();
 737}
 738
 739static int svm_hardware_enable(void)
 740{
 741
 742        struct svm_cpu_data *sd;
 743        uint64_t efer;
 744        struct desc_struct *gdt;
 745        int me = raw_smp_processor_id();
 746
 747        rdmsrl(MSR_EFER, efer);
 748        if (efer & EFER_SVME)
 749                return -EBUSY;
 750
 751        if (!has_svm()) {
 752                pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 753                return -EINVAL;
 754        }
 755        sd = per_cpu(svm_data, me);
 756        if (!sd) {
 757                pr_err("%s: svm_data is NULL on %d\n", __func__, me);
 758                return -EINVAL;
 759        }
 760
 761        sd->asid_generation = 1;
 762        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 763        sd->next_asid = sd->max_asid + 1;
 764
 765        gdt = get_current_gdt_rw();
 766        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 767
 768        wrmsrl(MSR_EFER, efer | EFER_SVME);
 769
 770        wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 771
 772        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 773                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 774                __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
 775        }
 776
 777
 778        /*
 779         * Get OSVW bits.
 780         *
 781         * Note that it is possible to have a system with mixed processor
 782         * revisions and therefore different OSVW bits. If bits are not the same
 783         * on different processors then choose the worst case (i.e. if erratum
 784         * is present on one processor and not on another then assume that the
 785         * erratum is present everywhere).
 786         */
 787        if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 788                uint64_t len, status = 0;
 789                int err;
 790
 791                len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 792                if (!err)
 793                        status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 794                                                      &err);
 795
 796                if (err)
 797                        osvw_status = osvw_len = 0;
 798                else {
 799                        if (len < osvw_len)
 800                                osvw_len = len;
 801                        osvw_status |= status;
 802                        osvw_status &= (1ULL << osvw_len) - 1;
 803                }
 804        } else
 805                osvw_status = osvw_len = 0;
 806
 807        svm_init_erratum_383();
 808
 809        amd_pmu_enable_virt();
 810
 811        return 0;
 812}
 813
 814static void svm_cpu_uninit(int cpu)
 815{
 816        struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
 817
 818        if (!sd)
 819                return;
 820
 821        per_cpu(svm_data, raw_smp_processor_id()) = NULL;
 822        __free_page(sd->save_area);
 823        kfree(sd);
 824}
 825
 826static int svm_cpu_init(int cpu)
 827{
 828        struct svm_cpu_data *sd;
 829        int r;
 830
 831        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 832        if (!sd)
 833                return -ENOMEM;
 834        sd->cpu = cpu;
 835        sd->save_area = alloc_page(GFP_KERNEL);
 836        r = -ENOMEM;
 837        if (!sd->save_area)
 838                goto err_1;
 839
 840        per_cpu(svm_data, cpu) = sd;
 841
 842        return 0;
 843
 844err_1:
 845        kfree(sd);
 846        return r;
 847
 848}
 849
 850static bool valid_msr_intercept(u32 index)
 851{
 852        int i;
 853
 854        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 855                if (direct_access_msrs[i].index == index)
 856                        return true;
 857
 858        return false;
 859}
 860
 861static void set_msr_interception(u32 *msrpm, unsigned msr,
 862                                 int read, int write)
 863{
 864        u8 bit_read, bit_write;
 865        unsigned long tmp;
 866        u32 offset;
 867
 868        /*
 869 * If this warning triggers, extend the direct_access_msrs list at the
 870         * beginning of the file
 871         */
 872        WARN_ON(!valid_msr_intercept(msr));
 873
 874        offset    = svm_msrpm_offset(msr);
 875        bit_read  = 2 * (msr & 0x0f);
 876        bit_write = 2 * (msr & 0x0f) + 1;
 877        tmp       = msrpm[offset];
 878
 879        BUG_ON(offset == MSR_INVALID);
 880
 881        read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 882        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 883
 884        msrpm[offset] = tmp;
 885}
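    /*
     * Each MSR is covered by a read/write bit pair in the permission
     * map and a clear bit means the access does not intercept; e.g.
     * set_msr_interception(msrpm, MSR_STAR, 1, 1) clears both bits so
     * the guest reads and writes MSR_STAR directly, while passing 0, 0
     * sets them again and re-enables the intercept.
     */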
 886
 887static void svm_vcpu_init_msrpm(u32 *msrpm)
 888{
 889        int i;
 890
 891        memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 892
 893        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 894                if (!direct_access_msrs[i].always)
 895                        continue;
 896
 897                set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
 898        }
 899}
 900
 901static void add_msr_offset(u32 offset)
 902{
 903        int i;
 904
 905        for (i = 0; i < MSRPM_OFFSETS; ++i) {
 906
 907                /* Offset already in list? */
 908                if (msrpm_offsets[i] == offset)
 909                        return;
 910
 911                /* Slot used by another offset? */
 912                if (msrpm_offsets[i] != MSR_INVALID)
 913                        continue;
 914
 915                /* Add offset to list */
 916                msrpm_offsets[i] = offset;
 917
 918                return;
 919        }
 920
 921        /*
 922 * If this BUG triggers, the msrpm_offsets table has an overflow. Just
 923         * increase MSRPM_OFFSETS in this case.
 924         */
 925        BUG();
 926}
 927
 928static void init_msrpm_offsets(void)
 929{
 930        int i;
 931
 932        memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 933
 934        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 935                u32 offset;
 936
 937                offset = svm_msrpm_offset(direct_access_msrs[i].index);
 938                BUG_ON(offset == MSR_INVALID);
 939
 940                add_msr_offset(offset);
 941        }
 942}
 943
 944static void svm_enable_lbrv(struct vcpu_svm *svm)
 945{
 946        u32 *msrpm = svm->msrpm;
 947
 948        svm->vmcb->control.lbr_ctl = 1;
 949        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 950        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 951        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 952        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 953}
 954
 955static void svm_disable_lbrv(struct vcpu_svm *svm)
 956{
 957        u32 *msrpm = svm->msrpm;
 958
 959        svm->vmcb->control.lbr_ctl = 0;
 960        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 961        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 962        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 963        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 964}
 965
 966/* Note:
 967 * This hash table is used to map a VM_ID to a struct kvm_arch
 968 * when handling an AMD IOMMU GALOG notification to schedule in
 969 * a particular vCPU.
 970 */
 971#define SVM_VM_DATA_HASH_BITS   8
 972static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
 973static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
 974
 975/* Note:
 976 * This function is called from the IOMMU driver to notify
 977 * SVM to schedule in a particular vCPU of a particular VM.
 978 */
 979static int avic_ga_log_notifier(u32 ga_tag)
 980{
 981        unsigned long flags;
 982        struct kvm_arch *ka = NULL;
 983        struct kvm_vcpu *vcpu = NULL;
 984        u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
 985        u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
 986
 987        pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
 988
 989        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
 990        hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
 991                struct kvm *kvm = container_of(ka, struct kvm, arch);
 992                struct kvm_arch *vm_data = &kvm->arch;
 993
 994                if (vm_data->avic_vm_id != vm_id)
 995                        continue;
 996                vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
 997                break;
 998        }
 999        spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1000
1001        if (!vcpu)
1002                return 0;
1003
1004        /* Note:
1005         * At this point, the IOMMU should have already set the pending
1006         * bit in the vAPIC backing page. So, we just need to schedule
1007         * in the vcpu.
1008         */
1009        if (vcpu->mode == OUTSIDE_GUEST_MODE)
1010                kvm_vcpu_wake_up(vcpu);
1011
1012        return 0;
1013}
1014
1015static __init int svm_hardware_setup(void)
1016{
1017        int cpu;
1018        struct page *iopm_pages;
1019        void *iopm_va;
1020        int r;
1021
1022        iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
1023
1024        if (!iopm_pages)
1025                return -ENOMEM;
1026
1027        iopm_va = page_address(iopm_pages);
1028        memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
1029        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
1030
1031        init_msrpm_offsets();
1032
1033        if (boot_cpu_has(X86_FEATURE_NX))
1034                kvm_enable_efer_bits(EFER_NX);
1035
1036        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
1037                kvm_enable_efer_bits(EFER_FFXSR);
1038
1039        if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1040                kvm_has_tsc_control = true;
1041                kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
1042                kvm_tsc_scaling_ratio_frac_bits = 32;
1043        }
1044
1045        if (nested) {
1046                printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
1047                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
1048        }
1049
1050        for_each_possible_cpu(cpu) {
1051                r = svm_cpu_init(cpu);
1052                if (r)
1053                        goto err;
1054        }
1055
1056        if (!boot_cpu_has(X86_FEATURE_NPT))
1057                npt_enabled = false;
1058
1059        if (npt_enabled && !npt) {
1060                printk(KERN_INFO "kvm: Nested Paging disabled\n");
1061                npt_enabled = false;
1062        }
1063
1064        if (npt_enabled) {
1065                printk(KERN_INFO "kvm: Nested Paging enabled\n");
1066                kvm_enable_tdp();
1067        } else
1068                kvm_disable_tdp();
1069
1070        if (avic) {
1071                if (!npt_enabled ||
1072                    !boot_cpu_has(X86_FEATURE_AVIC) ||
1073                    !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
1074                        avic = false;
1075                } else {
1076                        pr_info("AVIC enabled\n");
1077
1078                        amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1079                }
1080        }
1081
1082        return 0;
1083
1084err:
1085        __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
1086        iopm_base = 0;
1087        return r;
1088}
1089
1090static __exit void svm_hardware_unsetup(void)
1091{
1092        int cpu;
1093
1094        for_each_possible_cpu(cpu)
1095                svm_cpu_uninit(cpu);
1096
1097        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
1098        iopm_base = 0;
1099}
1100
1101static void init_seg(struct vmcb_seg *seg)
1102{
1103        seg->selector = 0;
1104        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1105                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1106        seg->limit = 0xffff;
1107        seg->base = 0;
1108}
1109
1110static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1111{
1112        seg->selector = 0;
1113        seg->attrib = SVM_SELECTOR_P_MASK | type;
1114        seg->limit = 0xffff;
1115        seg->base = 0;
1116}
1117
1118static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1119{
1120        struct vcpu_svm *svm = to_svm(vcpu);
1121        u64 g_tsc_offset = 0;
1122
1123        if (is_guest_mode(vcpu)) {
1124                g_tsc_offset = svm->vmcb->control.tsc_offset -
1125                               svm->nested.hsave->control.tsc_offset;
1126                svm->nested.hsave->control.tsc_offset = offset;
1127        } else
1128                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1129                                           svm->vmcb->control.tsc_offset,
1130                                           offset);
1131
1132        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1133
1134        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1135}
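    /*
     * Nested example for the code above: with an L1 offset of 100 in
     * hsave and an active L2 offset of 150, g_tsc_offset is the L1->L2
     * delta of 50; writing a new L1 offset of 200 stores 200 in hsave
     * and 250 in the active VMCB, preserving the delta.
     */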
1136
1137static void avic_init_vmcb(struct vcpu_svm *svm)
1138{
1139        struct vmcb *vmcb = svm->vmcb;
1140        struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
1141        phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
1142        phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
1143        phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
1144
1145        vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1146        vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
1147        vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
1148        vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
1149        vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
1150        svm->vcpu.arch.apicv_active = true;
1151}
1152
1153static void init_vmcb(struct vcpu_svm *svm)
1154{
1155        struct vmcb_control_area *control = &svm->vmcb->control;
1156        struct vmcb_save_area *save = &svm->vmcb->save;
1157
1158        svm->vcpu.arch.hflags = 0;
1159
1160        set_cr_intercept(svm, INTERCEPT_CR0_READ);
1161        set_cr_intercept(svm, INTERCEPT_CR3_READ);
1162        set_cr_intercept(svm, INTERCEPT_CR4_READ);
1163        set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1164        set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1165        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1166        if (!kvm_vcpu_apicv_active(&svm->vcpu))
1167                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1168
1169        set_dr_intercepts(svm);
1170
1171        set_exception_intercept(svm, PF_VECTOR);
1172        set_exception_intercept(svm, UD_VECTOR);
1173        set_exception_intercept(svm, MC_VECTOR);
1174        set_exception_intercept(svm, AC_VECTOR);
1175        set_exception_intercept(svm, DB_VECTOR);
1176
1177        set_intercept(svm, INTERCEPT_INTR);
1178        set_intercept(svm, INTERCEPT_NMI);
1179        set_intercept(svm, INTERCEPT_SMI);
1180        set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1181        set_intercept(svm, INTERCEPT_RDPMC);
1182        set_intercept(svm, INTERCEPT_CPUID);
1183        set_intercept(svm, INTERCEPT_INVD);
1184        set_intercept(svm, INTERCEPT_HLT);
1185        set_intercept(svm, INTERCEPT_INVLPG);
1186        set_intercept(svm, INTERCEPT_INVLPGA);
1187        set_intercept(svm, INTERCEPT_IOIO_PROT);
1188        set_intercept(svm, INTERCEPT_MSR_PROT);
1189        set_intercept(svm, INTERCEPT_TASK_SWITCH);
1190        set_intercept(svm, INTERCEPT_SHUTDOWN);
1191        set_intercept(svm, INTERCEPT_VMRUN);
1192        set_intercept(svm, INTERCEPT_VMMCALL);
1193        set_intercept(svm, INTERCEPT_VMLOAD);
1194        set_intercept(svm, INTERCEPT_VMSAVE);
1195        set_intercept(svm, INTERCEPT_STGI);
1196        set_intercept(svm, INTERCEPT_CLGI);
1197        set_intercept(svm, INTERCEPT_SKINIT);
1198        set_intercept(svm, INTERCEPT_WBINVD);
1199        set_intercept(svm, INTERCEPT_XSETBV);
1200
1201        if (!kvm_mwait_in_guest()) {
1202                set_intercept(svm, INTERCEPT_MONITOR);
1203                set_intercept(svm, INTERCEPT_MWAIT);
1204        }
1205
1206        control->iopm_base_pa = iopm_base;
1207        control->msrpm_base_pa = __pa(svm->msrpm);
1208        control->int_ctl = V_INTR_MASKING_MASK;
1209
1210        init_seg(&save->es);
1211        init_seg(&save->ss);
1212        init_seg(&save->ds);
1213        init_seg(&save->fs);
1214        init_seg(&save->gs);
1215
1216        save->cs.selector = 0xf000;
1217        save->cs.base = 0xffff0000;
1218        /* Executable/Readable Code Segment */
1219        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1220                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1221        save->cs.limit = 0xffff;
1222
1223        save->gdtr.limit = 0xffff;
1224        save->idtr.limit = 0xffff;
1225
1226        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1227        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1228
1229        svm_set_efer(&svm->vcpu, 0);
1230        save->dr6 = 0xffff0ff0;
1231        kvm_set_rflags(&svm->vcpu, 2);
1232        save->rip = 0x0000fff0;
1233        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1234
1235        /*
1236         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1237         * It also updates the guest-visible cr0 value.
1238         */
1239        svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1240        kvm_mmu_reset_context(&svm->vcpu);
1241
1242        save->cr4 = X86_CR4_PAE;
1243        /* rdx = ?? */
1244
1245        if (npt_enabled) {
1246                /* Setup VMCB for Nested Paging */
1247                control->nested_ctl = 1;
1248                clr_intercept(svm, INTERCEPT_INVLPG);
1249                clr_exception_intercept(svm, PF_VECTOR);
1250                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1251                clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1252                save->g_pat = svm->vcpu.arch.pat;
1253                save->cr3 = 0;
1254                save->cr4 = 0;
1255        }
1256        svm->asid_generation = 0;
1257
1258        svm->nested.vmcb = 0;
1259        svm->vcpu.arch.hflags = 0;
1260
1261        if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1262                control->pause_filter_count = 3000;
1263                set_intercept(svm, INTERCEPT_PAUSE);
1264        }
1265
1266        if (avic)
1267                avic_init_vmcb(svm);
1268
1269        mark_all_dirty(svm->vmcb);
1270
1271        enable_gif(svm);
1272
1273}
1274
1275static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
1276                                       unsigned int index)
1277{
1278        u64 *avic_physical_id_table;
1279        struct kvm_arch *vm_data = &vcpu->kvm->arch;
1280
1281        if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
1282                return NULL;
1283
1284        avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
1285
1286        return &avic_physical_id_table[index];
1287}
1288
1289/**
1290 * Note:
1291 * AVIC hardware walks the nested page table to check permissions,
1292 * but does not use the SPA address specified in the leaf page
1293 * table entry since it uses the address in the AVIC_BACKING_PAGE pointer
1294 * field of the VMCB. Therefore, we set up the
1295 * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
1296 */
1297static int avic_init_access_page(struct kvm_vcpu *vcpu)
1298{
1299        struct kvm *kvm = vcpu->kvm;
1300        int ret;
1301
1302        if (kvm->arch.apic_access_page_done)
1303                return 0;
1304
1305        ret = x86_set_memory_region(kvm,
1306                                    APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
1307                                    APIC_DEFAULT_PHYS_BASE,
1308                                    PAGE_SIZE);
1309        if (ret)
1310                return ret;
1311
1312        kvm->arch.apic_access_page_done = true;
1313        return 0;
1314}
1315
1316static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1317{
1318        int ret;
1319        u64 *entry, new_entry;
1320        int id = vcpu->vcpu_id;
1321        struct vcpu_svm *svm = to_svm(vcpu);
1322
1323        ret = avic_init_access_page(vcpu);
1324        if (ret)
1325                return ret;
1326
1327        if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
1328                return -EINVAL;
1329
1330        if (!svm->vcpu.arch.apic->regs)
1331                return -EINVAL;
1332
1333        svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
1334
1335        /* Setting AVIC backing page address in the phy APIC ID table */
1336        entry = avic_get_physical_id_entry(vcpu, id);
1337        if (!entry)
1338                return -EINVAL;
1339
1340        new_entry = READ_ONCE(*entry);
1341        new_entry = (page_to_phys(svm->avic_backing_page) &
1342                     AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
1343                     AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
1344        WRITE_ONCE(*entry, new_entry);
1345
1346        svm->avic_physical_id_cache = entry;
1347
1348        return 0;
1349}
1350
1351static inline int avic_get_next_vm_id(void)
1352{
1353        int id;
1354
1355        spin_lock(&avic_vm_id_lock);
1356
1357        /* AVIC VM ID is one-based. */
1358        id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
1359        if (id <= AVIC_VM_ID_MASK)
1360                __set_bit(id, avic_vm_id_bitmap);
1361        else
1362                id = -EAGAIN;
1363
1364        spin_unlock(&avic_vm_id_lock);
1365        return id;
1366}
1367
1368static inline int avic_free_vm_id(int id)
1369{
1370        if (id <= 0 || id > AVIC_VM_ID_MASK)
1371                return -EINVAL;
1372
1373        spin_lock(&avic_vm_id_lock);
1374        __clear_bit(id, avic_vm_id_bitmap);
1375        spin_unlock(&avic_vm_id_lock);
1376        return 0;
1377}
1378
1379static void avic_vm_destroy(struct kvm *kvm)
1380{
1381        unsigned long flags;
1382        struct kvm_arch *vm_data = &kvm->arch;
1383
1384        if (!avic)
1385                return;
1386
1387        avic_free_vm_id(vm_data->avic_vm_id);
1388
1389        if (vm_data->avic_logical_id_table_page)
1390                __free_page(vm_data->avic_logical_id_table_page);
1391        if (vm_data->avic_physical_id_table_page)
1392                __free_page(vm_data->avic_physical_id_table_page);
1393
1394        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1395        hash_del(&vm_data->hnode);
1396        spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1397}
1398
1399static int avic_vm_init(struct kvm *kvm)
1400{
1401        unsigned long flags;
1402        int vm_id, err = -ENOMEM;
1403        struct kvm_arch *vm_data = &kvm->arch;
1404        struct page *p_page;
1405        struct page *l_page;
1406
1407        if (!avic)
1408                return 0;
1409
1410        vm_id = avic_get_next_vm_id();
1411        if (vm_id < 0)
1412                return vm_id;
1413        vm_data->avic_vm_id = (u32)vm_id;
1414
1415        /* Allocating physical APIC ID table (4KB) */
1416        p_page = alloc_page(GFP_KERNEL);
1417        if (!p_page)
1418                goto free_avic;
1419
1420        vm_data->avic_physical_id_table_page = p_page;
1421        clear_page(page_address(p_page));
1422
1423        /* Allocating logical APIC ID table (4KB) */
1424        l_page = alloc_page(GFP_KERNEL);
1425        if (!l_page)
1426                goto free_avic;
1427
1428        vm_data->avic_logical_id_table_page = l_page;
1429        clear_page(page_address(l_page));
1430
1431        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1432        hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
1433        spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1434
1435        return 0;
1436
1437free_avic:
1438        avic_vm_destroy(kvm);
1439        return err;
1440}
1441
1442static inline int
1443avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
1444{
1445        int ret = 0;
1446        unsigned long flags;
1447        struct amd_svm_iommu_ir *ir;
1448        struct vcpu_svm *svm = to_svm(vcpu);
1449
1450        if (!kvm_arch_has_assigned_device(vcpu->kvm))
1451                return 0;
1452
1453        /*
1454         * Here, we go through the per-vcpu ir_list to update all existing
1455         * interrupt remapping table entry targeting this vcpu.
1456         */
1457        spin_lock_irqsave(&svm->ir_list_lock, flags);
1458
1459        if (list_empty(&svm->ir_list))
1460                goto out;
1461
1462        list_for_each_entry(ir, &svm->ir_list, node) {
1463                ret = amd_iommu_update_ga(cpu, r, ir->data);
1464                if (ret)
1465                        break;
1466        }
1467out:
1468        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1469        return ret;
1470}
1471
1472static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1473{
1474        u64 entry;
1475        /* ID = 0xff (broadcast), ID > 0xff (reserved) */
1476        int h_physical_id = kvm_cpu_get_apicid(cpu);
1477        struct vcpu_svm *svm = to_svm(vcpu);
1478
1479        if (!kvm_vcpu_apicv_active(vcpu))
1480                return;
1481
1482        if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
1483                return;
1484
1485        entry = READ_ONCE(*(svm->avic_physical_id_cache));
1486        WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
1487
1488        entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
1489        entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
1490
1491        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1492        if (svm->avic_is_running)
1493                entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1494
1495        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1496        avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
1497                                        svm->avic_is_running);
1498}
1499
1500static void avic_vcpu_put(struct kvm_vcpu *vcpu)
1501{
1502        u64 entry;
1503        struct vcpu_svm *svm = to_svm(vcpu);
1504
1505        if (!kvm_vcpu_apicv_active(vcpu))
1506                return;
1507
1508        entry = READ_ONCE(*(svm->avic_physical_id_cache));
1509        if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
1510                avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
1511
1512        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1513        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1514}
1515
1516/**
1517 * This function is called during VCPU halt/unhalt.
1518 */
1519static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
1520{
1521        struct vcpu_svm *svm = to_svm(vcpu);
1522
1523        svm->avic_is_running = is_run;
1524        if (is_run)
1525                avic_vcpu_load(vcpu, vcpu->cpu);
1526        else
1527                avic_vcpu_put(vcpu);
1528}
1529
1530static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1531{
1532        struct vcpu_svm *svm = to_svm(vcpu);
1533        u32 dummy;
1534        u32 eax = 1;
1535
1536        if (!init_event) {
1537                svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1538                                           MSR_IA32_APICBASE_ENABLE;
1539                if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1540                        svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1541        }
1542        init_vmcb(svm);
1543
1544        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1545        kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1546
1547        if (kvm_vcpu_apicv_active(vcpu) && !init_event)
1548                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1549}
1550
1551static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1552{
1553        struct vcpu_svm *svm;
1554        struct page *page;
1555        struct page *msrpm_pages;
1556        struct page *hsave_page;
1557        struct page *nested_msrpm_pages;
1558        int err;
1559
1560        svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1561        if (!svm) {
1562                err = -ENOMEM;
1563                goto out;
1564        }
1565
1566        err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1567        if (err)
1568                goto free_svm;
1569
1570        err = -ENOMEM;
1571        page = alloc_page(GFP_KERNEL);
1572        if (!page)
1573                goto uninit;
1574
1575        msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1576        if (!msrpm_pages)
1577                goto free_page1;
1578
1579        nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1580        if (!nested_msrpm_pages)
1581                goto free_page2;
1582
1583        hsave_page = alloc_page(GFP_KERNEL);
1584        if (!hsave_page)
1585                goto free_page3;
1586
1587        if (avic) {
1588                err = avic_init_backing_page(&svm->vcpu);
1589                if (err)
1590                        goto free_page4;
1591
1592                INIT_LIST_HEAD(&svm->ir_list);
1593                spin_lock_init(&svm->ir_list_lock);
1594        }
1595
1596        /* We initialize this flag to true to make sure that the is_running
1597         * bit is set the first time the vcpu is loaded.
1598         */
1599        svm->avic_is_running = true;
1600
1601        svm->nested.hsave = page_address(hsave_page);
1602
1603        svm->msrpm = page_address(msrpm_pages);
1604        svm_vcpu_init_msrpm(svm->msrpm);
1605
1606        svm->nested.msrpm = page_address(nested_msrpm_pages);
1607        svm_vcpu_init_msrpm(svm->nested.msrpm);
1608
1609        svm->vmcb = page_address(page);
1610        clear_page(svm->vmcb);
1611        svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1612        svm->asid_generation = 0;
1613        init_vmcb(svm);
1614
1615        svm_init_osvw(&svm->vcpu);
1616
1617        return &svm->vcpu;
1618
1619free_page4:
1620        __free_page(hsave_page);
1621free_page3:
1622        __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1623free_page2:
1624        __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1625free_page1:
1626        __free_page(page);
1627uninit:
1628        kvm_vcpu_uninit(&svm->vcpu);
1629free_svm:
1630        kmem_cache_free(kvm_vcpu_cache, svm);
1631out:
1632        return ERR_PTR(err);
1633}
1634
1635static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1636{
1637        struct vcpu_svm *svm = to_svm(vcpu);
1638
1639        __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1640        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1641        __free_page(virt_to_page(svm->nested.hsave));
1642        __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1643        kvm_vcpu_uninit(vcpu);
1644        kmem_cache_free(kvm_vcpu_cache, svm);
1645}
1646
1647static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1648{
1649        struct vcpu_svm *svm = to_svm(vcpu);
1650        int i;
1651
1652        if (unlikely(cpu != vcpu->cpu)) {
1653                svm->asid_generation = 0;
1654                mark_all_dirty(svm->vmcb);
1655        }
1656
1657#ifdef CONFIG_X86_64
1658        rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1659#endif
1660        savesegment(fs, svm->host.fs);
1661        savesegment(gs, svm->host.gs);
1662        svm->host.ldt = kvm_read_ldt();
1663
1664        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1665                rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1666
1667        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1668                u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1669                if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1670                        __this_cpu_write(current_tsc_ratio, tsc_ratio);
1671                        wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1672                }
1673        }
1674        /* This assumes that the kernel never uses MSR_TSC_AUX */
1675        if (static_cpu_has(X86_FEATURE_RDTSCP))
1676                wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1677
1678        avic_vcpu_load(vcpu, cpu);
1679}
1680
1681static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1682{
1683        struct vcpu_svm *svm = to_svm(vcpu);
1684        int i;
1685
1686        avic_vcpu_put(vcpu);
1687
1688        ++vcpu->stat.host_state_reload;
1689        kvm_load_ldt(svm->host.ldt);
1690#ifdef CONFIG_X86_64
1691        loadsegment(fs, svm->host.fs);
1692        wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
1693        load_gs_index(svm->host.gs);
1694#else
1695#ifdef CONFIG_X86_32_LAZY_GS
1696        loadsegment(gs, svm->host.gs);
1697#endif
1698#endif
1699        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1700                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1701}
1702
1703static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
1704{
1705        avic_set_running(vcpu, false);
1706}
1707
1708static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
1709{
1710        avic_set_running(vcpu, true);
1711}
1712
1713static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1714{
1715        return to_svm(vcpu)->vmcb->save.rflags;
1716}
1717
1718static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1719{
1720        /*
1721         * Any change of EFLAGS.VM is accompanied by a reload of SS
1722         * (caused by either a task switch or an inter-privilege IRET),
1723         * so we do not need to update the CPL here.
1724         */
1725        to_svm(vcpu)->vmcb->save.rflags = rflags;
1726}
1727
1728static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
1729{
1730        return 0;
1731}
1732
1733static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1734{
1735        switch (reg) {
1736        case VCPU_EXREG_PDPTR:
1737                BUG_ON(!npt_enabled);
1738                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1739                break;
1740        default:
1741                BUG();
1742        }
1743}
1744
1745static void svm_set_vintr(struct vcpu_svm *svm)
1746{
1747        set_intercept(svm, INTERCEPT_VINTR);
1748}
1749
1750static void svm_clear_vintr(struct vcpu_svm *svm)
1751{
1752        clr_intercept(svm, INTERCEPT_VINTR);
1753}
1754
1755static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1756{
1757        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1758
1759        switch (seg) {
1760        case VCPU_SREG_CS: return &save->cs;
1761        case VCPU_SREG_DS: return &save->ds;
1762        case VCPU_SREG_ES: return &save->es;
1763        case VCPU_SREG_FS: return &save->fs;
1764        case VCPU_SREG_GS: return &save->gs;
1765        case VCPU_SREG_SS: return &save->ss;
1766        case VCPU_SREG_TR: return &save->tr;
1767        case VCPU_SREG_LDTR: return &save->ldtr;
1768        }
1769        BUG();
1770        return NULL;
1771}
1772
1773static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1774{
1775        struct vmcb_seg *s = svm_seg(vcpu, seg);
1776
1777        return s->base;
1778}
1779
1780static void svm_get_segment(struct kvm_vcpu *vcpu,
1781                            struct kvm_segment *var, int seg)
1782{
1783        struct vmcb_seg *s = svm_seg(vcpu, seg);
1784
1785        var->base = s->base;
1786        var->limit = s->limit;
1787        var->selector = s->selector;
1788        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1789        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1790        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1791        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1792        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1793        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1794        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1795
1796        /*
1797         * AMD CPUs circa 2014 track the G bit for all segments except CS.
1798         * However, the SVM spec states that the G bit is not observed by the
1799         * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1800         * So let's synthesize a legal G bit for all segments, this helps
1801         * running KVM nested. It also helps cross-vendor migration, because
1802         * Intel's vmentry has a check on the 'G' bit.
1803         */
1804        var->g = s->limit > 0xfffff;
1805
1806        /*
1807         * AMD's VMCB does not have an explicit unusable field, so emulate it
1808         * for cross-vendor migration purposes by deriving it from the present bit.
1809         */
1810        var->unusable = !var->present;
1811
1812        switch (seg) {
1813        case VCPU_SREG_TR:
1814                /*
1815                 * Work around a bug where the busy flag in the tr selector
1816                 * isn't exposed
1817                 */
1818                var->type |= 0x2;
1819                break;
1820        case VCPU_SREG_DS:
1821        case VCPU_SREG_ES:
1822        case VCPU_SREG_FS:
1823        case VCPU_SREG_GS:
1824                /*
1825                 * The accessed bit must always be set in the segment
1826                 * descriptor cache: although it can be cleared in the
1827                 * in-memory descriptor, the cached bit always remains 1.
1828                 * Since Intel has a check on this, set it here to support
1829                 * cross-vendor migration.
1830                 */
1831                if (!var->unusable)
1832                        var->type |= 0x1;
1833                break;
1834        case VCPU_SREG_SS:
1835                /*
1836                 * On AMD CPUs sometimes the DB bit in the segment
1837                 * descriptor is left as 1, although the whole segment has
1838                 * been made unusable. Clear it here to pass an Intel VMX
1839                 * entry check when cross vendor migrating.
1840                 */
1841                if (var->unusable)
1842                        var->db = 0;
1843                /* This is symmetric with svm_set_segment() */
1844                var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1845                break;
1846        }
1847}
1848
1849static int svm_get_cpl(struct kvm_vcpu *vcpu)
1850{
1851        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1852
1853        return save->cpl;
1854}
1855
1856static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1857{
1858        struct vcpu_svm *svm = to_svm(vcpu);
1859
1860        dt->size = svm->vmcb->save.idtr.limit;
1861        dt->address = svm->vmcb->save.idtr.base;
1862}
1863
1864static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1865{
1866        struct vcpu_svm *svm = to_svm(vcpu);
1867
1868        svm->vmcb->save.idtr.limit = dt->size;
1869        svm->vmcb->save.idtr.base = dt->address;
1870        mark_dirty(svm->vmcb, VMCB_DT);
1871}
1872
1873static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1874{
1875        struct vcpu_svm *svm = to_svm(vcpu);
1876
1877        dt->size = svm->vmcb->save.gdtr.limit;
1878        dt->address = svm->vmcb->save.gdtr.base;
1879}
1880
1881static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1882{
1883        struct vcpu_svm *svm = to_svm(vcpu);
1884
1885        svm->vmcb->save.gdtr.limit = dt->size;
1886        svm->vmcb->save.gdtr.base = dt->address;
1887        mark_dirty(svm->vmcb, VMCB_DT);
1888}
1889
1890static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1891{
1892}
1893
1894static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1895{
1896}
1897
1898static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1899{
1900}
1901
1902static void update_cr0_intercept(struct vcpu_svm *svm)
1903{
1904        ulong gcr0 = svm->vcpu.arch.cr0;
1905        u64 *hcr0 = &svm->vmcb->save.cr0;
1906
1907        *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1908                | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1909
1910        mark_dirty(svm->vmcb, VMCB_CR);
1911
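            /*
             * If the CR0 value the guest expects matches the one the hardware
             * uses, nothing needs to be hidden and CR0 accesses need no
             * intercept; otherwise intercept reads and writes so the guest
             * view can be emulated.
             */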
1912        if (gcr0 == *hcr0) {
1913                clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1914                clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1915        } else {
1916                set_cr_intercept(svm, INTERCEPT_CR0_READ);
1917                set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1918        }
1919}
1920
1921static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1922{
1923        struct vcpu_svm *svm = to_svm(vcpu);
1924
1925#ifdef CONFIG_X86_64
1926        if (vcpu->arch.efer & EFER_LME) {
1927                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1928                        vcpu->arch.efer |= EFER_LMA;
1929                        svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1930                }
1931
1932                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1933                        vcpu->arch.efer &= ~EFER_LMA;
1934                        svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1935                }
1936        }
1937#endif
1938        vcpu->arch.cr0 = cr0;
1939
1940        if (!npt_enabled)
1941                cr0 |= X86_CR0_PG | X86_CR0_WP;
1942
1943        /*
1944         * Re-enable caching here because the QEMU BIOS
1945         * does not do it - leaving caching disabled results
1946         * in some delay at reboot.
1947         */
1948        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1949                cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1950        svm->vmcb->save.cr0 = cr0;
1951        mark_dirty(svm->vmcb, VMCB_CR);
1952        update_cr0_intercept(svm);
1953}
1954
1955static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1956{
1957        unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1958        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1959
1960        if (cr4 & X86_CR4_VMXE)
1961                return 1;
1962
1963        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1964                svm_flush_tlb(vcpu);
1965
1966        vcpu->arch.cr4 = cr4;
1967        if (!npt_enabled)
1968                cr4 |= X86_CR4_PAE;
1969        cr4 |= host_cr4_mce;
1970        to_svm(vcpu)->vmcb->save.cr4 = cr4;
1971        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1972        return 0;
1973}
1974
1975static void svm_set_segment(struct kvm_vcpu *vcpu,
1976                            struct kvm_segment *var, int seg)
1977{
1978        struct vcpu_svm *svm = to_svm(vcpu);
1979        struct vmcb_seg *s = svm_seg(vcpu, seg);
1980
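            /*
             * Pack the cached descriptor fields back into the VMCB attribute
             * format; an unusable segment is encoded by clearing its present
             * bit.
             */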
1981        s->base = var->base;
1982        s->limit = var->limit;
1983        s->selector = var->selector;
1984        s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1985        s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1986        s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1987        s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1988        s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1989        s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1990        s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1991        s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1992
1993        /*
1994         * This is always accurate, except if SYSRET returned to a segment
1995         * with SS.DPL != 3.  Intel does not have this quirk, and always
1996         * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1997         * would entail passing the CPL to userspace and back.
1998         */
1999        if (seg == VCPU_SREG_SS)
2000                /* This is symmetric with svm_get_segment() */
2001                svm->vmcb->save.cpl = (var->dpl & 3);
2002
2003        mark_dirty(svm->vmcb, VMCB_SEG);
2004}
2005
2006static void update_bp_intercept(struct kvm_vcpu *vcpu)
2007{
2008        struct vcpu_svm *svm = to_svm(vcpu);
2009
2010        clr_exception_intercept(svm, BP_VECTOR);
2011
2012        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
2013                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2014                        set_exception_intercept(svm, BP_VECTOR);
2015        } else
2016                vcpu->guest_debug = 0;
2017}
2018
2019static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
2020{
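            /*
             * Hand out a fresh ASID from this CPU's pool; once the pool is
             * exhausted, start a new generation and request a flush of all
             * ASIDs.
             */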
2021        if (sd->next_asid > sd->max_asid) {
2022                ++sd->asid_generation;
2023                sd->next_asid = 1;
2024                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
2025        }
2026
2027        svm->asid_generation = sd->asid_generation;
2028        svm->vmcb->control.asid = sd->next_asid++;
2029
2030        mark_dirty(svm->vmcb, VMCB_ASID);
2031}
2032
2033static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
2034{
2035        return to_svm(vcpu)->vmcb->save.dr6;
2036}
2037
2038static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
2039{
2040        struct vcpu_svm *svm = to_svm(vcpu);
2041
2042        svm->vmcb->save.dr6 = value;
2043        mark_dirty(svm->vmcb, VMCB_DR);
2044}
2045
2046static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
2047{
2048        struct vcpu_svm *svm = to_svm(vcpu);
2049
2050        get_debugreg(vcpu->arch.db[0], 0);
2051        get_debugreg(vcpu->arch.db[1], 1);
2052        get_debugreg(vcpu->arch.db[2], 2);
2053        get_debugreg(vcpu->arch.db[3], 3);
2054        vcpu->arch.dr6 = svm_get_dr6(vcpu);
2055        vcpu->arch.dr7 = svm->vmcb->save.dr7;
2056
2057        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2058        set_dr_intercepts(svm);
2059}
2060
2061static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
2062{
2063        struct vcpu_svm *svm = to_svm(vcpu);
2064
2065        svm->vmcb->save.dr7 = value;
2066        mark_dirty(svm->vmcb, VMCB_DR);
2067}
2068
2069static int pf_interception(struct vcpu_svm *svm)
2070{
2071        u64 fault_address = svm->vmcb->control.exit_info_2;
2072        u64 error_code;
2073        int r = 1;
2074
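            /*
             * A zero apf_reason means a regular guest #PF handled through the
             * MMU; the KVM_PV_REASON_* values are async page fault
             * notifications and are forwarded to the async PF machinery.
             */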
2075        switch (svm->apf_reason) {
2076        default:
2077                error_code = svm->vmcb->control.exit_info_1;
2078
2079                trace_kvm_page_fault(fault_address, error_code);
2080                if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
2081                        kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
2082                r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
2083                        svm->vmcb->control.insn_bytes,
2084                        svm->vmcb->control.insn_len);
2085                break;
2086        case KVM_PV_REASON_PAGE_NOT_PRESENT:
2087                svm->apf_reason = 0;
2088                local_irq_disable();
2089                kvm_async_pf_task_wait(fault_address);
2090                local_irq_enable();
2091                break;
2092        case KVM_PV_REASON_PAGE_READY:
2093                svm->apf_reason = 0;
2094                local_irq_disable();
2095                kvm_async_pf_task_wake(fault_address);
2096                local_irq_enable();
2097                break;
2098        }
2099        return r;
2100}
2101
2102static int db_interception(struct vcpu_svm *svm)
2103{
2104        struct kvm_run *kvm_run = svm->vcpu.run;
2105
2106        if (!(svm->vcpu.guest_debug &
2107              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2108                !svm->nmi_singlestep) {
2109                kvm_queue_exception(&svm->vcpu, DB_VECTOR);
2110                return 1;
2111        }
2112
2113        if (svm->nmi_singlestep) {
2114                svm->nmi_singlestep = false;
2115                if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
2116                        svm->vmcb->save.rflags &=
2117                                ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2118        }
2119
2120        if (svm->vcpu.guest_debug &
2121            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2122                kvm_run->exit_reason = KVM_EXIT_DEBUG;
2123                kvm_run->debug.arch.pc =
2124                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2125                kvm_run->debug.arch.exception = DB_VECTOR;
2126                return 0;
2127        }
2128
2129        return 1;
2130}
2131
2132static int bp_interception(struct vcpu_svm *svm)
2133{
2134        struct kvm_run *kvm_run = svm->vcpu.run;
2135
2136        kvm_run->exit_reason = KVM_EXIT_DEBUG;
2137        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2138        kvm_run->debug.arch.exception = BP_VECTOR;
2139        return 0;
2140}
2141
2142static int ud_interception(struct vcpu_svm *svm)
2143{
2144        int er;
2145
2146        er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
2147        if (er != EMULATE_DONE)
2148                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2149        return 1;
2150}
2151
2152static int ac_interception(struct vcpu_svm *svm)
2153{
2154        kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
2155        return 1;
2156}
2157
2158static bool is_erratum_383(void)
2159{
2160        int err, i;
2161        u64 value;
2162
2163        if (!erratum_383_found)
2164                return false;
2165
2166        value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2167        if (err)
2168                return false;
2169
2170        /* Bit 62 may or may not be set for this mce */
2171        value &= ~(1ULL << 62);
2172
2173        if (value != 0xb600000000010015ULL)
2174                return false;
2175
2176        /* Clear MCi_STATUS registers */
2177        for (i = 0; i < 6; ++i)
2178                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2179
2180        value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2181        if (!err) {
2182                u32 low, high;
2183
2184                value &= ~(1ULL << 2);
2185                low    = lower_32_bits(value);
2186                high   = upper_32_bits(value);
2187
2188                native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2189        }
2190
2191        /* Flush tlb to evict multi-match entries */
2192        __flush_tlb_all();
2193
2194        return true;
2195}
2196
2197static void svm_handle_mce(struct vcpu_svm *svm)
2198{
2199        if (is_erratum_383()) {
2200                /*
2201                 * Erratum 383 triggered. Guest state is corrupt so kill the
2202                 * guest.
2203                 */
2204                pr_err("KVM: Guest triggered AMD Erratum 383\n");
2205
2206                kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
2207
2208                return;
2209        }
2210
2211        /*
2212         * On an #MC intercept the MCE handler is not called automatically in
2213         * the host. So do it by hand here.
2214         */
2215        asm volatile (
2216                "int $0x12\n");
2217        /* not sure if we ever come back to this point */
2218
2219        return;
2220}
2221
2222static int mc_interception(struct vcpu_svm *svm)
2223{
2224        return 1;
2225}
2226
2227static int shutdown_interception(struct vcpu_svm *svm)
2228{
2229        struct kvm_run *kvm_run = svm->vcpu.run;
2230
2231        /*
2232         * VMCB is undefined after a SHUTDOWN intercept
2233         * so reinitialize it.
2234         */
2235        clear_page(svm->vmcb);
2236        init_vmcb(svm);
2237
2238        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2239        return 0;
2240}
2241
2242static int io_interception(struct vcpu_svm *svm)
2243{
2244        struct kvm_vcpu *vcpu = &svm->vcpu;
2245        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2246        int size, in, string;
2247        unsigned port;
2248
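            /*
             * String I/O goes through the instruction emulator; plain IN/OUT
             * takes the fast path, with exit_info_2 supplying the next RIP.
             */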
2249        ++svm->vcpu.stat.io_exits;
2250        string = (io_info & SVM_IOIO_STR_MASK) != 0;
2251        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2252        if (string)
2253                return emulate_instruction(vcpu, 0) == EMULATE_DONE;
2254
2255        port = io_info >> 16;
2256        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2257        svm->next_rip = svm->vmcb->control.exit_info_2;
2258        skip_emulated_instruction(&svm->vcpu);
2259
2260        return in ? kvm_fast_pio_in(vcpu, size, port)
2261                  : kvm_fast_pio_out(vcpu, size, port);
2262}
2263
2264static int nmi_interception(struct vcpu_svm *svm)
2265{
2266        return 1;
2267}
2268
2269static int intr_interception(struct vcpu_svm *svm)
2270{
2271        ++svm->vcpu.stat.irq_exits;
2272        return 1;
2273}
2274
2275static int nop_on_interception(struct vcpu_svm *svm)
2276{
2277        return 1;
2278}
2279
2280static int halt_interception(struct vcpu_svm *svm)
2281{
2282        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
2283        return kvm_emulate_halt(&svm->vcpu);
2284}
2285
2286static int vmmcall_interception(struct vcpu_svm *svm)
2287{
2288        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2289        return kvm_emulate_hypercall(&svm->vcpu);
2290}
2291
2292static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
2293{
2294        struct vcpu_svm *svm = to_svm(vcpu);
2295
2296        return svm->nested.nested_cr3;
2297}
2298
2299static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
2300{
2301        struct vcpu_svm *svm = to_svm(vcpu);
2302        u64 cr3 = svm->nested.nested_cr3;
2303        u64 pdpte;
2304        int ret;
2305
2306        ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
2307                                       offset_in_page(cr3) + index * 8, 8);
2308        if (ret)
2309                return 0;
2310        return pdpte;
2311}
2312
2313static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
2314                                   unsigned long root)
2315{
2316        struct vcpu_svm *svm = to_svm(vcpu);
2317
2318        svm->vmcb->control.nested_cr3 = root;
2319        mark_dirty(svm->vmcb, VMCB_NPT);
2320        svm_flush_tlb(vcpu);
2321}
2322
2323static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2324                                       struct x86_exception *fault)
2325{
2326        struct vcpu_svm *svm = to_svm(vcpu);
2327
2328        if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
2329                /*
2330                 * TODO: track the cause of the nested page fault, and
2331                 * correctly fill in the high bits of exit_info_1.
2332                 */
2333                svm->vmcb->control.exit_code = SVM_EXIT_NPF;
2334                svm->vmcb->control.exit_code_hi = 0;
2335                svm->vmcb->control.exit_info_1 = (1ULL << 32);
2336                svm->vmcb->control.exit_info_2 = fault->address;
2337        }
2338
2339        svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
2340        svm->vmcb->control.exit_info_1 |= fault->error_code;
2341
2342        /*
2343         * The present bit is always zero for page structure faults on real
2344         * hardware.
2345         */
2346        if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
2347                svm->vmcb->control.exit_info_1 &= ~1;
2348
2349        nested_svm_vmexit(svm);
2350}
2351
2352static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2353{
2354        WARN_ON(mmu_is_nested(vcpu));
2355        kvm_init_shadow_mmu(vcpu);
2356        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
2357        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
2358        vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
2359        vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
2360        vcpu->arch.mmu.shadow_root_level = get_npt_level();
2361        reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
2362        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
2363}
2364
2365static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2366{
2367        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
2368}
2369
2370static int nested_svm_check_permissions(struct vcpu_svm *svm)
2371{
2372        if (!(svm->vcpu.arch.efer & EFER_SVME)
2373            || !is_paging(&svm->vcpu)) {
2374                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2375                return 1;
2376        }
2377
2378        if (svm->vmcb->save.cpl) {
2379                kvm_inject_gp(&svm->vcpu, 0);
2380                return 1;
2381        }
2382
2383        return 0;
2384}
2385
2386static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2387                                      bool has_error_code, u32 error_code)
2388{
2389        int vmexit;
2390
2391        if (!is_guest_mode(&svm->vcpu))
2392                return 0;
2393
2394        svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
2395        svm->vmcb->control.exit_code_hi = 0;
2396        svm->vmcb->control.exit_info_1 = error_code;
2397        svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
2398
2399        vmexit = nested_svm_intercept(svm);
2400        if (vmexit == NESTED_EXIT_DONE)
2401                svm->nested.exit_required = true;
2402
2403        return vmexit;
2404}
2405
2406/* This function returns true if it is safe to enable the irq window */
2407static inline bool nested_svm_intr(struct vcpu_svm *svm)
2408{
2409        if (!is_guest_mode(&svm->vcpu))
2410                return true;
2411
2412        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2413                return true;
2414
2415        if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
2416                return false;
2417
2418        /*
2419         * if vmexit was already requested (by intercepted exception
2420         * for instance) do not overwrite it with "external interrupt"
2421         * vmexit.
2422         */
2423        if (svm->nested.exit_required)
2424                return false;
2425
2426        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2427        svm->vmcb->control.exit_info_1 = 0;
2428        svm->vmcb->control.exit_info_2 = 0;
2429
2430        if (svm->nested.intercept & 1ULL) {
2431                /*
2432                 * The #vmexit can't be emulated here directly because this
2433                 * code path runs with irqs and preemption disabled. A
2434                 * #vmexit emulation might sleep. Only signal request for
2435                 * the #vmexit here.
2436                 */
2437                svm->nested.exit_required = true;
2438                trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2439                return false;
2440        }
2441
2442        return true;
2443}
2444
2445/* This function returns true if it is safe to enable the nmi window */
2446static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2447{
2448        if (!is_guest_mode(&svm->vcpu))
2449                return true;
2450
2451        if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2452                return true;
2453
2454        svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2455        svm->nested.exit_required = true;
2456
2457        return false;
2458}
2459
2460static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2461{
2462        struct page *page;
2463
2464        might_sleep();
2465
2466        page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
2467        if (is_error_page(page))
2468                goto error;
2469
2470        *_page = page;
2471
2472        return kmap(page);
2473
2474error:
2475        kvm_inject_gp(&svm->vcpu, 0);
2476
2477        return NULL;
2478}
2479
2480static void nested_svm_unmap(struct page *page)
2481{
2482        kunmap(page);
2483        kvm_release_page_dirty(page);
2484}
2485
2486static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2487{
2488        unsigned port, size, iopm_len;
2489        u16 val, mask;
2490        u8 start_bit;
2491        u64 gpa;
2492
2493        if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2494                return NESTED_EXIT_HOST;
2495
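            /*
             * Look up the permission bit for this port in L1's I/O permission
             * bitmap: one bit per port, so an access that crosses a byte
             * boundary needs two bytes from the bitmap.
             */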
2496        port = svm->vmcb->control.exit_info_1 >> 16;
2497        size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
2498                SVM_IOIO_SIZE_SHIFT;
2499        gpa  = svm->nested.vmcb_iopm + (port / 8);
2500        start_bit = port % 8;
2501        iopm_len = (start_bit + size > 8) ? 2 : 1;
2502        mask = (0xf >> (4 - size)) << start_bit;
2503        val = 0;
2504
2505        if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
2506                return NESTED_EXIT_DONE;
2507
2508        return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2509}
2510
2511static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2512{
2513        u32 offset, msr, value;
2514        int write, mask;
2515
2516        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2517                return NESTED_EXIT_HOST;
2518
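            /*
             * Each MSR has a pair of read/write permission bits in L1's MSR
             * permission map; find the 32-bit chunk for this MSR and test the
             * relevant bit.
             */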
2519        msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2520        offset = svm_msrpm_offset(msr);
2521        write  = svm->vmcb->control.exit_info_1 & 1;
2522        mask   = 1 << ((2 * (msr & 0xf)) + write);
2523
2524        if (offset == MSR_INVALID)
2525                return NESTED_EXIT_DONE;
2526
2527        /* Offset is in 32 bit units but we need it in byte (8 bit) units */
2528        offset *= 4;
2529
2530        if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
2531                return NESTED_EXIT_DONE;
2532
2533        return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2534}
2535
2536static int nested_svm_exit_special(struct vcpu_svm *svm)
2537{
2538        u32 exit_code = svm->vmcb->control.exit_code;
2539
2540        switch (exit_code) {
2541        case SVM_EXIT_INTR:
2542        case SVM_EXIT_NMI:
2543        case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2544                return NESTED_EXIT_HOST;
2545        case SVM_EXIT_NPF:
2546                /* For now we are always handling NPFs when using them */
2547                if (npt_enabled)
2548                        return NESTED_EXIT_HOST;
2549                break;
2550        case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2551                /* When we're shadowing, trap PFs, but not async PF */
2552                if (!npt_enabled && svm->apf_reason == 0)
2553                        return NESTED_EXIT_HOST;
2554                break;
2555        default:
2556                break;
2557        }
2558
2559        return NESTED_EXIT_CONTINUE;
2560}
2561
2562/*
2563 * Returns NESTED_EXIT_DONE if the L1 hypervisor intercepts this #vmexit and a nested #VMEXIT has to be emulated
2564 */
2565static int nested_svm_intercept(struct vcpu_svm *svm)
2566{
2567        u32 exit_code = svm->vmcb->control.exit_code;
2568        int vmexit = NESTED_EXIT_HOST;
2569
2570        switch (exit_code) {
2571        case SVM_EXIT_MSR:
2572                vmexit = nested_svm_exit_handled_msr(svm);
2573                break;
2574        case SVM_EXIT_IOIO:
2575                vmexit = nested_svm_intercept_ioio(svm);
2576                break;
2577        case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2578                u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2579                if (svm->nested.intercept_cr & bit)
2580                        vmexit = NESTED_EXIT_DONE;
2581                break;
2582        }
2583        case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2584                u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2585                if (svm->nested.intercept_dr & bit)
2586                        vmexit = NESTED_EXIT_DONE;
2587                break;
2588        }
2589        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2590                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2591                if (svm->nested.intercept_exceptions & excp_bits)
2592                        vmexit = NESTED_EXIT_DONE;
2593                /* async page fault always causes a vmexit */
2594                else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2595                         svm->apf_reason != 0)
2596                        vmexit = NESTED_EXIT_DONE;
2597                break;
2598        }
2599        case SVM_EXIT_ERR: {
2600                vmexit = NESTED_EXIT_DONE;
2601                break;
2602        }
2603        default: {
2604                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2605                if (svm->nested.intercept & exit_bits)
2606                        vmexit = NESTED_EXIT_DONE;
2607        }
2608        }
2609
2610        return vmexit;
2611}
2612
2613static int nested_svm_exit_handled(struct vcpu_svm *svm)
2614{
2615        int vmexit;
2616
2617        vmexit = nested_svm_intercept(svm);
2618
2619        if (vmexit == NESTED_EXIT_DONE)
2620                nested_svm_vmexit(svm);
2621
2622        return vmexit;
2623}
2624
2625static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2626{
2627        struct vmcb_control_area *dst  = &dst_vmcb->control;
2628        struct vmcb_control_area *from = &from_vmcb->control;
2629
2630        dst->intercept_cr         = from->intercept_cr;
2631        dst->intercept_dr         = from->intercept_dr;
2632        dst->intercept_exceptions = from->intercept_exceptions;
2633        dst->intercept            = from->intercept;
2634        dst->iopm_base_pa         = from->iopm_base_pa;
2635        dst->msrpm_base_pa        = from->msrpm_base_pa;
2636        dst->tsc_offset           = from->tsc_offset;
2637        dst->asid                 = from->asid;
2638        dst->tlb_ctl              = from->tlb_ctl;
2639        dst->int_ctl              = from->int_ctl;
2640        dst->int_vector           = from->int_vector;
2641        dst->int_state            = from->int_state;
2642        dst->exit_code            = from->exit_code;
2643        dst->exit_code_hi         = from->exit_code_hi;
2644        dst->exit_info_1          = from->exit_info_1;
2645        dst->exit_info_2          = from->exit_info_2;
2646        dst->exit_int_info        = from->exit_int_info;
2647        dst->exit_int_info_err    = from->exit_int_info_err;
2648        dst->nested_ctl           = from->nested_ctl;
2649        dst->event_inj            = from->event_inj;
2650        dst->event_inj_err        = from->event_inj_err;
2651        dst->nested_cr3           = from->nested_cr3;
2652        dst->lbr_ctl              = from->lbr_ctl;
2653}
2654
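    /*
     * Emulate #VMEXIT: copy the current (L2) state from the hardware VMCB
     * back into L1's nested VMCB, then restore the L1 state that was stashed
     * in hsave at VMRUN time.
     */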
2655static int nested_svm_vmexit(struct vcpu_svm *svm)
2656{
2657        struct vmcb *nested_vmcb;
2658        struct vmcb *hsave = svm->nested.hsave;
2659        struct vmcb *vmcb = svm->vmcb;
2660        struct page *page;
2661
2662        trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2663                                       vmcb->control.exit_info_1,
2664                                       vmcb->control.exit_info_2,
2665                                       vmcb->control.exit_int_info,
2666                                       vmcb->control.exit_int_info_err,
2667                                       KVM_ISA_SVM);
2668
2669        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2670        if (!nested_vmcb)
2671                return 1;
2672
2673        /* Exit Guest-Mode */
2674        leave_guest_mode(&svm->vcpu);
2675        svm->nested.vmcb = 0;
2676
2677        /* Give the current vmcb to the guest */
2678        disable_gif(svm);
2679
2680        nested_vmcb->save.es     = vmcb->save.es;
2681        nested_vmcb->save.cs     = vmcb->save.cs;
2682        nested_vmcb->save.ss     = vmcb->save.ss;
2683        nested_vmcb->save.ds     = vmcb->save.ds;
2684        nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2685        nested_vmcb->save.idtr   = vmcb->save.idtr;
2686        nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2687        nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2688        nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2689        nested_vmcb->save.cr2    = vmcb->save.cr2;
2690        nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2691        nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2692        nested_vmcb->save.rip    = vmcb->save.rip;
2693        nested_vmcb->save.rsp    = vmcb->save.rsp;
2694        nested_vmcb->save.rax    = vmcb->save.rax;
2695        nested_vmcb->save.dr7    = vmcb->save.dr7;
2696        nested_vmcb->save.dr6    = vmcb->save.dr6;
2697        nested_vmcb->save.cpl    = vmcb->save.cpl;
2698
2699        nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2700        nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2701        nested_vmcb->control.int_state         = vmcb->control.int_state;
2702        nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2703        nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2704        nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2705        nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2706        nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2707        nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2708
2709        if (svm->nrips_enabled)
2710                nested_vmcb->control.next_rip  = vmcb->control.next_rip;
2711
2712        /*
2713         * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2714         * to make sure that we do not lose injected events. So check event_inj
2715         * here and copy it to exit_int_info if it is valid.
2716         * Exit_int_info and event_inj can't both be valid because the case
2717         * below only happens on a VMRUN instruction intercept which has
2718         * no valid exit_int_info set.
2719         */
2720        if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2721                struct vmcb_control_area *nc = &nested_vmcb->control;
2722
2723                nc->exit_int_info     = vmcb->control.event_inj;
2724                nc->exit_int_info_err = vmcb->control.event_inj_err;
2725        }
2726
2727        nested_vmcb->control.tlb_ctl           = 0;
2728        nested_vmcb->control.event_inj         = 0;
2729        nested_vmcb->control.event_inj_err     = 0;
2730
2731        /* We always set V_INTR_MASKING and remember the old value in hflags */
2732        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2733                nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2734
2735        /* Restore the original control entries */
2736        copy_vmcb_control_area(vmcb, hsave);
2737
2738        kvm_clear_exception_queue(&svm->vcpu);
2739        kvm_clear_interrupt_queue(&svm->vcpu);
2740
2741        svm->nested.nested_cr3 = 0;
2742
2743        /* Restore selected save entries */
2744        svm->vmcb->save.es = hsave->save.es;
2745        svm->vmcb->save.cs = hsave->save.cs;
2746        svm->vmcb->save.ss = hsave->save.ss;
2747        svm->vmcb->save.ds = hsave->save.ds;
2748        svm->vmcb->save.gdtr = hsave->save.gdtr;
2749        svm->vmcb->save.idtr = hsave->save.idtr;
2750        kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2751        svm_set_efer(&svm->vcpu, hsave->save.efer);
2752        svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2753        svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2754        if (npt_enabled) {
2755                svm->vmcb->save.cr3 = hsave->save.cr3;
2756                svm->vcpu.arch.cr3 = hsave->save.cr3;
2757        } else {
2758                (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2759        }
2760        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2761        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2762        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2763        svm->vmcb->save.dr7 = 0;
2764        svm->vmcb->save.cpl = 0;
2765        svm->vmcb->control.exit_int_info = 0;
2766
2767        mark_all_dirty(svm->vmcb);
2768
2769        nested_svm_unmap(page);
2770
2771        nested_svm_uninit_mmu_context(&svm->vcpu);
2772        kvm_mmu_reset_context(&svm->vcpu);
2773        kvm_mmu_load(&svm->vcpu);
2774
2775        return 0;
2776}
2777
2778static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2779{
2780        /*
2781         * This function merges the msr permission bitmaps of kvm and the
2782         * nested vmcb. It is optimized in that it only merges the parts where
2783         * the kvm msr permission bitmap may contain zero bits
2784         */
2785        int i;
2786
2787        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2788                return true;
2789
2790        for (i = 0; i < MSRPM_OFFSETS; i++) {
2791                u32 value, p;
2792                u64 offset;
2793
2794                if (msrpm_offsets[i] == 0xffffffff)
2795                        break;
2796
2797                p      = msrpm_offsets[i];
2798                offset = svm->nested.vmcb_msrpm + (p * 4);
2799
2800                if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
2801                        return false;
2802
2803                svm->nested.msrpm[p] = svm->msrpm[p] | value;
2804        }
2805
2806        svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2807
2808        return true;
2809}
2810
2811static bool nested_vmcb_checks(struct vmcb *vmcb)
2812{
2813        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2814                return false;
2815
2816        if (vmcb->control.asid == 0)
2817                return false;
2818
2819        if (vmcb->control.nested_ctl && !npt_enabled)
2820                return false;
2821
2822        return true;
2823}
2824
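    /*
     * Emulate VMRUN: save L1's current state into hsave, load the guest state
     * and control fields from the nested VMCB into the hardware VMCB and
     * enter guest (L2) mode.
     */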
2825static bool nested_svm_vmrun(struct vcpu_svm *svm)
2826{
2827        struct vmcb *nested_vmcb;
2828        struct vmcb *hsave = svm->nested.hsave;
2829        struct vmcb *vmcb = svm->vmcb;
2830        struct page *page;
2831        u64 vmcb_gpa;
2832
2833        vmcb_gpa = svm->vmcb->save.rax;
2834
2835        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2836        if (!nested_vmcb)
2837                return false;
2838
2839        if (!nested_vmcb_checks(nested_vmcb)) {
2840                nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2841                nested_vmcb->control.exit_code_hi = 0;
2842                nested_vmcb->control.exit_info_1  = 0;
2843                nested_vmcb->control.exit_info_2  = 0;
2844
2845                nested_svm_unmap(page);
2846
2847                return false;
2848        }
2849
2850        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2851                               nested_vmcb->save.rip,
2852                               nested_vmcb->control.int_ctl,
2853                               nested_vmcb->control.event_inj,
2854                               nested_vmcb->control.nested_ctl);
2855
2856        trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2857                                    nested_vmcb->control.intercept_cr >> 16,
2858                                    nested_vmcb->control.intercept_exceptions,
2859                                    nested_vmcb->control.intercept);
2860
2861        /* Clear internal status */
2862        kvm_clear_exception_queue(&svm->vcpu);
2863        kvm_clear_interrupt_queue(&svm->vcpu);
2864
2865        /*
2866         * Save the old vmcb, so we don't need to pick what we save, but can
2867         * restore everything when a VMEXIT occurs
2868         */
2869        hsave->save.es     = vmcb->save.es;
2870        hsave->save.cs     = vmcb->save.cs;
2871        hsave->save.ss     = vmcb->save.ss;
2872        hsave->save.ds     = vmcb->save.ds;
2873        hsave->save.gdtr   = vmcb->save.gdtr;
2874        hsave->save.idtr   = vmcb->save.idtr;
2875        hsave->save.efer   = svm->vcpu.arch.efer;
2876        hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2877        hsave->save.cr4    = svm->vcpu.arch.cr4;
2878        hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2879        hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2880        hsave->save.rsp    = vmcb->save.rsp;
2881        hsave->save.rax    = vmcb->save.rax;
2882        if (npt_enabled)
2883                hsave->save.cr3    = vmcb->save.cr3;
2884        else
2885                hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2886
2887        copy_vmcb_control_area(hsave, vmcb);
2888
2889        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2890                svm->vcpu.arch.hflags |= HF_HIF_MASK;
2891        else
2892                svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2893
2894        if (nested_vmcb->control.nested_ctl) {
2895                kvm_mmu_unload(&svm->vcpu);
2896                svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2897                nested_svm_init_mmu_context(&svm->vcpu);
2898        }
2899
2900        /* Load the nested guest state */
2901        svm->vmcb->save.es = nested_vmcb->save.es;
2902        svm->vmcb->save.cs = nested_vmcb->save.cs;
2903        svm->vmcb->save.ss = nested_vmcb->save.ss;
2904        svm->vmcb->save.ds = nested_vmcb->save.ds;
2905        svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2906        svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2907        kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2908        svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2909        svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2910        svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2911        if (npt_enabled) {
2912                svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2913                svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2914        } else
2915                (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2916
2917        /* Guest paging mode is active - reset mmu */
2918        kvm_mmu_reset_context(&svm->vcpu);
2919
2920        svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2921        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2922        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2923        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2924
2925        /* In case we don't even reach vcpu_run, the fields are not updated */
2926        svm->vmcb->save.rax = nested_vmcb->save.rax;
2927        svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2928        svm->vmcb->save.rip = nested_vmcb->save.rip;
2929        svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2930        svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2931        svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2932
2933        svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2934        svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2935
2936        /* cache intercepts */
2937        svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2938        svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2939        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2940        svm->nested.intercept            = nested_vmcb->control.intercept;
2941
2942        svm_flush_tlb(&svm->vcpu);
2943        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2944        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2945                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2946        else
2947                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2948
2949        if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2950                /* We only want the cr8 intercept bits of the guest */
2951                clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2952                clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2953        }
2954
2955        /* We don't want to see VMMCALLs from a nested guest */
2956        clr_intercept(svm, INTERCEPT_VMMCALL);
2957
2958        svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2959        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2960        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2961        svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2962        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2963        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2964
2965        nested_svm_unmap(page);
2966
2967        /* Enter Guest-Mode */
2968        enter_guest_mode(&svm->vcpu);
2969
2970        /*
2971         * Merge guest and host intercepts - must be called with vcpu in
2972         * guest-mode to take effect here
2973         */
2974        recalc_intercepts(svm);
2975
2976        svm->nested.vmcb = vmcb_gpa;
2977
2978        enable_gif(svm);
2979
2980        mark_all_dirty(svm->vmcb);
2981
2982        return true;
2983}
2984
2985static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2986{
2987        to_vmcb->save.fs = from_vmcb->save.fs;
2988        to_vmcb->save.gs = from_vmcb->save.gs;
2989        to_vmcb->save.tr = from_vmcb->save.tr;
2990        to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2991        to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2992        to_vmcb->save.star = from_vmcb->save.star;
2993        to_vmcb->save.lstar = from_vmcb->save.lstar;
2994        to_vmcb->save.cstar = from_vmcb->save.cstar;
2995        to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2996        to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2997        to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2998        to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2999}
3000
3001static int vmload_interception(struct vcpu_svm *svm)
3002{
3003        struct vmcb *nested_vmcb;
3004        struct page *page;
3005
3006        if (nested_svm_check_permissions(svm))
3007                return 1;
3008
3009        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3010        if (!nested_vmcb)
3011                return 1;
3012
3013        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3014        skip_emulated_instruction(&svm->vcpu);
3015
3016        nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
3017        nested_svm_unmap(page);
3018
3019        return 1;
3020}
3021
3022static int vmsave_interception(struct vcpu_svm *svm)
3023{
3024        struct vmcb *nested_vmcb;
3025        struct page *page;
3026
3027        if (nested_svm_check_permissions(svm))
3028                return 1;
3029
3030        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3031        if (!nested_vmcb)
3032                return 1;
3033
3034        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3035        skip_emulated_instruction(&svm->vcpu);
3036
3037        nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
3038        nested_svm_unmap(page);
3039
3040        return 1;
3041}
3042
3043static int vmrun_interception(struct vcpu_svm *svm)
3044{
3045        if (nested_svm_check_permissions(svm))
3046                return 1;
3047
3048        /* Save rip after vmrun instruction */
3049        kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
3050
3051        if (!nested_svm_vmrun(svm))
3052                return 1;
3053
3054        if (!nested_svm_vmrun_msrpm(svm))
3055                goto failed;
3056
3057        return 1;
3058
3059failed:
3060
3061        svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
3062        svm->vmcb->control.exit_code_hi = 0;
3063        svm->vmcb->control.exit_info_1  = 0;
3064        svm->vmcb->control.exit_info_2  = 0;
3065
3066        nested_svm_vmexit(svm);
3067
3068        return 1;
3069}
3070
3071static int stgi_interception(struct vcpu_svm *svm)
3072{
3073        if (nested_svm_check_permissions(svm))
3074                return 1;
3075
3076        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3077        skip_emulated_instruction(&svm->vcpu);
3078        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3079
3080        enable_gif(svm);
3081
3082        return 1;
3083}
3084
3085static int clgi_interception(struct vcpu_svm *svm)
3086{
3087        if (nested_svm_check_permissions(svm))
3088                return 1;
3089
3090        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3091        skip_emulated_instruction(&svm->vcpu);
3092
3093        disable_gif(svm);
3094
3095        /* After a CLGI no interrupts should come */
3096        if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
3097                svm_clear_vintr(svm);
3098                svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3099                mark_dirty(svm->vmcb, VMCB_INTR);
3100        }
3101
3102        return 1;
3103}
3104
3105static int invlpga_interception(struct vcpu_svm *svm)
3106{
3107        struct kvm_vcpu *vcpu = &svm->vcpu;
3108
3109        trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
3110                          kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3111
3112        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
3113        kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3114
3115        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3116        skip_emulated_instruction(&svm->vcpu);
3117        return 1;
3118}
3119
3120static int skinit_interception(struct vcpu_svm *svm)
3121{
3122        trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3123
3124        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3125        return 1;
3126}
3127
3128static int wbinvd_interception(struct vcpu_svm *svm)
3129{
3130        return kvm_emulate_wbinvd(&svm->vcpu);
3131}
3132
3133static int xsetbv_interception(struct vcpu_svm *svm)
3134{
3135        u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
3136        u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3137
3138        if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
3139                svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3140                skip_emulated_instruction(&svm->vcpu);
3141        }
3142
3143        return 1;
3144}
3145
3146static int task_switch_interception(struct vcpu_svm *svm)
3147{
3148        u16 tss_selector;
3149        int reason;
3150        int int_type = svm->vmcb->control.exit_int_info &
3151                SVM_EXITINTINFO_TYPE_MASK;
3152        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
3153        uint32_t type =
3154                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
3155        uint32_t idt_v =
3156                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
3157        bool has_error_code = false;
3158        u32 error_code = 0;
3159
3160        tss_selector = (u16)svm->vmcb->control.exit_info_1;
3161
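            /*
             * Work out why the task switch happened: exit_info_2 flags IRET-
             * and far-JMP-initiated switches, a valid exit_int_info means it
             * came in through a gate, otherwise assume a CALL.
             */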
3162        if (svm->vmcb->control.exit_info_2 &
3163            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
3164                reason = TASK_SWITCH_IRET;
3165        else if (svm->vmcb->control.exit_info_2 &
3166                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
3167                reason = TASK_SWITCH_JMP;
3168        else if (idt_v)
3169                reason = TASK_SWITCH_GATE;
3170        else
3171                reason = TASK_SWITCH_CALL;
3172
3173        if (reason == TASK_SWITCH_GATE) {
3174                switch (type) {
3175                case SVM_EXITINTINFO_TYPE_NMI:
3176                        svm->vcpu.arch.nmi_injected = false;
3177                        break;
3178                case SVM_EXITINTINFO_TYPE_EXEPT:
3179                        if (svm->vmcb->control.exit_info_2 &
3180                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
3181                                has_error_code = true;
3182                                error_code =
3183                                        (u32)svm->vmcb->control.exit_info_2;
3184                        }
3185                        kvm_clear_exception_queue(&svm->vcpu);
3186                        break;
3187                case SVM_EXITINTINFO_TYPE_INTR:
3188                        kvm_clear_interrupt_queue(&svm->vcpu);
3189                        break;
3190                default:
3191                        break;
3192                }
3193        }
3194
3195        if (reason != TASK_SWITCH_GATE ||
3196            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
3197            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
3198             (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
3199                skip_emulated_instruction(&svm->vcpu);
3200
3201        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
3202                int_vec = -1;
3203
3204        if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
3205                                has_error_code, error_code) == EMULATE_FAIL) {
3206                svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3207                svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3208                svm->vcpu.run->internal.ndata = 0;
3209                return 0;
3210        }
3211        return 1;
3212}
3213
3214static int cpuid_interception(struct vcpu_svm *svm)
3215{
3216        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3217        return kvm_emulate_cpuid(&svm->vcpu);
3218}
3219
3220static int iret_interception(struct vcpu_svm *svm)
3221{
3222        ++svm->vcpu.stat.nmi_window_exits;
3223        clr_intercept(svm, INTERCEPT_IRET);
3224        svm->vcpu.arch.hflags |= HF_IRET_MASK;
3225        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
3226        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3227        return 1;
3228}
3229
3230static int invlpg_interception(struct vcpu_svm *svm)
3231{
3232        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3233                return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3234
3235        kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
3236        skip_emulated_instruction(&svm->vcpu);
3237        return 1;
3238}
3239
3240static int emulate_on_interception(struct vcpu_svm *svm)
3241{
3242        return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3243}
3244
3245static int rdpmc_interception(struct vcpu_svm *svm)
3246{
3247        int err;
3248
3249        if (!static_cpu_has(X86_FEATURE_NRIPS))
3250                return emulate_on_interception(svm);
3251
3252        err = kvm_rdpmc(&svm->vcpu);
3253        return kvm_complete_insn_gp(&svm->vcpu, err);
3254}
3255
3256static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
3257                                            unsigned long val)
3258{
3259        unsigned long cr0 = svm->vcpu.arch.cr0;
3260        bool ret = false;
3261        u64 intercept;
3262
3263        intercept = svm->nested.intercept;
3264
3265        if (!is_guest_mode(&svm->vcpu) ||
3266            (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
3267                return false;
3268
3269        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
3270        val &= ~SVM_CR0_SELECTIVE_MASK;
3271
3272        if (cr0 ^ val) {
3273                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
3274                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
3275        }
3276
3277        return ret;
3278}
3279
3280#define CR_VALID (1ULL << 63)
3281
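    /*
     * Handle CR access intercepts using decode assists: exit_info_1 names
     * the GPR operand and the exit code encodes which CR was touched and
     * the direction.  Without decode assists the instruction is emulated.
     */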
3282static int cr_interception(struct vcpu_svm *svm)
3283{
3284        int reg, cr;
3285        unsigned long val;
3286        int err;
3287
3288        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3289                return emulate_on_interception(svm);
3290
3291        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
3292                return emulate_on_interception(svm);
3293
3294        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3295        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
3296                cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
3297        else
3298                cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
3299
3300        err = 0;
3301        if (cr >= 16) { /* mov to cr */
3302                cr -= 16;
3303                val = kvm_register_read(&svm->vcpu, reg);
3304                switch (cr) {
3305                case 0:
3306                        if (!check_selective_cr0_intercepted(svm, val))
3307                                err = kvm_set_cr0(&svm->vcpu, val);
3308                        else
3309                                return 1;
3310
3311                        break;
3312                case 3:
3313                        err = kvm_set_cr3(&svm->vcpu, val);
3314                        break;
3315                case 4:
3316                        err = kvm_set_cr4(&svm->vcpu, val);
3317                        break;
3318                case 8:
3319                        err = kvm_set_cr8(&svm->vcpu, val);
3320                        break;
3321                default:
3322                        WARN(1, "unhandled write to CR%d", cr);
3323                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3324                        return 1;
3325                }
3326        } else { /* mov from cr */
3327                switch (cr) {
3328                case 0:
3329                        val = kvm_read_cr0(&svm->vcpu);
3330                        break;
3331                case 2:
3332                        val = svm->vcpu.arch.cr2;
3333                        break;
3334                case 3:
3335                        val = kvm_read_cr3(&svm->vcpu);
3336                        break;
3337                case 4:
3338                        val = kvm_read_cr4(&svm->vcpu);
3339                        break;
3340                case 8:
3341                        val = kvm_get_cr8(&svm->vcpu);
3342                        break;
3343                default:
3344                        WARN(1, "unhandled read from CR%d", cr);
3345                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3346                        return 1;
3347                }
3348                kvm_register_write(&svm->vcpu, reg, val);
3349        }
3350        return kvm_complete_insn_gp(&svm->vcpu, err);
3351}
3352
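    /*
     * Handle DR access intercepts.  When the guest is not being debugged,
     * the intercepts are dropped and the debug registers are switched
     * lazily; otherwise the decoded access is forwarded to
     * kvm_get_dr()/kvm_set_dr().
     */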
3353static int dr_interception(struct vcpu_svm *svm)
3354{
3355        int reg, dr;
3356        unsigned long val;
3357
3358        if (svm->vcpu.guest_debug == 0) {
3359                /*
3360                 * No more DR vmexits; force a reload of the debug registers
3361                 * and reenter on this instruction.  The next vmexit will
3362                 * retrieve the full state of the debug registers.
3363                 */
3364                clr_dr_intercepts(svm);
3365                svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
3366                return 1;
3367        }
3368
3369        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
3370                return emulate_on_interception(svm);
3371
3372        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3373        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
3374
3375        if (dr >= 16) { /* mov to DRn */
3376                if (!kvm_require_dr(&svm->vcpu, dr - 16))
3377                        return 1;
3378                val = kvm_register_read(&svm->vcpu, reg);
3379                kvm_set_dr(&svm->vcpu, dr - 16, val);
3380        } else {
3381                if (!kvm_require_dr(&svm->vcpu, dr))
3382                        return 1;
3383                kvm_get_dr(&svm->vcpu, dr, &val);
3384                kvm_register_write(&svm->vcpu, reg, val);
3385        }
3386
3387        skip_emulated_instruction(&svm->vcpu);
3388
3389        return 1;
3390}
3391
3392static int cr8_write_interception(struct vcpu_svm *svm)
3393{
3394        struct kvm_run *kvm_run = svm->vcpu.run;
3395        int r;
3396
3397        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3398        /* instruction emulation calls kvm_set_cr8() */
3399        r = cr_interception(svm);
3400        if (lapic_in_kernel(&svm->vcpu))
3401                return r;
3402        if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3403                return r;
3404        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3405        return 0;
3406}
3407
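    /*
     * Read an MSR that SVM keeps in the VMCB save area or in struct
     * vcpu_svm; anything else is deferred to kvm_get_msr_common().
     */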
3408static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3409{
3410        struct vcpu_svm *svm = to_svm(vcpu);
3411
3412        switch (msr_info->index) {
3413        case MSR_IA32_TSC: {
3414                msr_info->data = svm->vmcb->control.tsc_offset +
3415                        kvm_scale_tsc(vcpu, rdtsc());
3416
3417                break;
3418        }
3419        case MSR_STAR:
3420                msr_info->data = svm->vmcb->save.star;
3421                break;
3422#ifdef CONFIG_X86_64
3423        case MSR_LSTAR:
3424                msr_info->data = svm->vmcb->save.lstar;
3425                break;
3426        case MSR_CSTAR:
3427                msr_info->data = svm->vmcb->save.cstar;
3428                break;
3429        case MSR_KERNEL_GS_BASE:
3430                msr_info->data = svm->vmcb->save.kernel_gs_base;
3431                break;
3432        case MSR_SYSCALL_MASK:
3433                msr_info->data = svm->vmcb->save.sfmask;
3434                break;
3435#endif
3436        case MSR_IA32_SYSENTER_CS:
3437                msr_info->data = svm->vmcb->save.sysenter_cs;
3438                break;
3439        case MSR_IA32_SYSENTER_EIP:
3440                msr_info->data = svm->sysenter_eip;
3441                break;
3442        case MSR_IA32_SYSENTER_ESP:
3443                msr_info->data = svm->sysenter_esp;
3444                break;
3445        case MSR_TSC_AUX:
3446                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3447                        return 1;
3448                msr_info->data = svm->tsc_aux;
3449                break;
3450        /*
3451         * Nobody will change the following 5 values in the VMCB so we can
3452         * safely return them on rdmsr. They will always be 0 until LBRV is
3453         * implemented.
3454         */
3455        case MSR_IA32_DEBUGCTLMSR:
3456                msr_info->data = svm->vmcb->save.dbgctl;
3457                break;
3458        case MSR_IA32_LASTBRANCHFROMIP:
3459                msr_info->data = svm->vmcb->save.br_from;
3460                break;
3461        case MSR_IA32_LASTBRANCHTOIP:
3462                msr_info->data = svm->vmcb->save.br_to;
3463                break;
3464        case MSR_IA32_LASTINTFROMIP:
3465                msr_info->data = svm->vmcb->save.last_excp_from;
3466                break;
3467        case MSR_IA32_LASTINTTOIP:
3468                msr_info->data = svm->vmcb->save.last_excp_to;
3469                break;
3470        case MSR_VM_HSAVE_PA:
3471                msr_info->data = svm->nested.hsave_msr;
3472                break;
3473        case MSR_VM_CR:
3474                msr_info->data = svm->nested.vm_cr_msr;
3475                break;
3476        case MSR_IA32_UCODE_REV:
3477                msr_info->data = 0x01000065;
3478                break;
3479        case MSR_F15H_IC_CFG: {
3480
3481                int family, model;
3482
3483                family = guest_cpuid_family(vcpu);
3484                model  = guest_cpuid_model(vcpu);
3485
3486                if (family < 0 || model < 0)
3487                        return kvm_get_msr_common(vcpu, msr_info);
3488
3489                msr_info->data = 0;
3490
3491                if (family == 0x15 &&
3492                    (model >= 0x2 && model < 0x20))
3493                        msr_info->data = 0x1E;
3494                }
3495                break;
3496        default:
3497                return kvm_get_msr_common(vcpu, msr_info);
3498        }
3499        return 0;
3500}
3501
3502static int rdmsr_interception(struct vcpu_svm *svm)
3503{
3504        u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3505        struct msr_data msr_info;
3506
3507        msr_info.index = ecx;
3508        msr_info.host_initiated = false;
3509        if (svm_get_msr(&svm->vcpu, &msr_info)) {
3510                trace_kvm_msr_read_ex(ecx);
3511                kvm_inject_gp(&svm->vcpu, 0);
3512        } else {
3513                trace_kvm_msr_read(ecx, msr_info.data);
3514
3515                kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
3516                                   msr_info.data & 0xffffffff);
3517                kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
3518                                   msr_info.data >> 32);
3519                svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3520                skip_emulated_instruction(&svm->vcpu);
3521        }
3522        return 1;
3523}
3524
3525static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3526{
3527        struct vcpu_svm *svm = to_svm(vcpu);
3528        int svm_dis, chg_mask;
3529
3530        if (data & ~SVM_VM_CR_VALID_MASK)
3531                return 1;
3532
3533        chg_mask = SVM_VM_CR_VALID_MASK;
3534
3535        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3536                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3537
3538        svm->nested.vm_cr_msr &= ~chg_mask;
3539        svm->nested.vm_cr_msr |= (data & chg_mask);
3540
3541        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3542
3543        /* check for svm_disable while efer.svme is set */
3544        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3545                return 1;
3546
3547        return 0;
3548}
3549
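    /*
     * Write an MSR that SVM keeps in the VMCB save area or in struct
     * vcpu_svm; anything else is deferred to kvm_set_msr_common().
     */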
3550static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3551{
3552        struct vcpu_svm *svm = to_svm(vcpu);
3553
3554        u32 ecx = msr->index;
3555        u64 data = msr->data;
3556        switch (ecx) {
3557        case MSR_IA32_TSC:
3558                kvm_write_tsc(vcpu, msr);
3559                break;
3560        case MSR_STAR:
3561                svm->vmcb->save.star = data;
3562                break;
3563#ifdef CONFIG_X86_64
3564        case MSR_LSTAR:
3565                svm->vmcb->save.lstar = data;
3566                break;
3567        case MSR_CSTAR:
3568                svm->vmcb->save.cstar = data;
3569                break;
3570        case MSR_KERNEL_GS_BASE:
3571                svm->vmcb->save.kernel_gs_base = data;
3572                break;
3573        case MSR_SYSCALL_MASK:
3574                svm->vmcb->save.sfmask = data;
3575                break;
3576#endif
3577        case MSR_IA32_SYSENTER_CS:
3578                svm->vmcb->save.sysenter_cs = data;
3579                break;
3580        case MSR_IA32_SYSENTER_EIP:
3581                svm->sysenter_eip = data;
3582                svm->vmcb->save.sysenter_eip = data;
3583                break;
3584        case MSR_IA32_SYSENTER_ESP:
3585                svm->sysenter_esp = data;
3586                svm->vmcb->save.sysenter_esp = data;
3587                break;
3588        case MSR_TSC_AUX:
3589                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3590                        return 1;
3591
3592                /*
3593                 * This is rare, so we update the MSR here instead of using
3594                 * direct_access_msrs.  Doing that would require a rdmsr in
3595                 * svm_vcpu_put.
3596                 */
3597                svm->tsc_aux = data;
3598                wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
3599                break;
3600        case MSR_IA32_DEBUGCTLMSR:
3601                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3602                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3603                                    __func__, data);
3604                        break;
3605                }
3606                if (data & DEBUGCTL_RESERVED_BITS)
3607                        return 1;
3608
3609                svm->vmcb->save.dbgctl = data;
3610                mark_dirty(svm->vmcb, VMCB_LBR);
3611                if (data & (1ULL<<0))
3612                        svm_enable_lbrv(svm);
3613                else
3614                        svm_disable_lbrv(svm);
3615                break;
3616        case MSR_VM_HSAVE_PA:
3617                svm->nested.hsave_msr = data;
3618                break;
3619        case MSR_VM_CR:
3620                return svm_set_vm_cr(vcpu, data);
3621        case MSR_VM_IGNNE:
3622                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3623                break;
3624        case MSR_IA32_APICBASE:
3625                if (kvm_vcpu_apicv_active(vcpu))
3626                        avic_update_vapic_bar(to_svm(vcpu), data);
3627                /* Fall through */
3628        default:
3629                return kvm_set_msr_common(vcpu, msr);
3630        }
3631        return 0;
3632}
3633
3634static int wrmsr_interception(struct vcpu_svm *svm)
3635{
3636        struct msr_data msr;
3637        u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3638        u64 data = kvm_read_edx_eax(&svm->vcpu);
3639
3640        msr.data = data;
3641        msr.index = ecx;
3642        msr.host_initiated = false;
3643
3644        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3645        if (kvm_set_msr(&svm->vcpu, &msr)) {
3646                trace_kvm_msr_write_ex(ecx, data);
3647                kvm_inject_gp(&svm->vcpu, 0);
3648        } else {
3649                trace_kvm_msr_write(ecx, data);
3650                skip_emulated_instruction(&svm->vcpu);
3651        }
3652        return 1;
3653}
3654
3655static int msr_interception(struct vcpu_svm *svm)
3656{
3657        if (svm->vmcb->control.exit_info_1)
3658                return wrmsr_interception(svm);
3659        else
3660                return rdmsr_interception(svm);
3661}
3662
3663static int interrupt_window_interception(struct vcpu_svm *svm)
3664{
3665        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3666        svm_clear_vintr(svm);
3667        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3668        mark_dirty(svm->vmcb, VMCB_INTR);
3669        ++svm->vcpu.stat.irq_window_exits;
3670        return 1;
3671}
3672
3673static int pause_interception(struct vcpu_svm *svm)
3674{
3675        kvm_vcpu_on_spin(&(svm->vcpu));
3676        return 1;
3677}
3678
3679static int nop_interception(struct vcpu_svm *svm)
3680{
3681        skip_emulated_instruction(&(svm->vcpu));
3682        return 1;
3683}
3684
3685static int monitor_interception(struct vcpu_svm *svm)
3686{
3687        printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3688        return nop_interception(svm);
3689}
3690
3691static int mwait_interception(struct vcpu_svm *svm)
3692{
3693        printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3694        return nop_interception(svm);
3695}
3696
3697enum avic_ipi_failure_cause {
3698        AVIC_IPI_FAILURE_INVALID_INT_TYPE,
3699        AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
3700        AVIC_IPI_FAILURE_INVALID_TARGET,
3701        AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
3702};
3703
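    /*
     * Handle an AVIC_INCOMPLETE_IPI #VMEXIT: exit_info_1 carries the ICR
     * value the guest wrote and exit_info_2 carries the failure cause,
     * which determines how the IPI has to be completed in software.
     */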
3704static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
3705{
3706        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
3707        u32 icrl = svm->vmcb->control.exit_info_1;
3708        u32 id = svm->vmcb->control.exit_info_2 >> 32;
3709        u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
3710        struct kvm_lapic *apic = svm->vcpu.arch.apic;
3711
3712        trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
3713
3714        switch (id) {
3715        case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
3716                /*
3717                 * AVIC hardware handles the generation of
3718                 * IPIs when the specified Message Type is Fixed
3719                 * (also known as fixed delivery mode) and
3720                 * the Trigger Mode is edge-triggered. The hardware
3721                 * also supports self and broadcast delivery modes
3722                 * specified via the Destination Shorthand (DSH)
3723                 * field of the ICRL. Logical and physical APIC ID
3724                 * formats are supported. All other IPI types cause
3725                 * a #VMEXIT, which needs to be emulated.
3726                 */
3727                kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
3728                kvm_lapic_reg_write(apic, APIC_ICR, icrl);
3729                break;
3730        case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
3731                int i;
3732                struct kvm_vcpu *vcpu;
3733                struct kvm *kvm = svm->vcpu.kvm;
3734                struct kvm_lapic *apic = svm->vcpu.arch.apic;
3735
3736                /*
3737                 * At this point, we expect that the AVIC HW has already
3738                 * set the appropriate IRR bits on the valid target
3739                 * vcpus. So, we just need to kick the appropriate vcpu.
3740                 */
3741                kvm_for_each_vcpu(i, vcpu, kvm) {
3742                        bool m = kvm_apic_match_dest(vcpu, apic,
3743                                                     icrl & KVM_APIC_SHORT_MASK,
3744                                                     GET_APIC_DEST_FIELD(icrh),
3745                                                     icrl & KVM_APIC_DEST_MASK);
3746
3747                        if (m && !avic_vcpu_is_running(vcpu))
3748                                kvm_vcpu_wake_up(vcpu);
3749                }
3750                break;
3751        }
3752        case AVIC_IPI_FAILURE_INVALID_TARGET:
3753                break;
3754        case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
3755                WARN_ONCE(1, "Invalid backing page\n");
3756                break;
3757        default:
3758                pr_err("Unknown IPI interception\n");
3759        }
3760
3761        return 1;
3762}
3763
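    /*
     * Translate a guest LDR value into a pointer to the corresponding
     * entry of the per-VM AVIC logical APIC ID table, for both flat and
     * cluster destination modes.
     */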
3764static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
3765{
3766        struct kvm_arch *vm_data = &vcpu->kvm->arch;
3767        int index;
3768        u32 *logical_apic_id_table;
3769        int dlid = GET_APIC_LOGICAL_ID(ldr);
3770
3771        if (!dlid)
3772                return NULL;
3773
3774        if (flat) { /* flat */
3775                index = ffs(dlid) - 1;
3776                if (index > 7)
3777                        return NULL;
3778        } else { /* cluster */
3779                int cluster = (dlid & 0xf0) >> 4;
3780                int apic = ffs(dlid & 0x0f) - 1;
3781
3782                if ((apic < 0) || (apic > 7) ||
3783                    (cluster >= 0xf))
3784                        return NULL;
3785                index = (cluster << 2) + apic;
3786        }
3787
3788        logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
3789
3790        return &logical_apic_id_table[index];
3791}
3792
3793static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
3794                          bool valid)
3795{
3796        bool flat;
3797        u32 *entry, new_entry;
3798
3799        flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
3800        entry = avic_get_logical_id_entry(vcpu, ldr, flat);
3801        if (!entry)
3802                return -EINVAL;
3803
3804        new_entry = READ_ONCE(*entry);
3805        new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
3806        new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
3807        if (valid)
3808                new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
3809        else
3810                new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
3811        WRITE_ONCE(*entry, new_entry);
3812
3813        return 0;
3814}
3815
3816static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
3817{
3818        int ret;
3819        struct vcpu_svm *svm = to_svm(vcpu);
3820        u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
3821
3822        if (!ldr)
3823                return 1;
3824
3825        ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
3826        if (ret && svm->ldr_reg) {
3827                avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
3828                svm->ldr_reg = 0;
3829        } else {
3830                svm->ldr_reg = ldr;
3831        }
3832        return ret;
3833}
3834
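    /*
     * The guest changed its APIC ID: move the physical APIC ID table entry
     * to the new slot and refresh the logical table if the LDR was already
     * configured.
     */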
3835static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
3836{
3837        u64 *old, *new;
3838        struct vcpu_svm *svm = to_svm(vcpu);
3839        u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
3840        u32 id = (apic_id_reg >> 24) & 0xff;
3841
3842        if (vcpu->vcpu_id == id)
3843                return 0;
3844
3845        old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
3846        new = avic_get_physical_id_entry(vcpu, id);
3847        if (!new || !old)
3848                return 1;
3849
3850        /* We need to move physical_id_entry to the new offset */
3851        *new = *old;
3852        *old = 0ULL;
3853        to_svm(vcpu)->avic_physical_id_cache = new;
3854
3855        /*
3856         * Also update the guest physical APIC ID in the logical
3857         * APIC ID table entry if the LDR has already been set up.
3858         */
3859        if (svm->ldr_reg)
3860                avic_handle_ldr_update(vcpu);
3861
3862        return 0;
3863}
3864
3865static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
3866{
3867        struct vcpu_svm *svm = to_svm(vcpu);
3868        struct kvm_arch *vm_data = &vcpu->kvm->arch;
3869        u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
3870        u32 mod = (dfr >> 28) & 0xf;
3871
3872        /*
3873         * We assume that all local APICs are using the same type.
3874         * If this changes, we need to flush the AVIC logical
3875         * APIC ID table.
3876         */
3877        if (vm_data->ldr_mode == mod)
3878                return 0;
3879
3880        clear_page(page_address(vm_data->avic_logical_id_table_page));
3881        vm_data->ldr_mode = mod;
3882
3883        if (svm->ldr_reg)
3884                avic_handle_ldr_update(vcpu);
3885        return 0;
3886}
3887
3888static int avic_unaccel_trap_write(struct vcpu_svm *svm)
3889{
3890        struct kvm_lapic *apic = svm->vcpu.arch.apic;
3891        u32 offset = svm->vmcb->control.exit_info_1 &
3892                                AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3893
3894        switch (offset) {
3895        case APIC_ID:
3896                if (avic_handle_apic_id_update(&svm->vcpu))
3897                        return 0;
3898                break;
3899        case APIC_LDR:
3900                if (avic_handle_ldr_update(&svm->vcpu))
3901                        return 0;
3902                break;
3903        case APIC_DFR:
3904                avic_handle_dfr_update(&svm->vcpu);
3905                break;
3906        default:
3907                break;
3908        }
3909
3910        kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
3911
3912        return 1;
3913}
3914
3915static bool is_avic_unaccelerated_access_trap(u32 offset)
3916{
3917        bool ret = false;
3918
3919        switch (offset) {
3920        case APIC_ID:
3921        case APIC_EOI:
3922        case APIC_RRR:
3923        case APIC_LDR:
3924        case APIC_DFR:
3925        case APIC_SPIV:
3926        case APIC_ESR:
3927        case APIC_ICR:
3928        case APIC_LVTT:
3929        case APIC_LVTTHMR:
3930        case APIC_LVTPC:
3931        case APIC_LVT0:
3932        case APIC_LVT1:
3933        case APIC_LVTERR:
3934        case APIC_TMICT:
3935        case APIC_TDCR:
3936                ret = true;
3937                break;
3938        default:
3939                break;
3940        }
3941        return ret;
3942}
3943
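    /*
     * Handle an AVIC unaccelerated access #VMEXIT.  Trap-style exits (the
     * register write has already happened) only need the AVIC tables
     * brought back in sync; fault-style exits emulate the access instead.
     */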
3944static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
3945{
3946        int ret = 0;
3947        u32 offset = svm->vmcb->control.exit_info_1 &
3948                     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
3949        u32 vector = svm->vmcb->control.exit_info_2 &
3950                     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
3951        bool write = (svm->vmcb->control.exit_info_1 >> 32) &
3952                     AVIC_UNACCEL_ACCESS_WRITE_MASK;
3953        bool trap = is_avic_unaccelerated_access_trap(offset);
3954
3955        trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
3956                                            trap, write, vector);
3957        if (trap) {
3958                /* Handling Trap */
3959                WARN_ONCE(!write, "svm: Handling trap read.\n");
3960                ret = avic_unaccel_trap_write(svm);
3961        } else {
3962                /* Handling Fault */
3963                ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
3964        }
3965
3966        return ret;
3967}
3968
3969static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3970        [SVM_EXIT_READ_CR0]                     = cr_interception,
3971        [SVM_EXIT_READ_CR3]                     = cr_interception,
3972        [SVM_EXIT_READ_CR4]                     = cr_interception,
3973        [SVM_EXIT_READ_CR8]                     = cr_interception,
3974        [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3975        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3976        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3977        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3978        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3979        [SVM_EXIT_READ_DR0]                     = dr_interception,
3980        [SVM_EXIT_READ_DR1]                     = dr_interception,
3981        [SVM_EXIT_READ_DR2]                     = dr_interception,
3982        [SVM_EXIT_READ_DR3]                     = dr_interception,
3983        [SVM_EXIT_READ_DR4]                     = dr_interception,
3984        [SVM_EXIT_READ_DR5]                     = dr_interception,
3985        [SVM_EXIT_READ_DR6]                     = dr_interception,
3986        [SVM_EXIT_READ_DR7]                     = dr_interception,
3987        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3988        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3989        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3990        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3991        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3992        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3993        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3994        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3995        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3996        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3997        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3998        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3999        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
4000        [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
4001        [SVM_EXIT_INTR]                         = intr_interception,
4002        [SVM_EXIT_NMI]                          = nmi_interception,
4003        [SVM_EXIT_SMI]                          = nop_on_interception,
4004        [SVM_EXIT_INIT]                         = nop_on_interception,
4005        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
4006        [SVM_EXIT_RDPMC]                        = rdpmc_interception,
4007        [SVM_EXIT_CPUID]                        = cpuid_interception,
4008        [SVM_EXIT_IRET]                         = iret_interception,
4009        [SVM_EXIT_INVD]                         = emulate_on_interception,
4010        [SVM_EXIT_PAUSE]                        = pause_interception,
4011        [SVM_EXIT_HLT]                          = halt_interception,
4012        [SVM_EXIT_INVLPG]                       = invlpg_interception,
4013        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
4014        [SVM_EXIT_IOIO]                         = io_interception,
4015        [SVM_EXIT_MSR]                          = msr_interception,
4016        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
4017        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
4018        [SVM_EXIT_VMRUN]                        = vmrun_interception,
4019        [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
4020        [SVM_EXIT_VMLOAD]                       = vmload_interception,
4021        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
4022        [SVM_EXIT_STGI]                         = stgi_interception,
4023        [SVM_EXIT_CLGI]                         = clgi_interception,
4024        [SVM_EXIT_SKINIT]                       = skinit_interception,
4025        [SVM_EXIT_WBINVD]                       = wbinvd_interception,
4026        [SVM_EXIT_MONITOR]                      = monitor_interception,
4027        [SVM_EXIT_MWAIT]                        = mwait_interception,
4028        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
4029        [SVM_EXIT_NPF]                          = pf_interception,
4030        [SVM_EXIT_RSM]                          = emulate_on_interception,
4031        [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
4032        [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
4033};
4034
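    /* Dump the VMCB control and state save areas for debugging. */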
4035static void dump_vmcb(struct kvm_vcpu *vcpu)
4036{
4037        struct vcpu_svm *svm = to_svm(vcpu);
4038        struct vmcb_control_area *control = &svm->vmcb->control;
4039        struct vmcb_save_area *save = &svm->vmcb->save;
4040
4041        pr_err("VMCB Control Area:\n");
4042        pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
4043        pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
4044        pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
4045        pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
4046        pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
4047        pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
4048        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
4049        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
4050        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
4051        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
4052        pr_err("%-20s%d\n", "asid:", control->asid);
4053        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
4054        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
4055        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
4056        pr_err("%-20s%08x\n", "int_state:", control->int_state);
4057        pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
4058        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
4059        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
4060        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
4061        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
4062        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
4063        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
4064        pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
4065        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
4066        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
4067        pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
4068        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
4069        pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
4070        pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
4071        pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
4072        pr_err("VMCB State Save Area:\n");
4073        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4074               "es:",
4075               save->es.selector, save->es.attrib,
4076               save->es.limit, save->es.base);
4077        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4078               "cs:",
4079               save->cs.selector, save->cs.attrib,
4080               save->cs.limit, save->cs.base);
4081        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4082               "ss:",
4083               save->ss.selector, save->ss.attrib,
4084               save->ss.limit, save->ss.base);
4085        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4086               "ds:",
4087               save->ds.selector, save->ds.attrib,
4088               save->ds.limit, save->ds.base);
4089        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4090               "fs:",
4091               save->fs.selector, save->fs.attrib,
4092               save->fs.limit, save->fs.base);
4093        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4094               "gs:",
4095               save->gs.selector, save->gs.attrib,
4096               save->gs.limit, save->gs.base);
4097        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4098               "gdtr:",
4099               save->gdtr.selector, save->gdtr.attrib,
4100               save->gdtr.limit, save->gdtr.base);
4101        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4102               "ldtr:",
4103               save->ldtr.selector, save->ldtr.attrib,
4104               save->ldtr.limit, save->ldtr.base);
4105        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4106               "idtr:",
4107               save->idtr.selector, save->idtr.attrib,
4108               save->idtr.limit, save->idtr.base);
4109        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4110               "tr:",
4111               save->tr.selector, save->tr.attrib,
4112               save->tr.limit, save->tr.base);
4113        pr_err("cpl:            %d                efer:         %016llx\n",
4114                save->cpl, save->efer);
4115        pr_err("%-15s %016llx %-13s %016llx\n",
4116               "cr0:", save->cr0, "cr2:", save->cr2);
4117        pr_err("%-15s %016llx %-13s %016llx\n",
4118               "cr3:", save->cr3, "cr4:", save->cr4);
4119        pr_err("%-15s %016llx %-13s %016llx\n",
4120               "dr6:", save->dr6, "dr7:", save->dr7);
4121        pr_err("%-15s %016llx %-13s %016llx\n",
4122               "rip:", save->rip, "rflags:", save->rflags);
4123        pr_err("%-15s %016llx %-13s %016llx\n",
4124               "rsp:", save->rsp, "rax:", save->rax);
4125        pr_err("%-15s %016llx %-13s %016llx\n",
4126               "star:", save->star, "lstar:", save->lstar);
4127        pr_err("%-15s %016llx %-13s %016llx\n",
4128               "cstar:", save->cstar, "sfmask:", save->sfmask);
4129        pr_err("%-15s %016llx %-13s %016llx\n",
4130               "kernel_gs_base:", save->kernel_gs_base,
4131               "sysenter_cs:", save->sysenter_cs);
4132        pr_err("%-15s %016llx %-13s %016llx\n",
4133               "sysenter_esp:", save->sysenter_esp,
4134               "sysenter_eip:", save->sysenter_eip);
4135        pr_err("%-15s %016llx %-13s %016llx\n",
4136               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
4137        pr_err("%-15s %016llx %-13s %016llx\n",
4138               "br_from:", save->br_from, "br_to:", save->br_to);
4139        pr_err("%-15s %016llx %-13s %016llx\n",
4140               "excp_from:", save->last_excp_from,
4141               "excp_to:", save->last_excp_to);
4142}
4143
4144static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
4145{
4146        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
4147
4148        *info1 = control->exit_info_1;
4149        *info2 = control->exit_info_2;
4150}
4151
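    /*
     * Top-level #VMEXIT handler: let a nested hypervisor claim the exit
     * first, complete any pending event injection and then dispatch to the
     * handler in svm_exit_handlers[].
     */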
4152static int handle_exit(struct kvm_vcpu *vcpu)
4153{
4154        struct vcpu_svm *svm = to_svm(vcpu);
4155        struct kvm_run *kvm_run = vcpu->run;
4156        u32 exit_code = svm->vmcb->control.exit_code;
4157
4158        trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
4159
4160        vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
4161
4162        if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
4163                vcpu->arch.cr0 = svm->vmcb->save.cr0;
4164        if (npt_enabled)
4165                vcpu->arch.cr3 = svm->vmcb->save.cr3;
4166
4167        if (unlikely(svm->nested.exit_required)) {
4168                nested_svm_vmexit(svm);
4169                svm->nested.exit_required = false;
4170
4171                return 1;
4172        }
4173
4174        if (is_guest_mode(vcpu)) {
4175                int vmexit;
4176
4177                trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
4178                                        svm->vmcb->control.exit_info_1,
4179                                        svm->vmcb->control.exit_info_2,
4180                                        svm->vmcb->control.exit_int_info,
4181                                        svm->vmcb->control.exit_int_info_err,
4182                                        KVM_ISA_SVM);
4183
4184                vmexit = nested_svm_exit_special(svm);
4185
4186                if (vmexit == NESTED_EXIT_CONTINUE)
4187                        vmexit = nested_svm_exit_handled(svm);
4188
4189                if (vmexit == NESTED_EXIT_DONE)
4190                        return 1;
4191        }
4192
4193        svm_complete_interrupts(svm);
4194
4195        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
4196                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4197                kvm_run->fail_entry.hardware_entry_failure_reason
4198                        = svm->vmcb->control.exit_code;
4199                pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
4200                dump_vmcb(vcpu);
4201                return 0;
4202        }
4203
4204        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
4205            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
4206            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
4207            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
4208                printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
4209                       "exit_code 0x%x\n",
4210                       __func__, svm->vmcb->control.exit_int_info,
4211                       exit_code);
4212
4213        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
4214            || !svm_exit_handlers[exit_code]) {
4215                WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
4216                kvm_queue_exception(vcpu, UD_VECTOR);
4217                return 1;
4218        }
4219
4220        return svm_exit_handlers[exit_code](svm);
4221}
4222
4223static void reload_tss(struct kvm_vcpu *vcpu)
4224{
4225        int cpu = raw_smp_processor_id();
4226
4227        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4228        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
4229        load_TR_desc();
4230}
4231
4232static void pre_svm_run(struct vcpu_svm *svm)
4233{
4234        int cpu = raw_smp_processor_id();
4235
4236        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4237
4238        /* FIXME: handle wraparound of asid_generation */
4239        if (svm->asid_generation != sd->asid_generation)
4240                new_asid(svm, sd);
4241}
4242
4243static void svm_inject_nmi(struct kvm_vcpu *vcpu)
4244{
4245        struct vcpu_svm *svm = to_svm(vcpu);
4246
4247        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
4248        vcpu->arch.hflags |= HF_NMI_MASK;
4249        set_intercept(svm, INTERCEPT_IRET);
4250        ++vcpu->stat.nmi_injections;
4251}
4252
4253static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
4254{
4255        struct vmcb_control_area *control;
4256
4257        /* The following fields are ignored when AVIC is enabled */
4258        control = &svm->vmcb->control;
4259        control->int_vector = irq;
4260        control->int_ctl &= ~V_INTR_PRIO_MASK;
4261        control->int_ctl |= V_IRQ_MASK |
4262                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
4263        mark_dirty(svm->vmcb, VMCB_INTR);
4264}
4265
4266static void svm_set_irq(struct kvm_vcpu *vcpu)
4267{
4268        struct vcpu_svm *svm = to_svm(vcpu);
4269
4270        BUG_ON(!(gif_set(svm)));
4271
4272        trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
4273        ++vcpu->stat.irq_injections;
4274
4275        svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
4276                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
4277}
4278
4279static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
4280{
4281        return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
4282}
4283
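    /*
     * Keep the CR8-write intercept only while the TPR (tpr) still masks the
     * highest pending interrupt (irr); it is not needed with a nested vintr
     * or when APIC virtualization handles the TPR in hardware.
     */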
4284static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
4285{
4286        struct vcpu_svm *svm = to_svm(vcpu);
4287
4288        if (svm_nested_virtualize_tpr(vcpu) ||
4289            kvm_vcpu_apicv_active(vcpu))
4290                return;
4291
4292        clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4293
4294        if (irr == -1)
4295                return;
4296
4297        if (tpr >= irr)
4298                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4299}
4300
4301static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
4302{
4303        return;
4304}
4305
4306static bool svm_get_enable_apicv(void)
4307{
4308        return avic;
4309}
4310
4311static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
4312{
4313}
4314
4315static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
4316{
4317}
4318
4319/* Note: Currently only used by Hyper-V. */
4320static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4321{
4322        struct vcpu_svm *svm = to_svm(vcpu);
4323        struct vmcb *vmcb = svm->vmcb;
4324
4325        if (!avic)
4326                return;
4327
4328        vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
4329        mark_dirty(vmcb, VMCB_INTR);
4330}
4331
4332static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
4333{
4334        return;
4335}
4336
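    /*
     * Deliver an interrupt through AVIC: set the vector in the IRR, then
     * ring the doorbell on the target physical CPU if the vCPU is running,
     * or wake the vCPU up otherwise.
     */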
4337static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
4338{
4339        kvm_lapic_set_irr(vec, vcpu->arch.apic);
4340        smp_mb__after_atomic();
4341
4342        if (avic_vcpu_is_running(vcpu))
4343                wrmsrl(SVM_AVIC_DOORBELL,
4344                       kvm_cpu_get_apicid(vcpu->cpu));
4345        else
4346                kvm_vcpu_wake_up(vcpu);
4347}
4348
4349static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
4350{
4351        unsigned long flags;
4352        struct amd_svm_iommu_ir *cur;
4353
4354        spin_lock_irqsave(&svm->ir_list_lock, flags);
4355        list_for_each_entry(cur, &svm->ir_list, node) {
4356                if (cur->data != pi->ir_data)
4357                        continue;
4358                list_del(&cur->node);
4359                kfree(cur);
4360                break;
4361        }
4362        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
4363}
4364
4365static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
4366{
4367        int ret = 0;
4368        unsigned long flags;
4369        struct amd_svm_iommu_ir *ir;
4370
4371        /**
4372         * In some cases, the existing irte is updated and re-set,
4373         * so we need to check here if it's already been added
4374         * to the ir_list.
4375         */
4376        if (pi->ir_data && (pi->prev_ga_tag != 0)) {
4377                struct kvm *kvm = svm->vcpu.kvm;
4378                u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
4379                struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
4380                struct vcpu_svm *prev_svm;
4381
4382                if (!prev_vcpu) {
4383                        ret = -EINVAL;
4384                        goto out;
4385                }
4386
4387                prev_svm = to_svm(prev_vcpu);
4388                svm_ir_list_del(prev_svm, pi);
4389        }
4390
4391        /**
4392         * Allocate a new amd_svm_iommu_ir entry, which will be
4393         * added to the per-vcpu ir_list.
4394         */
4395        ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
4396        if (!ir) {
4397                ret = -ENOMEM;
4398                goto out;
4399        }
4400        ir->data = pi->ir_data;
4401
4402        spin_lock_irqsave(&svm->ir_list_lock, flags);
4403        list_add(&ir->node, &svm->ir_list);
4404        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
4405out:
4406        return ret;
4407}
4408
4409/**
4410 * Note:
4411 * The HW cannot support posting multicast/broadcast
4412 * interrupts to a vCPU. So, we still use legacy interrupt
4413 * remapping for these kinds of interrupts.
4414 *
4415 * For lowest-priority interrupts, we only support
4416 * those with a single CPU as the destination, e.g. the user
4417 * configures the interrupts via /proc/irq or uses
4418 * irqbalance to make the interrupts single-CPU.
4419 */
4420static int
4421get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
4422                 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
4423{
4424        struct kvm_lapic_irq irq;
4425        struct kvm_vcpu *vcpu = NULL;
4426
4427        kvm_set_msi_irq(kvm, e, &irq);
4428
4429        if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
4430                pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
4431                         __func__, irq.vector);
4432                return -1;
4433        }
4434
4435        pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
4436                 irq.vector);
4437        *svm = to_svm(vcpu);
4438        vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
4439        vcpu_info->vector = irq.vector;
4440
4441        return 0;
4442}
4443
4444/*
4445 * svm_update_pi_irte - set IRTE for Posted-Interrupts
4446 *
4447 * @kvm: kvm
4448 * @host_irq: host irq of the interrupt
4449 * @guest_irq: gsi of the interrupt
4450 * @set: set or unset PI
4451 * returns 0 on success, < 0 on failure
4452 */
4453static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
4454                              uint32_t guest_irq, bool set)
4455{
4456        struct kvm_kernel_irq_routing_entry *e;
4457        struct kvm_irq_routing_table *irq_rt;
4458        int idx, ret = -EINVAL;
4459
4460        if (!kvm_arch_has_assigned_device(kvm) ||
4461            !irq_remapping_cap(IRQ_POSTING_CAP))
4462                return 0;
4463
4464        pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
4465                 __func__, host_irq, guest_irq, set);
4466
4467        idx = srcu_read_lock(&kvm->irq_srcu);
4468        irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
4469        WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
4470
4471        hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
4472                struct vcpu_data vcpu_info;
4473                struct vcpu_svm *svm = NULL;
4474
4475                if (e->type != KVM_IRQ_ROUTING_MSI)
4476                        continue;
4477
4478                /**
4479                 * Here, we set up legacy mode in the following cases:
4480                 * 1. When the interrupt cannot be targeted to a specific vcpu.
4481                 * 2. When unsetting the posted interrupt.
4482                 * 3. When APIC virtualization is disabled for the vcpu.
4483                 */
4484                if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
4485                    kvm_vcpu_apicv_active(&svm->vcpu)) {
4486                        struct amd_iommu_pi_data pi;
4487
4488                        /* Try to enable guest_mode in IRTE */
4489                        pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
4490                        pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
4491                                                     svm->vcpu.vcpu_id);
4492                        pi.is_guest_mode = true;
4493                        pi.vcpu_data = &vcpu_info;
4494                        ret = irq_set_vcpu_affinity(host_irq, &pi);
4495
4496                        /**
4497                         * Here, we have successfully set up vcpu affinity in
4498                         * IOMMU guest mode. Now, we need to store the posted
4499                         * interrupt information in a per-vcpu ir_list so that
4500                         * we can reference it directly when we update vcpu
4501                         * scheduling information in the IOMMU irte.
4502                         */
4503                        if (!ret && pi.is_guest_mode)
4504                                svm_ir_list_add(svm, &pi);
4505                } else {
4506                        /* Use legacy mode in IRTE */
4507                        struct amd_iommu_pi_data pi;
4508
4509                        /**
4510                         * Here, pi is used to:
4511                         * - Tell IOMMU to use legacy mode for this interrupt.
4512                         * - Retrieve ga_tag of prior interrupt remapping data.
4513                         */
4514                        pi.is_guest_mode = false;
4515                        ret = irq_set_vcpu_affinity(host_irq, &pi);
4516
4517                        /**
4518                         * Check if the posted interrupt was previously
4519                         * set up in guest_mode by checking if the ga_tag
4520                         * was cached. If so, we need to clean up the per-vcpu
4521                         * ir_list.
4522                         */
4523                        if (!ret && pi.prev_ga_tag) {
4524                                int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
4525                                struct kvm_vcpu *vcpu;
4526
4527                                vcpu = kvm_get_vcpu_by_id(kvm, id);
4528                                if (vcpu)
4529                                        svm_ir_list_del(to_svm(vcpu), &pi);
4530                        }
4531                }
4532
4533                if (!ret && svm) {
4534                        trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
4535                                                 host_irq, e->gsi,
4536                                                 vcpu_info.vector,
4537                                                 vcpu_info.pi_desc_addr, set);
4538                }
4539
4540                if (ret < 0) {
4541                        pr_err("%s: failed to update PI IRTE\n", __func__);
4542                        goto out;
4543                }
4544        }
4545
4546        ret = 0;
4547out:
4548        srcu_read_unlock(&kvm->irq_srcu, idx);
4549        return ret;
4550}
4551
4552static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
4553{
4554        struct vcpu_svm *svm = to_svm(vcpu);
4555        struct vmcb *vmcb = svm->vmcb;
4556        int ret;
4557        ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
4558              !(svm->vcpu.arch.hflags & HF_NMI_MASK);
4559        ret = ret && gif_set(svm) && nested_svm_nmi(svm);
4560
4561        return ret;
4562}
4563
4564static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
4565{
4566        struct vcpu_svm *svm = to_svm(vcpu);
4567
4568        return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
4569}
4570
4571static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4572{
4573        struct vcpu_svm *svm = to_svm(vcpu);
4574
4575        if (masked) {
4576                svm->vcpu.arch.hflags |= HF_NMI_MASK;
4577                set_intercept(svm, INTERCEPT_IRET);
4578        } else {
4579                svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
4580                clr_intercept(svm, INTERCEPT_IRET);
4581        }
4582}
4583
4584static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
4585{
4586        struct vcpu_svm *svm = to_svm(vcpu);
4587        struct vmcb *vmcb = svm->vmcb;
4588        int ret;
4589
4590        if (!gif_set(svm) ||
4591             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
4592                return 0;
4593
4594        ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
4595
4596        if (is_guest_mode(vcpu))
4597                return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
4598
4599        return ret;
4600}
4601
4602static void enable_irq_window(struct kvm_vcpu *vcpu)
4603{
4604        struct vcpu_svm *svm = to_svm(vcpu);
4605
4606        if (kvm_vcpu_apicv_active(vcpu))
4607                return;
4608
4609        /*
4610         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
4611         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
4612         * get that intercept, this function will be called again though and
4613         * we'll get the vintr intercept.
4614         */
4615        if (gif_set(svm) && nested_svm_intr(svm)) {
4616                svm_set_vintr(svm);
4617                svm_inject_irq(svm, 0x0);
4618        }
4619}
4620
4621static void enable_nmi_window(struct kvm_vcpu *vcpu)
4622{
4623        struct vcpu_svm *svm = to_svm(vcpu);
4624
4625        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
4626            == HF_NMI_MASK)
4627                return; /* IRET will cause a vm exit */
4628
4629        /*
4630         * Something prevents the NMI from being injected. Single-step over the
4631         * possible problem (IRET, exception injection or interrupt shadow).
4632         */
4633        svm->nmi_singlestep = true;
4634        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
4635}
4636
4637static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
4638{
4639        return 0;
4640}
4641
4642static void svm_flush_tlb(struct kvm_vcpu *vcpu)
4643{
4644        struct vcpu_svm *svm = to_svm(vcpu);
4645
4646        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
4647                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4648        else
4649                svm->asid_generation--;
4650}
4651
4652static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
4653{
4654}
4655
4656static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
4657{
4658        struct vcpu_svm *svm = to_svm(vcpu);
4659
4660        if (svm_nested_virtualize_tpr(vcpu))
4661                return;
4662
4663        if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
4664                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
4665                kvm_set_cr8(vcpu, cr8);
4666        }
4667}
4668
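    /*
     * Before VMRUN, mirror the LAPIC TPR (CR8) into the V_TPR field of
     * int_ctl; skipped when the TPR is virtualized for a nested guest or
     * handled by AVIC.
     */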
4669static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
4670{
4671        struct vcpu_svm *svm = to_svm(vcpu);
4672        u64 cr8;
4673
4674        if (svm_nested_virtualize_tpr(vcpu) ||
4675            kvm_vcpu_apicv_active(vcpu))
4676                return;
4677
4678        cr8 = kvm_get_cr8(vcpu);
4679        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
4680        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
4681}
4682
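    /*
     * If an event was being delivered when the #VMEXIT occurred, it is
     * described by EXITINTINFO; re-queue it so it gets re-injected on the
     * next entry (software exceptions are re-executed instead).  This also
     * drops NMI masking once the guest has made progress past an
     * intercepted IRET.
     */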
4683static void svm_complete_interrupts(struct vcpu_svm *svm)
4684{
4685        u8 vector;
4686        int type;
4687        u32 exitintinfo = svm->vmcb->control.exit_int_info;
4688        unsigned int3_injected = svm->int3_injected;
4689
4690        svm->int3_injected = 0;
4691
4692        /*
4693         * If we've made progress since setting HF_IRET_MASK, we've
4694         * executed an IRET and can allow NMI injection.
4695         */
4696        if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
4697            && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
4698                svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
4699                kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4700        }
4701
4702        svm->vcpu.arch.nmi_injected = false;
4703        kvm_clear_exception_queue(&svm->vcpu);
4704        kvm_clear_interrupt_queue(&svm->vcpu);
4705
4706        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
4707                return;
4708
4709        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4710
4711        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
4712        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
4713
4714        switch (type) {
4715        case SVM_EXITINTINFO_TYPE_NMI:
4716                svm->vcpu.arch.nmi_injected = true;
4717                break;
4718        case SVM_EXITINTINFO_TYPE_EXEPT:
4719                /*
4720                 * In case of software exceptions, do not reinject the vector,
4721                 * but re-execute the instruction instead. Rewind RIP first
4722                 * if we emulated INT3 before.
4723                 */
4724                if (kvm_exception_is_soft(vector)) {
4725                        if (vector == BP_VECTOR && int3_injected &&
4726                            kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
4727                                kvm_rip_write(&svm->vcpu,
4728                                              kvm_rip_read(&svm->vcpu) -
4729                                              int3_injected);
4730                        break;
4731                }
4732                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
4733                        u32 err = svm->vmcb->control.exit_int_info_err;
4734                        kvm_requeue_exception_e(&svm->vcpu, vector, err);
4735
4736                } else
4737                        kvm_requeue_exception(&svm->vcpu, vector);
4738                break;
4739        case SVM_EXITINTINFO_TYPE_INTR:
4740                kvm_queue_interrupt(&svm->vcpu, vector, false);
4741                break;
4742        default:
4743                break;
4744        }
4745}
4746
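    /*
     * Back out an event that was programmed into EVENTINJ but not delivered
     * because the entry was abandoned: hand it to svm_complete_interrupts()
     * via EXITINTINFO so it is re-queued for the next entry.
     */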
4747static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4748{
4749        struct vcpu_svm *svm = to_svm(vcpu);
4750        struct vmcb_control_area *control = &svm->vmcb->control;
4751
4752        control->exit_int_info = control->event_inj;
4753        control->exit_int_info_err = control->event_inj_err;
4754        control->event_inj = 0;
4755        svm_complete_interrupts(svm);
4756}
4757
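    /*
     * The main guest-entry path: sync RAX/RSP/RIP and CR2 into the VMCB,
     * clear GIF so physical interrupts stay pending across the world
     * switch, run VMLOAD/VMRUN/VMSAVE with the remaining GPRs handled by
     * the asm block, then restore host state, set GIF again and pull the
     * guest state back out for the exit handlers.
     */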
4758static void svm_vcpu_run(struct kvm_vcpu *vcpu)
4759{
4760        struct vcpu_svm *svm = to_svm(vcpu);
4761
4762        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4763        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4764        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4765
4766        /*
4767         * A vmexit emulation is required before the vcpu can be executed
4768         * again.
4769         */
4770        if (unlikely(svm->nested.exit_required))
4771                return;
4772
4773        pre_svm_run(svm);
4774
4775        sync_lapic_to_cr8(vcpu);
4776
4777        svm->vmcb->save.cr2 = vcpu->arch.cr2;
4778
4779        clgi();
4780
4781        local_irq_enable();
4782
4783        asm volatile (
4784                "push %%" _ASM_BP "; \n\t"
4785                "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
4786                "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
4787                "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
4788                "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
4789                "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
4790                "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
4791#ifdef CONFIG_X86_64
4792                "mov %c[r8](%[svm]),  %%r8  \n\t"
4793                "mov %c[r9](%[svm]),  %%r9  \n\t"
4794                "mov %c[r10](%[svm]), %%r10 \n\t"
4795                "mov %c[r11](%[svm]), %%r11 \n\t"
4796                "mov %c[r12](%[svm]), %%r12 \n\t"
4797                "mov %c[r13](%[svm]), %%r13 \n\t"
4798                "mov %c[r14](%[svm]), %%r14 \n\t"
4799                "mov %c[r15](%[svm]), %%r15 \n\t"
4800#endif
4801
4802                /* Enter guest mode */
4803                "push %%" _ASM_AX " \n\t"
4804                "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
4805                __ex(SVM_VMLOAD) "\n\t"
4806                __ex(SVM_VMRUN) "\n\t"
4807                __ex(SVM_VMSAVE) "\n\t"
4808                "pop %%" _ASM_AX " \n\t"
4809
4810                /* Save guest registers, load host registers */
4811                "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
4812                "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
4813                "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
4814                "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
4815                "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
4816                "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
4817#ifdef CONFIG_X86_64
4818                "mov %%r8,  %c[r8](%[svm]) \n\t"
4819                "mov %%r9,  %c[r9](%[svm]) \n\t"
4820                "mov %%r10, %c[r10](%[svm]) \n\t"
4821                "mov %%r11, %c[r11](%[svm]) \n\t"
4822                "mov %%r12, %c[r12](%[svm]) \n\t"
4823                "mov %%r13, %c[r13](%[svm]) \n\t"
4824                "mov %%r14, %c[r14](%[svm]) \n\t"
4825                "mov %%r15, %c[r15](%[svm]) \n\t"
4826#endif
4827                "pop %%" _ASM_BP
4828                :
4829                : [svm]"a"(svm),
4830                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
4831                  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
4832                  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
4833                  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
4834                  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
4835                  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
4836                  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
4837#ifdef CONFIG_X86_64
4838                  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
4839                  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
4840                  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
4841                  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
4842                  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
4843                  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
4844                  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
4845                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
4846#endif
4847                : "cc", "memory"
4848#ifdef CONFIG_X86_64
4849                , "rbx", "rcx", "rdx", "rsi", "rdi"
4850                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
4851#else
4852                , "ebx", "ecx", "edx", "esi", "edi"
4853#endif
4854                );
4855
4856#ifdef CONFIG_X86_64
4857        wrmsrl(MSR_GS_BASE, svm->host.gs_base);
4858#else
4859        loadsegment(fs, svm->host.fs);
4860#ifndef CONFIG_X86_32_LAZY_GS
4861        loadsegment(gs, svm->host.gs);
4862#endif
4863#endif
4864
4865        reload_tss(vcpu);
4866
4867        local_irq_disable();
4868
4869        vcpu->arch.cr2 = svm->vmcb->save.cr2;
4870        vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4871        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4872        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4873
4874        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4875                kvm_before_handle_nmi(&svm->vcpu);
4876
4877        stgi();
4878
4879        /* Any pending NMI will happen here */
4880
4881        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4882                kvm_after_handle_nmi(&svm->vcpu);
4883
4884        sync_cr8_to_lapic(vcpu);
4885
4886        svm->next_rip = 0;
4887
4888        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4889
4890        /* If the exit was due to a #PF, check whether it was an async page fault */
4891        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4892                svm->apf_reason = kvm_read_and_reset_pf_reason();
4893
4894        if (npt_enabled) {
4895                vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
4896                vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
4897        }
4898
4899        /*
4900         * We need to handle MC intercepts here before the vcpu has a chance to
4901         * change the physical cpu
4902         */
4903        if (unlikely(svm->vmcb->control.exit_code ==
4904                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
4905                svm_handle_mce(svm);
4906
4907        mark_all_clean(svm->vmcb);
4908}
4909
4910static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4911{
4912        struct vcpu_svm *svm = to_svm(vcpu);
4913
4914        svm->vmcb->save.cr3 = root;
4915        mark_dirty(svm->vmcb, VMCB_CR);
4916        svm_flush_tlb(vcpu);
4917}
4918
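    /*
     * With nested paging, "root" is the root of the NPT page tables
     * (nested CR3); the guest-visible CR3 is tracked separately in the
     * VMCB save area.
     */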
4919static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
4920{
4921        struct vcpu_svm *svm = to_svm(vcpu);
4922
4923        svm->vmcb->control.nested_cr3 = root;
4924        mark_dirty(svm->vmcb, VMCB_NPT);
4925
4926        /* Also sync guest cr3 here in case we live migrate */
4927        svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
4928        mark_dirty(svm->vmcb, VMCB_CR);
4929
4930        svm_flush_tlb(vcpu);
4931}
4932
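    /* SVM can be disabled by the BIOS via the SVMDIS bit in the VM_CR MSR. */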
4933static int is_disabled(void)
4934{
4935        u64 vm_cr;
4936
4937        rdmsrl(MSR_VM_CR, vm_cr);
4938        if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4939                return 1;
4940
4941        return 0;
4942}
4943
4944static void
4945svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4946{
4947        /*
4948         * Patch in the VMMCALL instruction:
4949         */
4950        hypercall[0] = 0x0f;
4951        hypercall[1] = 0x01;
4952        hypercall[2] = 0xd9;
4953}
4954
4955static void svm_check_processor_compat(void *rtn)
4956{
4957        *(int *)rtn = 0;
4958}
4959
4960static bool svm_cpu_has_accelerated_tpr(void)
4961{
4962        return false;
4963}
4964
4965static bool svm_has_high_real_mode_segbase(void)
4966{
4967        return true;
4968}
4969
4970static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4971{
4972        return 0;
4973}
4974
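    /*
     * Cache whether the guest can use NRIPS and, when AVIC is in use, hide
     * x2APIC from the guest's CPUID since AVIC does not support x2APIC mode.
     */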
4975static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4976{
4977        struct vcpu_svm *svm = to_svm(vcpu);
4978        struct kvm_cpuid_entry2 *entry;
4979
4980        /* Update nrips enabled cache */
4981        svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
4982
4983        if (!kvm_vcpu_apicv_active(vcpu))
4984                return;
4985
4986        entry = kvm_find_cpuid_entry(vcpu, 1, 0);
4987        if (entry)
4988                entry->ecx &= ~bit(X86_FEATURE_X2APIC);
4989}
4990
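    /*
     * Adjust the CPUID bits KVM advertises: hide x2APIC when AVIC is
     * enabled, expose the SVM bit when nesting is allowed, and fill in the
     * nested SVM feature leaf (0x8000000A).
     */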
4991static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4992{
4993        switch (func) {
4994        case 0x1:
4995                if (avic)
4996                        entry->ecx &= ~bit(X86_FEATURE_X2APIC);
4997                break;
4998        case 0x80000001:
4999                if (nested)
5000                        entry->ecx |= (1 << 2); /* Set SVM bit */
5001                break;
5002        case 0x8000000A:
5003                entry->eax = 1; /* SVM revision 1 */
5004                entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
5005                                   ASID emulation to nested SVM */
5006                entry->ecx = 0; /* Reserved */
5007                entry->edx = 0; /* By default, do not support any
5008                                   additional features */
5009
5010                /* Support next_rip if host supports it */
5011                if (boot_cpu_has(X86_FEATURE_NRIPS))
5012                        entry->edx |= SVM_FEATURE_NRIP;
5013
5014                /* Support NPT for the guest if enabled */
5015                if (npt_enabled)
5016                        entry->edx |= SVM_FEATURE_NPT;
5017
5018                break;
5019        }
5020}
5021
5022static int svm_get_lpage_level(void)
5023{
5024        return PT_PDPE_LEVEL;
5025}
5026
5027static bool svm_rdtscp_supported(void)
5028{
5029        return boot_cpu_has(X86_FEATURE_RDTSCP);
5030}
5031
5032static bool svm_invpcid_supported(void)
5033{
5034        return false;
5035}
5036
5037static bool svm_mpx_supported(void)
5038{
5039        return false;
5040}
5041
5042static bool svm_xsaves_supported(void)
5043{
5044        return false;
5045}
5046
5047static bool svm_has_wbinvd_exit(void)
5048{
5049        return true;
5050}
5051
5052#define PRE_EX(exit)  { .exit_code = (exit), \
5053                        .stage = X86_ICPT_PRE_EXCEPT, }
5054#define POST_EX(exit) { .exit_code = (exit), \
5055                        .stage = X86_ICPT_POST_EXCEPT, }
5056#define POST_MEM(exit) { .exit_code = (exit), \
5057                        .stage = X86_ICPT_POST_MEMACCESS, }
5058
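    /*
     * Map each emulator intercept to the corresponding SVM exit code and
     * the emulation stage at which the intercept check must be performed.
     */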
5059static const struct __x86_intercept {
5060        u32 exit_code;
5061        enum x86_intercept_stage stage;
5062} x86_intercept_map[] = {
5063        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
5064        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
5065        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
5066        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
5067        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
5068        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
5069        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
5070        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
5071        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
5072        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
5073        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
5074        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
5075        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
5076        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
5077        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
5078        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
5079        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
5080        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
5081        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
5082        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
5083        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
5084        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
5085        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
5086        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
5087        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
5088        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
5089        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
5090        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
5091        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
5092        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
5093        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
5094        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
5095        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
5096        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
5097        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
5098        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
5099        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
5100        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
5101        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
5102        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
5103        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
5104        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
5105        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
5106        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
5107        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
5108        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
5109};
5110
5111#undef PRE_EX
5112#undef POST_EX
5113#undef POST_MEM
5114
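    /*
     * Called from the instruction emulator while a nested guest is running:
     * translate the emulated instruction into the SVM exit code L1 would
     * see (fixing up CR/DR numbers, MSR direction and IOIO details), then
     * ask the nested exit logic whether L1 intercepts it.
     */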
5115static int svm_check_intercept(struct kvm_vcpu *vcpu,
5116                               struct x86_instruction_info *info,
5117                               enum x86_intercept_stage stage)
5118{
5119        struct vcpu_svm *svm = to_svm(vcpu);
5120        int vmexit, ret = X86EMUL_CONTINUE;
5121        struct __x86_intercept icpt_info;
5122        struct vmcb *vmcb = svm->vmcb;
5123
5124        if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
5125                goto out;
5126
5127        icpt_info = x86_intercept_map[info->intercept];
5128
5129        if (stage != icpt_info.stage)
5130                goto out;
5131
5132        switch (icpt_info.exit_code) {
5133        case SVM_EXIT_READ_CR0:
5134                if (info->intercept == x86_intercept_cr_read)
5135                        icpt_info.exit_code += info->modrm_reg;
5136                break;
5137        case SVM_EXIT_WRITE_CR0: {
5138                unsigned long cr0, val;
5139                u64 intercept;
5140
5141                if (info->intercept == x86_intercept_cr_write)
5142                        icpt_info.exit_code += info->modrm_reg;
5143
5144                if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
5145                    info->intercept == x86_intercept_clts)
5146                        break;
5147
5148                intercept = svm->nested.intercept;
5149
5150                if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
5151                        break;
5152
5153                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
5154                val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
5155
5156                if (info->intercept == x86_intercept_lmsw) {
5157                        cr0 &= 0xfUL;
5158                        val &= 0xfUL;
5159                        /* lmsw can't clear PE - catch this here */
5160                        if (cr0 & X86_CR0_PE)
5161                                val |= X86_CR0_PE;
5162                }
5163
5164                if (cr0 ^ val)
5165                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
5166
5167                break;
5168        }
5169        case SVM_EXIT_READ_DR0:
5170        case SVM_EXIT_WRITE_DR0:
5171                icpt_info.exit_code += info->modrm_reg;
5172                break;
5173        case SVM_EXIT_MSR:
5174                if (info->intercept == x86_intercept_wrmsr)
5175                        vmcb->control.exit_info_1 = 1;
5176                else
5177                        vmcb->control.exit_info_1 = 0;
5178                break;
5179        case SVM_EXIT_PAUSE:
5180                /*
5181                 * We only get this intercept for NOP, but PAUSE is
5182                 * encoded as REP NOP, so check the REP prefix here.
5183                 */
5184                if (info->rep_prefix != REPE_PREFIX)
5185                        goto out;
                    /* Avoid falling through into the IOIO handling below. */
                    break;
5186        case SVM_EXIT_IOIO: {
5187                u64 exit_info;
5188                u32 bytes;
5189
5190                if (info->intercept == x86_intercept_in ||
5191                    info->intercept == x86_intercept_ins) {
5192                        exit_info = ((info->src_val & 0xffff) << 16) |
5193                                SVM_IOIO_TYPE_MASK;
5194                        bytes = info->dst_bytes;
5195                } else {
5196                        exit_info = (info->dst_val & 0xffff) << 16;
5197                        bytes = info->src_bytes;
5198                }
5199
5200                if (info->intercept == x86_intercept_outs ||
5201                    info->intercept == x86_intercept_ins)
5202                        exit_info |= SVM_IOIO_STR_MASK;
5203
5204                if (info->rep_prefix)
5205                        exit_info |= SVM_IOIO_REP_MASK;
5206
5207                bytes = min(bytes, 4u);
5208
5209                exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
5210
5211                exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
5212
5213                vmcb->control.exit_info_1 = exit_info;
5214                vmcb->control.exit_info_2 = info->next_rip;
5215
5216                break;
5217        }
5218        default:
5219                break;
5220        }
5221
5222        /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
5223        if (static_cpu_has(X86_FEATURE_NRIPS))
5224                vmcb->control.next_rip  = info->next_rip;
5225        vmcb->control.exit_code = icpt_info.exit_code;
5226        vmexit = nested_svm_exit_handled(svm);
5227
5228        ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
5229                                           : X86EMUL_CONTINUE;
5230
5231out:
5232        return ret;
5233}
5234
5235static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
5236{
5237        local_irq_enable();
5238        /*
5239         * Execute at least one instruction with interrupts enabled so that
5240         * a pending timer interrupt is not held off by the interrupt shadow.
5241         */
5242        asm("nop");
5243        local_irq_disable();
5244}
5245
5246static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
5247{
5248}
5249
5250static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
5251{
5252        if (avic_handle_apic_id_update(vcpu) != 0)
5253                return;
5254        if (avic_handle_dfr_update(vcpu) != 0)
5255                return;
5256        avic_handle_ldr_update(vcpu);
5257}
5258
5259static void svm_setup_mce(struct kvm_vcpu *vcpu)
5260{
5261        /* [63:9] are reserved. */
5262        vcpu->arch.mcg_cap &= 0x1ff;
5263}
5264
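    /*
     * The kvm_x86_ops implementation for AMD SVM, registered with the KVM
     * core by svm_init() below.
     */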
5265static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
5266        .cpu_has_kvm_support = has_svm,
5267        .disabled_by_bios = is_disabled,
5268        .hardware_setup = svm_hardware_setup,
5269        .hardware_unsetup = svm_hardware_unsetup,
5270        .check_processor_compatibility = svm_check_processor_compat,
5271        .hardware_enable = svm_hardware_enable,
5272        .hardware_disable = svm_hardware_disable,
5273        .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
5274        .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
5275
5276        .vcpu_create = svm_create_vcpu,
5277        .vcpu_free = svm_free_vcpu,
5278        .vcpu_reset = svm_vcpu_reset,
5279
5280        .vm_init = avic_vm_init,
5281        .vm_destroy = avic_vm_destroy,
5282
5283        .prepare_guest_switch = svm_prepare_guest_switch,
5284        .vcpu_load = svm_vcpu_load,
5285        .vcpu_put = svm_vcpu_put,
5286        .vcpu_blocking = svm_vcpu_blocking,
5287        .vcpu_unblocking = svm_vcpu_unblocking,
5288
5289        .update_bp_intercept = update_bp_intercept,
5290        .get_msr = svm_get_msr,
5291        .set_msr = svm_set_msr,
5292        .get_segment_base = svm_get_segment_base,
5293        .get_segment = svm_get_segment,
5294        .set_segment = svm_set_segment,
5295        .get_cpl = svm_get_cpl,
5296        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
5297        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
5298        .decache_cr3 = svm_decache_cr3,
5299        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
5300        .set_cr0 = svm_set_cr0,
5301        .set_cr3 = svm_set_cr3,
5302        .set_cr4 = svm_set_cr4,
5303        .set_efer = svm_set_efer,
5304        .get_idt = svm_get_idt,
5305        .set_idt = svm_set_idt,
5306        .get_gdt = svm_get_gdt,
5307        .set_gdt = svm_set_gdt,
5308        .get_dr6 = svm_get_dr6,
5309        .set_dr6 = svm_set_dr6,
5310        .set_dr7 = svm_set_dr7,
5311        .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
5312        .cache_reg = svm_cache_reg,
5313        .get_rflags = svm_get_rflags,
5314        .set_rflags = svm_set_rflags,
5315
5316        .get_pkru = svm_get_pkru,
5317
5318        .tlb_flush = svm_flush_tlb,
5319
5320        .run = svm_vcpu_run,
5321        .handle_exit = handle_exit,
5322        .skip_emulated_instruction = skip_emulated_instruction,
5323        .set_interrupt_shadow = svm_set_interrupt_shadow,
5324        .get_interrupt_shadow = svm_get_interrupt_shadow,
5325        .patch_hypercall = svm_patch_hypercall,
5326        .set_irq = svm_set_irq,
5327        .set_nmi = svm_inject_nmi,
5328        .queue_exception = svm_queue_exception,
5329        .cancel_injection = svm_cancel_injection,
5330        .interrupt_allowed = svm_interrupt_allowed,
5331        .nmi_allowed = svm_nmi_allowed,
5332        .get_nmi_mask = svm_get_nmi_mask,
5333        .set_nmi_mask = svm_set_nmi_mask,
5334        .enable_nmi_window = enable_nmi_window,
5335        .enable_irq_window = enable_irq_window,
5336        .update_cr8_intercept = update_cr8_intercept,
5337        .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
5338        .get_enable_apicv = svm_get_enable_apicv,
5339        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
5340        .load_eoi_exitmap = svm_load_eoi_exitmap,
5341        .hwapic_irr_update = svm_hwapic_irr_update,
5342        .hwapic_isr_update = svm_hwapic_isr_update,
5343        .apicv_post_state_restore = avic_post_state_restore,
5344
5345        .set_tss_addr = svm_set_tss_addr,
5346        .get_tdp_level = get_npt_level,
5347        .get_mt_mask = svm_get_mt_mask,
5348
5349        .get_exit_info = svm_get_exit_info,
5350
5351        .get_lpage_level = svm_get_lpage_level,
5352
5353        .cpuid_update = svm_cpuid_update,
5354
5355        .rdtscp_supported = svm_rdtscp_supported,
5356        .invpcid_supported = svm_invpcid_supported,
5357        .mpx_supported = svm_mpx_supported,
5358        .xsaves_supported = svm_xsaves_supported,
5359
5360        .set_supported_cpuid = svm_set_supported_cpuid,
5361
5362        .has_wbinvd_exit = svm_has_wbinvd_exit,
5363
5364        .write_tsc_offset = svm_write_tsc_offset,
5365
5366        .set_tdp_cr3 = set_tdp_cr3,
5367
5368        .check_intercept = svm_check_intercept,
5369        .handle_external_intr = svm_handle_external_intr,
5370
5371        .sched_in = svm_sched_in,
5372
5373        .pmu_ops = &amd_pmu_ops,
5374        .deliver_posted_interrupt = svm_deliver_avic_intr,
5375        .update_pi_irte = svm_update_pi_irte,
5376        .setup_mce = svm_setup_mce,
5377};
5378
5379static int __init svm_init(void)
5380{
5381        return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
5382                        __alignof__(struct vcpu_svm), THIS_MODULE);
5383}
5384
5385static void __exit svm_exit(void)
5386{
5387        kvm_exit();
5388}
5389
5390module_init(svm_init)
5391module_exit(svm_exit)
5392