linux/arch/x86/kvm/svm.c
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * AMD SVM support
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   8 *
   9 * Authors:
  10 *   Yaniv Kamay  <yaniv@qumranet.com>
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17
  18#define pr_fmt(fmt) "SVM: " fmt
  19
  20#include <linux/kvm_host.h>
  21
  22#include "irq.h"
  23#include "mmu.h"
  24#include "kvm_cache_regs.h"
  25#include "x86.h"
  26#include "cpuid.h"
  27#include "pmu.h"
  28
  29#include <linux/module.h>
  30#include <linux/mod_devicetable.h>
  31#include <linux/kernel.h>
  32#include <linux/vmalloc.h>
  33#include <linux/highmem.h>
  34#include <linux/sched.h>
  35#include <linux/ftrace_event.h>
  36#include <linux/slab.h>
  37#include <linux/amd-iommu.h>
  38#include <linux/hashtable.h>
  39
  40#include <asm/perf_event.h>
  41#include <asm/tlbflush.h>
  42#include <asm/desc.h>
  43#include <asm/debugreg.h>
  44#include <asm/kvm_para.h>
  45#include <asm/irq_remapping.h>
  46#include <asm/nospec-branch.h>
  47
  48#include <asm/virtext.h>
  49#include "trace.h"
  50
  51#define __ex(x) __kvm_handle_fault_on_reboot(x)
  52
  53MODULE_AUTHOR("Qumranet");
  54MODULE_LICENSE("GPL");
  55
  56static const struct x86_cpu_id svm_cpu_id[] = {
  57        X86_FEATURE_MATCH(X86_FEATURE_SVM),
  58        {}
  59};
  60MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  61
  62#define IOPM_ALLOC_ORDER 2
  63#define MSRPM_ALLOC_ORDER 1
  64
  65#define SEG_TYPE_LDT 2
  66#define SEG_TYPE_BUSY_TSS16 3
  67
  68#define SVM_FEATURE_NPT            (1 <<  0)
  69#define SVM_FEATURE_LBRV           (1 <<  1)
  70#define SVM_FEATURE_SVML           (1 <<  2)
  71#define SVM_FEATURE_NRIP           (1 <<  3)
  72#define SVM_FEATURE_TSC_RATE       (1 <<  4)
  73#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
  74#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
  75#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  76#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  77
  78#define SVM_AVIC_DOORBELL       0xc001011b
  79
  80#define NESTED_EXIT_HOST        0       /* Exit handled on host level */
  81#define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
  82#define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */
  83
  84#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
  85
  86#define TSC_RATIO_RSVD          0xffffff0000000000ULL
  87#define TSC_RATIO_MIN           0x0000000000000001ULL
  88#define TSC_RATIO_MAX           0x000000ffffffffffULL
  89
  90#define AVIC_HPA_MASK   ~((0xFFFULL << 52) | 0xFFF)
  91
  92/*
  93 * 0xff is broadcast, so the max index allowed for physical APIC ID
  94 * table is 0xfe.  APIC IDs above 0xff are reserved.
  95 */
  96#define AVIC_MAX_PHYSICAL_ID_COUNT      255
  97
  98#define AVIC_UNACCEL_ACCESS_WRITE_MASK          1
  99#define AVIC_UNACCEL_ACCESS_OFFSET_MASK         0xFF0
 100#define AVIC_UNACCEL_ACCESS_VECTOR_MASK         0xFFFFFFFF
 101
 102/* AVIC GATAG is encoded using VM and VCPU IDs */
 103#define AVIC_VCPU_ID_BITS               8
 104#define AVIC_VCPU_ID_MASK               ((1 << AVIC_VCPU_ID_BITS) - 1)
 105
 106#define AVIC_VM_ID_BITS                 24
 107#define AVIC_VM_ID_NR                   (1 << AVIC_VM_ID_BITS)
 108#define AVIC_VM_ID_MASK                 ((1 << AVIC_VM_ID_BITS) - 1)
 109
 110#define AVIC_GATAG(x, y)                (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
 111                                                (y & AVIC_VCPU_ID_MASK))
 112#define AVIC_GATAG_TO_VMID(x)           ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
 113#define AVIC_GATAG_TO_VCPUID(x)         (x & AVIC_VCPU_ID_MASK)
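
/*
 * Example (illustrative, using the macros above): AVIC_GATAG(0x123, 0x45)
 * = (0x123 << 8) | 0x45 = 0x12345; AVIC_GATAG_TO_VMID(0x12345) = 0x123
 * and AVIC_GATAG_TO_VCPUID(0x12345) = 0x45.
 */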
 114
 115static bool erratum_383_found __read_mostly;
 116
 117static const u32 host_save_user_msrs[] = {
 118#ifdef CONFIG_X86_64
 119        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
 120        MSR_FS_BASE,
 121#endif
 122        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 123        MSR_TSC_AUX,
 124};
 125
 126#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
 127
 128struct kvm_vcpu;
 129
 130struct nested_state {
 131        struct vmcb *hsave;
 132        u64 hsave_msr;
 133        u64 vm_cr_msr;
 134        u64 vmcb;
 135
 136        /* These are the merged vectors */
 137        u32 *msrpm;
 138
 139        /* gpa pointers to the real vectors */
 140        u64 vmcb_msrpm;
 141        u64 vmcb_iopm;
 142
 143        /* A VMEXIT is required but not yet emulated */
 144        bool exit_required;
 145
 146        /* cache for intercepts of the guest */
 147        u32 intercept_cr;
 148        u32 intercept_dr;
 149        u32 intercept_exceptions;
 150        u64 intercept;
 151
 152        /* Nested Paging related state */
 153        u64 nested_cr3;
 154};
 155
 156#define MSRPM_OFFSETS   16
 157static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 158
 159/*
 160 * Set osvw_len to a higher value when updated Revision Guides
 161 * are published and we know what the new status bits are
 162 */
 163static uint64_t osvw_len = 4, osvw_status;
 164
 165struct vcpu_svm {
 166        struct kvm_vcpu vcpu;
 167        struct vmcb *vmcb;
 168        unsigned long vmcb_pa;
 169        struct svm_cpu_data *svm_data;
 170        uint64_t asid_generation;
 171        uint64_t sysenter_esp;
 172        uint64_t sysenter_eip;
 173        uint64_t tsc_aux;
 174
 175        u64 next_rip;
 176
 177        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 178        struct {
 179                u16 fs;
 180                u16 gs;
 181                u16 ldt;
 182                u64 gs_base;
 183        } host;
 184
 185        u64 spec_ctrl;
 186        /*
 187         * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
 188         * translated into the appropriate L2_CFG bits on the host to
 189         * perform speculative control.
 190         */
 191        u64 virt_spec_ctrl;
 192
 193        u32 *msrpm;
 194
 195        ulong nmi_iret_rip;
 196
 197        struct nested_state nested;
 198
 199        bool nmi_singlestep;
 200        u64 nmi_singlestep_guest_rflags;
 201
 202        unsigned int3_injected;
 203        unsigned long int3_rip;
 204        u32 apf_reason;
 205
 206        /* cached guest cpuid flags for faster access */
 207        bool nrips_enabled      : 1;
 208
 209        u32 ldr_reg;
 210        struct page *avic_backing_page;
 211        u64 *avic_physical_id_cache;
 212        bool avic_is_running;
 213
 214        /*
 215         * Per-vcpu list of struct amd_svm_iommu_ir:
 216         * This is used mainly to store interrupt remapping information used
 217         * when updating the vcpu affinity. This avoids the need to scan for
 218         * IRTE and try to match ga_tag in the IOMMU driver.
 219         */
 220        struct list_head ir_list;
 221        spinlock_t ir_list_lock;
 222};
 223
 224/*
 225 * This is a wrapper of struct amd_iommu_ir_data.
 226 */
 227struct amd_svm_iommu_ir {
 228        struct list_head node;  /* Used by SVM for per-vcpu ir_list */
 229        void *data;             /* Storing pointer to struct amd_ir_data */
 230};
 231
 232#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK    (0xFF)
 233#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK                (1 << 31)
 234
 235#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK    (0xFFULL)
 236#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK        (0xFFFFFFFFFFULL << 12)
 237#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK          (1ULL << 62)
 238#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK               (1ULL << 63)
 239
 240static DEFINE_PER_CPU(u64, current_tsc_ratio);
 241#define TSC_RATIO_DEFAULT       0x0100000000ULL
 242
 243#define MSR_INVALID                     0xffffffffU
 244
 245static const struct svm_direct_access_msrs {
 246        u32 index;   /* Index of the MSR */
 247        bool always; /* True if intercept is always off */
 248} direct_access_msrs[] = {
 249        { .index = MSR_STAR,                            .always = true  },
 250        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
 251#ifdef CONFIG_X86_64
 252        { .index = MSR_GS_BASE,                         .always = true  },
 253        { .index = MSR_FS_BASE,                         .always = true  },
 254        { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
 255        { .index = MSR_LSTAR,                           .always = true  },
 256        { .index = MSR_CSTAR,                           .always = true  },
 257        { .index = MSR_SYSCALL_MASK,                    .always = true  },
 258#endif
 259        { .index = MSR_IA32_SPEC_CTRL,                  .always = true  },
 260        { .index = MSR_IA32_PRED_CMD,                   .always = false },
 261        { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 262        { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 263        { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 264        { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 265        { .index = MSR_INVALID,                         .always = false },
 266};
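
/*
 * Note: svm_vcpu_init_msrpm() below starts from an all-ones permission map
 * (every MSR access intercepted) and then disables interception only for
 * the entries marked .always = true.  The .always = false entries are
 * passed through later, on demand (e.g. the LASTBRANCH/LASTINT MSRs when
 * LBR virtualization is enabled in svm_enable_lbrv()).
 */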
 267
 268/* enable NPT for AMD64 and X86 with PAE */
 269#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 270static bool npt_enabled = true;
 271#else
 272static bool npt_enabled;
 273#endif
 274
 275/*
 276 * These two parameters are used to configure the controls for Pause-Loop Exiting:
 277 * pause_filter_count: On processors that support Pause filtering (indicated
 278 *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
 279 *      count value. On VMRUN this value is loaded into an internal counter.
 280 *      Each time a pause instruction is executed, this counter is decremented
 281 *      until it reaches zero, at which time a #VMEXIT is generated if pause
 282 *      intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 283 *      Intercept Filtering for more details.
 284 *      This also indicates whether PLE logic is enabled.
 285 *
 286 * pause_filter_thresh: In addition, some processor families support advanced
 287 *      pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 288 *      upper bound on the amount of time a guest is allowed to execute in a
 289 *      pause loop. In this mode, a 16-bit pause filter threshold field is
 290 *      added to the VMCB. The threshold value is a cycle count that is used
 291 *      to reset the pause counter. As with simple pause filtering, VMRUN loads
 292 *      the pause count value from the VMCB into an internal counter. Then, on
 293 *      each pause instruction the hardware checks the elapsed number of cycles
 294 *      since the most recent pause instruction against the pause filter
 295 *      threshold. If the elapsed cycle count is greater than the pause filter
 296 *      threshold, then the internal pause count is reloaded from the VMCB and
 297 *      execution continues. If the elapsed cycle count is less than the pause
 298 *      filter threshold, then the internal pause count is decremented. If the
 299 *      count value is less than zero and PAUSE intercept is enabled, a #VMEXIT
 300 *      is triggered. If advanced pause filtering is supported and the pause
 301 *      filter threshold field is set to zero, the filter will operate in the
 302 *      simpler, count-only mode.
 303 */
 304
 305static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 306module_param(pause_filter_thresh, ushort, 0444);
 307
 308static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 309module_param(pause_filter_count, ushort, 0444);
 310
 311/* Default doubles per-vcpu window every exit. */
 312static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 313module_param(pause_filter_count_grow, ushort, 0444);
 314
 315/* Default resets per-vcpu window every exit to pause_filter_count. */
 316static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 317module_param(pause_filter_count_shrink, ushort, 0444);
 318
 319/* Default is to compute the maximum so we can never overflow. */
 320static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 321module_param(pause_filter_count_max, ushort, 0444);
 322
 323/* allow nested paging (virtualized MMU) for all guests */
 324static int npt = true;
 325module_param(npt, int, S_IRUGO);
 326
 327/* allow nested virtualization in KVM/SVM */
 328static int nested = false;
 329module_param(nested, int, S_IRUGO);
 330
 331/* enable / disable AVIC */
 332static int avic;
 333#ifdef CONFIG_X86_LOCAL_APIC
 334module_param(avic, int, S_IRUGO);
 335#endif
 336
 337/* enable/disable Virtual VMLOAD VMSAVE */
 338static int vls = false;
 339module_param(vls, int, 0444);
 340
 341/* enable/disable Virtual GIF */
 342static int vgif = false;
 343module_param(vgif, int, 0444);
 344
 345/* AVIC VM ID bit masks and lock */
 346static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
 347static DEFINE_SPINLOCK(avic_vm_id_lock);
 348
 349static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 350static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 351static void svm_complete_interrupts(struct vcpu_svm *svm);
 352
 353static int nested_svm_exit_handled(struct vcpu_svm *svm);
 354static int nested_svm_intercept(struct vcpu_svm *svm);
 355static int nested_svm_vmexit(struct vcpu_svm *svm);
 356static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 357                                      bool has_error_code, u32 error_code);
 358
 359enum {
 360        VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 361                            pause filter count */
 362        VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 363        VMCB_ASID,       /* ASID */
 364        VMCB_INTR,       /* int_ctl, int_vector */
 365        VMCB_NPT,        /* npt_en, nCR3, gPAT */
 366        VMCB_CR,         /* CR0, CR3, CR4, EFER */
 367        VMCB_DR,         /* DR6, DR7 */
 368        VMCB_DT,         /* GDT, IDT */
 369        VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 370        VMCB_CR2,        /* CR2 only */
 371        VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
 372        VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
 373                          * AVIC PHYSICAL_TABLE pointer,
 374                          * AVIC LOGICAL_TABLE pointer
 375                          */
 376        VMCB_DIRTY_MAX,
 377};
 378
 379/* TPR and CR2 are always written before VMRUN */
 380#define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
 381
 382#define VMCB_AVIC_APIC_BAR_MASK         0xFFFFFFFFFF000ULL
 383
 384static inline void mark_all_dirty(struct vmcb *vmcb)
 385{
 386        vmcb->control.clean = 0;
 387}
 388
 389static inline void mark_all_clean(struct vmcb *vmcb)
 390{
 391        vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
 392                               & ~VMCB_ALWAYS_DIRTY_MASK;
 393}
 394
 395static inline void mark_dirty(struct vmcb *vmcb, int bit)
 396{
 397        vmcb->control.clean &= ~(1 << bit);
 398}
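
/*
 * Example: when a field cached under one of the bits above changes,
 * e.g. a new ASID is written into vmcb->control.asid, the caller is
 * expected to do mark_dirty(vmcb, VMCB_ASID) so that a CPU with
 * SVM_FEATURE_VMCB_CLEAN support reloads that state from the VMCB on
 * the next VMRUN instead of using its cached copy.
 */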
 399
 400static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 401{
 402        return container_of(vcpu, struct vcpu_svm, vcpu);
 403}
 404
 405static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
 406{
 407        svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
 408        mark_dirty(svm->vmcb, VMCB_AVIC);
 409}
 410
 411static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
 412{
 413        struct vcpu_svm *svm = to_svm(vcpu);
 414        u64 *entry = svm->avic_physical_id_cache;
 415
 416        if (!entry)
 417                return false;
 418
 419        return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
 420}
 421
 422static void recalc_intercepts(struct vcpu_svm *svm)
 423{
 424        struct vmcb_control_area *c, *h;
 425        struct nested_state *g;
 426
 427        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 428
 429        if (!is_guest_mode(&svm->vcpu))
 430                return;
 431
 432        c = &svm->vmcb->control;
 433        h = &svm->nested.hsave->control;
 434        g = &svm->nested;
 435
 436        c->intercept_cr = h->intercept_cr | g->intercept_cr;
 437        c->intercept_dr = h->intercept_dr | g->intercept_dr;
 438        c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 439        c->intercept = h->intercept | g->intercept;
 440}
 441
 442static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
 443{
 444        if (is_guest_mode(&svm->vcpu))
 445                return svm->nested.hsave;
 446        else
 447                return svm->vmcb;
 448}
 449
 450static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
 451{
 452        struct vmcb *vmcb = get_host_vmcb(svm);
 453
 454        vmcb->control.intercept_cr |= (1U << bit);
 455
 456        recalc_intercepts(svm);
 457}
 458
 459static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
 460{
 461        struct vmcb *vmcb = get_host_vmcb(svm);
 462
 463        vmcb->control.intercept_cr &= ~(1U << bit);
 464
 465        recalc_intercepts(svm);
 466}
 467
 468static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 469{
 470        struct vmcb *vmcb = get_host_vmcb(svm);
 471
 472        return vmcb->control.intercept_cr & (1U << bit);
 473}
 474
 475static inline void set_dr_intercepts(struct vcpu_svm *svm)
 476{
 477        struct vmcb *vmcb = get_host_vmcb(svm);
 478
 479        vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
 480                | (1 << INTERCEPT_DR1_READ)
 481                | (1 << INTERCEPT_DR2_READ)
 482                | (1 << INTERCEPT_DR3_READ)
 483                | (1 << INTERCEPT_DR4_READ)
 484                | (1 << INTERCEPT_DR5_READ)
 485                | (1 << INTERCEPT_DR6_READ)
 486                | (1 << INTERCEPT_DR7_READ)
 487                | (1 << INTERCEPT_DR0_WRITE)
 488                | (1 << INTERCEPT_DR1_WRITE)
 489                | (1 << INTERCEPT_DR2_WRITE)
 490                | (1 << INTERCEPT_DR3_WRITE)
 491                | (1 << INTERCEPT_DR4_WRITE)
 492                | (1 << INTERCEPT_DR5_WRITE)
 493                | (1 << INTERCEPT_DR6_WRITE)
 494                | (1 << INTERCEPT_DR7_WRITE);
 495
 496        recalc_intercepts(svm);
 497}
 498
 499static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 500{
 501        struct vmcb *vmcb = get_host_vmcb(svm);
 502
 503        vmcb->control.intercept_dr = 0;
 504
 505        recalc_intercepts(svm);
 506}
 507
 508static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
 509{
 510        struct vmcb *vmcb = get_host_vmcb(svm);
 511
 512        vmcb->control.intercept_exceptions |= (1U << bit);
 513
 514        recalc_intercepts(svm);
 515}
 516
 517static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 518{
 519        struct vmcb *vmcb = get_host_vmcb(svm);
 520
 521        vmcb->control.intercept_exceptions &= ~(1U << bit);
 522
 523        recalc_intercepts(svm);
 524}
 525
 526static inline void set_intercept(struct vcpu_svm *svm, int bit)
 527{
 528        struct vmcb *vmcb = get_host_vmcb(svm);
 529
 530        vmcb->control.intercept |= (1ULL << bit);
 531
 532        recalc_intercepts(svm);
 533}
 534
 535static inline void clr_intercept(struct vcpu_svm *svm, int bit)
 536{
 537        struct vmcb *vmcb = get_host_vmcb(svm);
 538
 539        vmcb->control.intercept &= ~(1ULL << bit);
 540
 541        recalc_intercepts(svm);
 542}
 543
 544static inline bool vgif_enabled(struct vcpu_svm *svm)
 545{
 546        return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
 547}
 548
 549static inline void enable_gif(struct vcpu_svm *svm)
 550{
 551        if (vgif_enabled(svm))
 552                svm->vmcb->control.int_ctl |= V_GIF_MASK;
 553        else
 554                svm->vcpu.arch.hflags |= HF_GIF_MASK;
 555}
 556
 557static inline void disable_gif(struct vcpu_svm *svm)
 558{
 559        if (vgif_enabled(svm))
 560                svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
 561        else
 562                svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
 563}
 564
 565static inline bool gif_set(struct vcpu_svm *svm)
 566{
 567        if (vgif_enabled(svm))
 568                return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
 569        else
 570                return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 571}
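
/*
 * With Virtual GIF (vgif) the guest's GIF state lives in the V_GIF bit
 * of int_ctl, so guest STGI/CLGI can be handled by hardware; without
 * it, KVM tracks GIF in software via HF_GIF_MASK in vcpu->arch.hflags
 * and keeps the STGI/CLGI intercepts set (see init_vmcb(), which
 * clears those intercepts only when vgif is enabled).
 */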
 572
 573static unsigned long iopm_base;
 574
 575struct kvm_ldttss_desc {
 576        u16 limit0;
 577        u16 base0;
 578        unsigned base1:8, type:5, dpl:2, p:1;
 579        unsigned limit1:4, zero0:3, g:1, base2:8;
 580        u32 base3;
 581        u32 zero1;
 582} __attribute__((packed));
 583
 584struct svm_cpu_data {
 585        int cpu;
 586
 587        u64 asid_generation;
 588        u32 max_asid;
 589        u32 next_asid;
 590        struct kvm_ldttss_desc *tss_desc;
 591
 592        struct page *save_area;
 593
 594        struct vmcb *current_vmcb;
 595};
 596
 597static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 598
 599struct svm_init_data {
 600        int cpu;
 601        int r;
 602};
 603
 604static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 605
 606#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 607#define MSRS_RANGE_SIZE 2048
 608#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 609
 610static u32 svm_msrpm_offset(u32 msr)
 611{
 612        u32 offset;
 613        int i;
 614
 615        for (i = 0; i < NUM_MSR_MAPS; i++) {
 616                if (msr < msrpm_ranges[i] ||
 617                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 618                        continue;
 619
 620                offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 621                offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 622
 623                /* Now we have the u8 offset - but need the u32 offset */
 624                return offset / 4;
 625        }
 626
 627        /* MSR not in any range */
 628        return MSR_INVALID;
 629}
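
/*
 * Worked example (4 MSRs per byte, 2 bits per MSR): MSR_IA32_SYSENTER_CS
 * (0x174) falls into the first range, so the byte offset is 0x174 / 4 =
 * 0x5d and the returned u32 offset is 0x5d / 4 = 0x17.  MSR_STAR
 * (0xc0000081) falls into the second range: byte offset (0x81 / 4) +
 * 2048 = 0x820, returned u32 offset 0x208.
 */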
 630
 631#define MAX_INST_SIZE 15
 632
 633static inline void clgi(void)
 634{
 635        asm volatile (__ex(SVM_CLGI));
 636}
 637
 638static inline void stgi(void)
 639{
 640        asm volatile (__ex(SVM_STGI));
 641}
 642
 643static inline void invlpga(unsigned long addr, u32 asid)
 644{
 645        asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 646}
 647
 648static int get_npt_level(void)
 649{
 650#ifdef CONFIG_X86_64
 651        return PT64_ROOT_LEVEL;
 652#else
 653        return PT32E_ROOT_LEVEL;
 654#endif
 655}
 656
 657static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 658{
 659        vcpu->arch.efer = efer;
 660        if (!npt_enabled && !(efer & EFER_LMA))
 661                efer &= ~EFER_LME;
 662
 663        to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
 664        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 665}
 666
 667static int is_external_interrupt(u32 info)
 668{
 669        info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 670        return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 671}
 672
 673static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 674{
 675        struct vcpu_svm *svm = to_svm(vcpu);
 676        u32 ret = 0;
 677
 678        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 679                ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 680        return ret;
 681}
 682
 683static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 684{
 685        struct vcpu_svm *svm = to_svm(vcpu);
 686
 687        if (mask == 0)
 688                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 689        else
 690                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 691
 692}
 693
 694static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 695{
 696        struct vcpu_svm *svm = to_svm(vcpu);
 697
 698        if (svm->vmcb->control.next_rip != 0) {
 699                WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 700                svm->next_rip = svm->vmcb->control.next_rip;
 701        }
 702
 703        if (!svm->next_rip) {
 704                if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
 705                                EMULATE_DONE)
 706                        printk(KERN_DEBUG "%s: NOP\n", __func__);
 707                return;
 708        }
 709        if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
 710                printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
 711                       __func__, kvm_rip_read(vcpu), svm->next_rip);
 712
 713        kvm_rip_write(vcpu, svm->next_rip);
 714        svm_set_interrupt_shadow(vcpu, 0);
 715}
 716
 717static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 718                                bool has_error_code, u32 error_code,
 719                                bool reinject)
 720{
 721        struct vcpu_svm *svm = to_svm(vcpu);
 722
 723        /*
 724         * If we are within a nested VM we'd better #VMEXIT and let the guest
 725         * handle the exception
 726         */
 727        if (!reinject &&
 728            nested_svm_check_exception(svm, nr, has_error_code, error_code))
 729                return;
 730
 731        if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 732                unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 733
 734                /*
 735                 * For guest debugging where we have to reinject #BP if some
 736                 * INT3 is guest-owned:
 737                 * Emulate nRIP by moving RIP forward. Will fail if injection
 738                 * raises a fault that is not intercepted. Still better than
 739                 * failing in all cases.
 740                 */
 741                skip_emulated_instruction(&svm->vcpu);
 742                rip = kvm_rip_read(&svm->vcpu);
 743                svm->int3_rip = rip + svm->vmcb->save.cs.base;
 744                svm->int3_injected = rip - old_rip;
 745        }
 746
 747        svm->vmcb->control.event_inj = nr
 748                | SVM_EVTINJ_VALID
 749                | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 750                | SVM_EVTINJ_TYPE_EXEPT;
 751        svm->vmcb->control.event_inj_err = error_code;
 752}
 753
 754static void svm_init_erratum_383(void)
 755{
 756        u32 low, high;
 757        int err;
 758        u64 val;
 759
 760        if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 761                return;
 762
 763        /* Use _safe variants to not break nested virtualization */
 764        val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 765        if (err)
 766                return;
 767
 768        val |= (1ULL << 47);
 769
 770        low  = lower_32_bits(val);
 771        high = upper_32_bits(val);
 772
 773        native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 774
 775        erratum_383_found = true;
 776}
 777
 778static void svm_init_osvw(struct kvm_vcpu *vcpu)
 779{
 780        /*
 781         * Guests should see errata 400 and 415 as fixed (assuming that
 782         * HLT and IO instructions are intercepted).
 783         */
 784        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 785        vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 786
 787        /*
 788         * By increasing VCPU's osvw.length to 3 we are telling the guest that
 789         * all osvw.status bits inside that length, including bit 0 (which is
 790         * reserved for erratum 298), are valid. However, if host processor's
 791         * osvw_len is 0 then osvw_status[0] carries no information. We need to
 792         * be conservative here and therefore we tell the guest that erratum 298
 793         * is present (because we really don't know).
 794         */
 795        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 796                vcpu->arch.osvw.status |= 1;
 797}
 798
 799static int has_svm(void)
 800{
 801        const char *msg;
 802
 803        if (!cpu_has_svm(&msg)) {
 804                printk(KERN_INFO "has_svm: %s\n", msg);
 805                return 0;
 806        }
 807
 808        return 1;
 809}
 810
 811static void svm_hardware_disable(void)
 812{
 813        /* Make sure we clean up behind us */
 814        if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
 815                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 816
 817        cpu_svm_disable();
 818
 819        amd_pmu_disable_virt();
 820}
 821
 822static int svm_hardware_enable(void)
 823{
 824
 825        struct svm_cpu_data *sd;
 826        uint64_t efer;
 827        struct desc_ptr gdt_descr;
 828        struct desc_struct *gdt;
 829        int me = raw_smp_processor_id();
 830
 831        rdmsrl(MSR_EFER, efer);
 832        if (efer & EFER_SVME)
 833                return -EBUSY;
 834
 835        if (!has_svm()) {
 836                pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 837                return -EINVAL;
 838        }
 839        sd = per_cpu(svm_data, me);
 840        if (!sd) {
 841                pr_err("%s: svm_data is NULL on %d\n", __func__, me);
 842                return -EINVAL;
 843        }
 844
 845        sd->asid_generation = 1;
 846        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 847        sd->next_asid = sd->max_asid + 1;
 848
 849        native_store_gdt(&gdt_descr);
 850        gdt = (struct desc_struct *)gdt_descr.address;
 851        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 852
 853        wrmsrl(MSR_EFER, efer | EFER_SVME);
 854
 855        wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 856
 857        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 858                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 859                __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
 860        }
 861
 862
 863        /*
 864         * Get OSVW bits.
 865         *
 866         * Note that it is possible to have a system with mixed processor
 867         * revisions and therefore different OSVW bits. If bits are not the same
 868         * on different processors then choose the worst case (i.e. if erratum
 869         * is present on one processor and not on another then assume that the
 870         * erratum is present everywhere).
 871         */
 872        if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 873                uint64_t len, status = 0;
 874                int err;
 875
 876                len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 877                if (!err)
 878                        status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 879                                                      &err);
 880
 881                if (err)
 882                        osvw_status = osvw_len = 0;
 883                else {
 884                        if (len < osvw_len)
 885                                osvw_len = len;
 886                        osvw_status |= status;
 887                        osvw_status &= (1ULL << osvw_len) - 1;
 888                }
 889        } else
 890                osvw_status = osvw_len = 0;
 891
 892        svm_init_erratum_383();
 893
 894        amd_pmu_enable_virt();
 895
 896        return 0;
 897}
 898
 899static void svm_cpu_uninit(int cpu)
 900{
 901        struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
 902
 903        if (!sd)
 904                return;
 905
 906        per_cpu(svm_data, raw_smp_processor_id()) = NULL;
 907        __free_page(sd->save_area);
 908        kfree(sd);
 909}
 910
 911static int svm_cpu_init(int cpu)
 912{
 913        struct svm_cpu_data *sd;
 914        int r;
 915
 916        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 917        if (!sd)
 918                return -ENOMEM;
 919        sd->cpu = cpu;
 920        sd->save_area = alloc_page(GFP_KERNEL);
 921        r = -ENOMEM;
 922        if (!sd->save_area)
 923                goto err_1;
 924
 925        per_cpu(svm_data, cpu) = sd;
 926
 927        return 0;
 928
 929err_1:
 930        kfree(sd);
 931        return r;
 932
 933}
 934
 935static bool valid_msr_intercept(u32 index)
 936{
 937        int i;
 938
 939        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 940                if (direct_access_msrs[i].index == index)
 941                        return true;
 942
 943        return false;
 944}
 945
 946static void set_msr_interception(u32 *msrpm, unsigned msr,
 947                                 int read, int write)
 948{
 949        u8 bit_read, bit_write;
 950        unsigned long tmp;
 951        u32 offset;
 952
 953        /*
 954         * If this warning triggers, extend the direct_access_msrs list at the
 955         * beginning of the file
 956         */
 957        WARN_ON(!valid_msr_intercept(msr));
 958
 959        offset    = svm_msrpm_offset(msr);
 960        bit_read  = 2 * (msr & 0x0f);
 961        bit_write = 2 * (msr & 0x0f) + 1;
 962        tmp       = msrpm[offset];
 963
 964        BUG_ON(offset == MSR_INVALID);
 965
 966        read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 967        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 968
 969        msrpm[offset] = tmp;
 970}
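
/*
 * Each MSR occupies two adjacent bits in the permission map: the even
 * bit controls read interception and the odd bit write interception,
 * and a set bit means "intercept".  So set_msr_interception(msrpm, msr,
 * 1, 1) clears both bits and gives the guest direct access to that MSR.
 */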
 971
 972static void svm_vcpu_init_msrpm(u32 *msrpm)
 973{
 974        int i;
 975
 976        memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 977
 978        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 979                if (!direct_access_msrs[i].always)
 980                        continue;
 981
 982                set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
 983        }
 984}
 985
 986static void add_msr_offset(u32 offset)
 987{
 988        int i;
 989
 990        for (i = 0; i < MSRPM_OFFSETS; ++i) {
 991
 992                /* Offset already in list? */
 993                if (msrpm_offsets[i] == offset)
 994                        return;
 995
 996                /* Slot used by another offset? */
 997                if (msrpm_offsets[i] != MSR_INVALID)
 998                        continue;
 999
1000                /* Add offset to list */
1001                msrpm_offsets[i] = offset;
1002
1003                return;
1004        }
1005
1006        /*
1007         * If this BUG triggers, the msrpm_offsets table has overflowed. Just
1008         * increase MSRPM_OFFSETS in this case.
1009         */
1010        BUG();
1011}
1012
1013static void init_msrpm_offsets(void)
1014{
1015        int i;
1016
1017        memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
1018
1019        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
1020                u32 offset;
1021
1022                offset = svm_msrpm_offset(direct_access_msrs[i].index);
1023                BUG_ON(offset == MSR_INVALID);
1024
1025                add_msr_offset(offset);
1026        }
1027}
1028
1029static void svm_enable_lbrv(struct vcpu_svm *svm)
1030{
1031        u32 *msrpm = svm->msrpm;
1032
1033        svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
1034        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
1035        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
1036        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
1037        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
1038}
1039
1040static void svm_disable_lbrv(struct vcpu_svm *svm)
1041{
1042        u32 *msrpm = svm->msrpm;
1043
1044        svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
1045        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
1046        set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
1047        set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
1048        set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
1049}
1050
1051/* Note:
1052 * This hash table is used to map a VM_ID to a struct kvm_arch
1053 * when handling an AMD IOMMU GALOG notification, in order to
1054 * schedule in a particular vCPU.
1055 */
1056#define SVM_VM_DATA_HASH_BITS   8
1057DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
1058static spinlock_t svm_vm_data_hash_lock;
1059
1060/* Note:
1061 * This function is called from the IOMMU driver to notify
1062 * SVM to schedule in a particular vCPU of a particular VM.
1063 */
1064static int avic_ga_log_notifier(u32 ga_tag)
1065{
1066        unsigned long flags;
1067        struct kvm_arch *ka = NULL;
1068        struct kvm_vcpu *vcpu = NULL;
1069        u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
1070        u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
1071
1072        pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
1073
1074        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1075        hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
1076                struct kvm *kvm = container_of(ka, struct kvm, arch);
1077                struct kvm_arch *vm_data = &kvm->arch;
1078
1079                if (vm_data->avic_vm_id != vm_id)
1080                        continue;
1081                vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1082                break;
1083        }
1084        spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1085
1086        if (!vcpu)
1087                return 0;
1088
1089        /* Note:
1090         * At this point, the IOMMU should have already set the pending
1091         * bit in the vAPIC backing page. So, we just need to schedule
1092         * in the vcpu.
1093         */
1094        if (vcpu->mode == OUTSIDE_GUEST_MODE)
1095                kvm_vcpu_wake_up(vcpu);
1096
1097        return 0;
1098}
1099
1100static void disable_nmi_singlestep(struct vcpu_svm *svm)
1101{
1102        svm->nmi_singlestep = false;
1103
1104        if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1105                /* Clear our flags if they were not set by the guest */
1106                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1107                        svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1108                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1109                        svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1110        }
1111}
1112
1113static void grow_ple_window(struct kvm_vcpu *vcpu)
1114{
1115        struct vcpu_svm *svm = to_svm(vcpu);
1116        struct vmcb_control_area *control = &svm->vmcb->control;
1117        int old = control->pause_filter_count;
1118
1119        control->pause_filter_count = __grow_ple_window(old,
1120                                                        pause_filter_count,
1121                                                        pause_filter_count_grow,
1122                                                        pause_filter_count_max);
1123
1124        if (control->pause_filter_count != old)
1125                mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1126
1127        trace_kvm_ple_window_grow(vcpu->vcpu_id,
1128                                  control->pause_filter_count, old);
1129}
1130
1131static void shrink_ple_window(struct kvm_vcpu *vcpu)
1132{
1133        struct vcpu_svm *svm = to_svm(vcpu);
1134        struct vmcb_control_area *control = &svm->vmcb->control;
1135        int old = control->pause_filter_count;
1136
1137        control->pause_filter_count =
1138                                __shrink_ple_window(old,
1139                                                    pause_filter_count,
1140                                                    pause_filter_count_shrink,
1141                                                    pause_filter_count);
1142        if (control->pause_filter_count != old)
1143                mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1144
1145        trace_kvm_ple_window_shrink(vcpu->vcpu_id,
1146                                    control->pause_filter_count, old);
1147}
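
/*
 * Together, grow_ple_window() and shrink_ple_window() implement the
 * adaptive per-vCPU PLE window described by the module parameters
 * above: the window grows by pause_filter_count_grow (bounded by
 * pause_filter_count_max) and shrinks back toward the base
 * pause_filter_count.  Both helpers mark VMCB_INTERCEPTS dirty, since
 * the pause filter count is cached under that clean bit.
 */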
1148
1149static __init int svm_hardware_setup(void)
1150{
1151        int cpu;
1152        struct page *iopm_pages;
1153        void *iopm_va;
1154        int r;
1155
1156        iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
1157
1158        if (!iopm_pages)
1159                return -ENOMEM;
1160
1161        iopm_va = page_address(iopm_pages);
1162        memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
1163        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
1164
1165        init_msrpm_offsets();
1166
1167        if (boot_cpu_has(X86_FEATURE_NX))
1168                kvm_enable_efer_bits(EFER_NX);
1169
1170        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
1171                kvm_enable_efer_bits(EFER_FFXSR);
1172
1173        if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1174                kvm_has_tsc_control = true;
1175                kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
1176                kvm_tsc_scaling_ratio_frac_bits = 32;
1177        }
1178
1179        /* Check for pause filtering support */
1180        if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1181                pause_filter_count = 0;
1182                pause_filter_thresh = 0;
1183        } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
1184                pause_filter_thresh = 0;
1185        }
1186
1187        if (nested) {
1188                printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
1189                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
1190        }
1191
1192        for_each_possible_cpu(cpu) {
1193                r = svm_cpu_init(cpu);
1194                if (r)
1195                        goto err;
1196        }
1197
1198        if (!boot_cpu_has(X86_FEATURE_NPT))
1199                npt_enabled = false;
1200
1201        if (npt_enabled && !npt) {
1202                printk(KERN_INFO "kvm: Nested Paging disabled\n");
1203                npt_enabled = false;
1204        }
1205
1206        if (npt_enabled) {
1207                printk(KERN_INFO "kvm: Nested Paging enabled\n");
1208                kvm_enable_tdp();
1209        } else
1210                kvm_disable_tdp();
1211
1212        if (avic) {
1213                if (!npt_enabled ||
1214                    !boot_cpu_has(X86_FEATURE_AVIC) ||
1215                    !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
1216                        avic = false;
1217                } else {
1218                        pr_info("AVIC enabled\n");
1219
1220                        hash_init(svm_vm_data_hash);
1221                        spin_lock_init(&svm_vm_data_hash_lock);
1222                        amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1223                }
1224        }
1225
1226        if (vls) {
1227                if (!npt_enabled ||
1228                    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
1229                    !IS_ENABLED(CONFIG_X86_64)) {
1230                        vls = false;
1231                } else {
1232                        pr_info("Virtual VMLOAD VMSAVE supported\n");
1233                }
1234        }
1235
1236        if (vgif) {
1237                if (!boot_cpu_has(X86_FEATURE_VGIF))
1238                        vgif = false;
1239                else
1240                        pr_info("Virtual GIF supported\n");
1241        }
1242
1243        return 0;
1244
1245err:
1246        __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
1247        iopm_base = 0;
1248        return r;
1249}
1250
1251static __exit void svm_hardware_unsetup(void)
1252{
1253        int cpu;
1254
1255        for_each_possible_cpu(cpu)
1256                svm_cpu_uninit(cpu);
1257
1258        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
1259        iopm_base = 0;
1260}
1261
1262static void init_seg(struct vmcb_seg *seg)
1263{
1264        seg->selector = 0;
1265        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1266                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1267        seg->limit = 0xffff;
1268        seg->base = 0;
1269}
1270
1271static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1272{
1273        seg->selector = 0;
1274        seg->attrib = SVM_SELECTOR_P_MASK | type;
1275        seg->limit = 0xffff;
1276        seg->base = 0;
1277}
1278
1279static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1280{
1281        struct vcpu_svm *svm = to_svm(vcpu);
1282        u64 g_tsc_offset = 0;
1283
1284        if (is_guest_mode(vcpu)) {
1285                g_tsc_offset = svm->vmcb->control.tsc_offset -
1286                               svm->nested.hsave->control.tsc_offset;
1287                svm->nested.hsave->control.tsc_offset = offset;
1288        } else
1289                trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1290                                           svm->vmcb->control.tsc_offset,
1291                                           offset);
1292
1293        svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1294
1295        mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1296}
1297
1298static void avic_init_vmcb(struct vcpu_svm *svm)
1299{
1300        struct vmcb *vmcb = svm->vmcb;
1301        struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
1302        phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
1303        phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
1304        phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
1305
1306        vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1307        vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
1308        vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
1309        vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
1310        vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
1311}
1312
1313static void init_vmcb(struct vcpu_svm *svm)
1314{
1315        struct vmcb_control_area *control = &svm->vmcb->control;
1316        struct vmcb_save_area *save = &svm->vmcb->save;
1317
1318        svm->vcpu.fpu_active = 1;
1319        svm->vcpu.arch.hflags = 0;
1320
1321        set_cr_intercept(svm, INTERCEPT_CR0_READ);
1322        set_cr_intercept(svm, INTERCEPT_CR3_READ);
1323        set_cr_intercept(svm, INTERCEPT_CR4_READ);
1324        set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1325        set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1326        set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1327        if (!kvm_vcpu_apicv_active(&svm->vcpu))
1328                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1329
1330        set_dr_intercepts(svm);
1331
1332        set_exception_intercept(svm, PF_VECTOR);
1333        set_exception_intercept(svm, UD_VECTOR);
1334        set_exception_intercept(svm, MC_VECTOR);
1335        set_exception_intercept(svm, AC_VECTOR);
1336        set_exception_intercept(svm, DB_VECTOR);
1337
1338        set_intercept(svm, INTERCEPT_INTR);
1339        set_intercept(svm, INTERCEPT_NMI);
1340        set_intercept(svm, INTERCEPT_SMI);
1341        set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1342        set_intercept(svm, INTERCEPT_RDPMC);
1343        set_intercept(svm, INTERCEPT_CPUID);
1344        set_intercept(svm, INTERCEPT_INVD);
1345        set_intercept(svm, INTERCEPT_HLT);
1346        set_intercept(svm, INTERCEPT_INVLPG);
1347        set_intercept(svm, INTERCEPT_INVLPGA);
1348        set_intercept(svm, INTERCEPT_IOIO_PROT);
1349        set_intercept(svm, INTERCEPT_MSR_PROT);
1350        set_intercept(svm, INTERCEPT_TASK_SWITCH);
1351        set_intercept(svm, INTERCEPT_SHUTDOWN);
1352        set_intercept(svm, INTERCEPT_VMRUN);
1353        set_intercept(svm, INTERCEPT_VMMCALL);
1354        set_intercept(svm, INTERCEPT_VMLOAD);
1355        set_intercept(svm, INTERCEPT_VMSAVE);
1356        set_intercept(svm, INTERCEPT_STGI);
1357        set_intercept(svm, INTERCEPT_CLGI);
1358        set_intercept(svm, INTERCEPT_SKINIT);
1359        set_intercept(svm, INTERCEPT_WBINVD);
1360        set_intercept(svm, INTERCEPT_MONITOR);
1361        set_intercept(svm, INTERCEPT_MWAIT);
1362        set_intercept(svm, INTERCEPT_XSETBV);
1363
1364        control->iopm_base_pa = __sme_set(iopm_base);
1365        control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1366        control->int_ctl = V_INTR_MASKING_MASK;
1367
1368        init_seg(&save->es);
1369        init_seg(&save->ss);
1370        init_seg(&save->ds);
1371        init_seg(&save->fs);
1372        init_seg(&save->gs);
1373
1374        save->cs.selector = 0xf000;
1375        save->cs.base = 0xffff0000;
1376        /* Executable/Readable Code Segment */
1377        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1378                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1379        save->cs.limit = 0xffff;
1380
1381        save->gdtr.limit = 0xffff;
1382        save->idtr.limit = 0xffff;
1383
1384        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1385        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1386
1387        svm_set_efer(&svm->vcpu, 0);
1388        save->dr6 = 0xffff0ff0;
1389        kvm_set_rflags(&svm->vcpu, 2);
1390        save->rip = 0x0000fff0;
1391        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1392
1393        /*
1394         * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1395         * It also updates the guest-visible cr0 value.
1396         */
1397        svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1398        kvm_mmu_reset_context(&svm->vcpu);
1399
1400        save->cr4 = X86_CR4_PAE;
1401        /* rdx = ?? */
1402
1403        if (npt_enabled) {
1404                /* Setup VMCB for Nested Paging */
1405                control->nested_ctl = 1;
1406                clr_intercept(svm, INTERCEPT_INVLPG);
1407                clr_exception_intercept(svm, PF_VECTOR);
1408                clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1409                clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1410                save->g_pat = svm->vcpu.arch.pat;
1411                save->cr3 = 0;
1412                save->cr4 = 0;
1413        }
1414        svm->asid_generation = 0;
1415
1416        svm->nested.vmcb = 0;
1417        svm->vcpu.arch.hflags = 0;
1418
1419        if (pause_filter_count) {
1420                control->pause_filter_count = pause_filter_count;
1421                if (pause_filter_thresh)
1422                        control->pause_filter_thresh = pause_filter_thresh;
1423                set_intercept(svm, INTERCEPT_PAUSE);
1424        } else {
1425                clr_intercept(svm, INTERCEPT_PAUSE);
1426        }
1427
1428        if (kvm_vcpu_apicv_active(&svm->vcpu))
1429                avic_init_vmcb(svm);
1430
1431        /*
1432         * If hardware supports Virtual VMLOAD VMSAVE then enable it
1433         * in VMCB and clear intercepts to avoid #VMEXIT.
1434         */
1435        if (vls) {
1436                clr_intercept(svm, INTERCEPT_VMLOAD);
1437                clr_intercept(svm, INTERCEPT_VMSAVE);
1438                svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1439        }
1440
1441        if (vgif) {
1442                clr_intercept(svm, INTERCEPT_STGI);
1443                clr_intercept(svm, INTERCEPT_CLGI);
1444                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1445        }
1446
1447        mark_all_dirty(svm->vmcb);
1448
1449        enable_gif(svm);
1450
1451}
1452
1453static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index)
1454{
1455        u64 *avic_physical_id_table;
1456        struct kvm_arch *vm_data = &vcpu->kvm->arch;
1457
1458        if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
1459                return NULL;
1460
1461        avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page);
1462
1463        return &avic_physical_id_table[index];
1464}
1465
1466/**
1467 * Note:
1468 * AVIC hardware walks the nested page table to check permissions,
1469 * but does not use the SPA address specified in the leaf page
1470 * table entry, since it uses the address in the AVIC_BACKING_PAGE pointer
1471 * field of the VMCB. Therefore, we set up the
1472 * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
1473 */
1474static int avic_init_access_page(struct kvm_vcpu *vcpu)
1475{
1476        struct kvm *kvm = vcpu->kvm;
1477        int ret;
1478
1479        if (kvm->arch.apic_access_page_done)
1480                return 0;
1481
1482        ret = x86_set_memory_region(kvm,
1483                                    APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
1484                                    APIC_DEFAULT_PHYS_BASE,
1485                                    PAGE_SIZE);
1486        if (ret)
1487                return ret;
1488
1489        kvm->arch.apic_access_page_done = true;
1490        return 0;
1491}
1492
1493static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1494{
1495        int ret;
1496        u64 *entry, new_entry;
1497        int id = vcpu->vcpu_id;
1498        struct vcpu_svm *svm = to_svm(vcpu);
1499
1500        ret = avic_init_access_page(vcpu);
1501        if (ret)
1502                return ret;
1503
1504        if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
1505                return -EINVAL;
1506
1507        if (!svm->vcpu.arch.apic->regs)
1508                return -EINVAL;
1509
1510        svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
1511
1512        /* Set the AVIC backing page address in the physical APIC ID table */
1513        entry = avic_get_physical_id_entry(vcpu, id);
1514        if (!entry)
1515                return -EINVAL;
1516
1517        new_entry = READ_ONCE(*entry);
1518        new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
1519                              AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
1520                              AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
1521        WRITE_ONCE(*entry, new_entry);
1522
1523        svm->avic_physical_id_cache = entry;
1524
1525        return 0;
1526}
1527
1528static inline int avic_get_next_vm_id(void)
1529{
1530        int id;
1531
1532        spin_lock(&avic_vm_id_lock);
1533
1534        /* AVIC VM ID is one-based. */
1535        id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
1536        if (id <= AVIC_VM_ID_MASK)
1537                __set_bit(id, avic_vm_id_bitmap);
1538        else
1539                id = -EAGAIN;
1540
1541        spin_unlock(&avic_vm_id_lock);
1542        return id;
1543}
1544
1545static inline int avic_free_vm_id(int id)
1546{
1547        if (id <= 0 || id > AVIC_VM_ID_MASK)
1548                return -EINVAL;
1549
1550        spin_lock(&avic_vm_id_lock);
1551        __clear_bit(id, avic_vm_id_bitmap);
1552        spin_unlock(&avic_vm_id_lock);
1553        return 0;
1554}
1555
1556static void avic_vm_destroy(struct kvm *kvm)
1557{
1558        unsigned long flags;
1559        struct kvm_arch *vm_data = &kvm->arch;
1560
1561        if (!avic)
1562                return;
1563
1564        avic_free_vm_id(vm_data->avic_vm_id);
1565
1566        if (vm_data->avic_logical_id_table_page)
1567                __free_page(vm_data->avic_logical_id_table_page);
1568        if (vm_data->avic_physical_id_table_page)
1569                __free_page(vm_data->avic_physical_id_table_page);
1570
1571        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1572        hash_del(&vm_data->hnode);
1573        spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1574}
1575
1576static int avic_vm_init(struct kvm *kvm)
1577{
1578        unsigned long flags;
1579        int vm_id, err = -ENOMEM;
1580        struct kvm_arch *vm_data = &kvm->arch;
1581        struct page *p_page;
1582        struct page *l_page;
1583
1584        if (!avic)
1585                return 0;
1586
1587        vm_id = avic_get_next_vm_id();
1588        if (vm_id < 0)
1589                return vm_id;
1590        vm_data->avic_vm_id = (u32)vm_id;
1591
1592        /* Allocating physical APIC ID table (4KB) */
1593        p_page = alloc_page(GFP_KERNEL);
1594        if (!p_page)
1595                goto free_avic;
1596
1597        vm_data->avic_physical_id_table_page = p_page;
1598        clear_page(page_address(p_page));
1599
1600        /* Allocating logical APIC ID table (4KB) */
1601        l_page = alloc_page(GFP_KERNEL);
1602        if (!l_page)
1603                goto free_avic;
1604
1605        vm_data->avic_logical_id_table_page = l_page;
1606        clear_page(page_address(l_page));
1607
1608        spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1609        hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
1610        spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1611
1612        return 0;
1613
1614free_avic:
1615        avic_vm_destroy(kvm);
1616        return err;
1617}
1618
1619static inline int
1620avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
1621{
1622        int ret = 0;
1623        unsigned long flags;
1624        struct amd_svm_iommu_ir *ir;
1625        struct vcpu_svm *svm = to_svm(vcpu);
1626
1627        if (!kvm_arch_has_assigned_device(vcpu->kvm))
1628                return 0;
1629
1630        /*
1631         * Here, we go through the per-vcpu ir_list to update all existing
1632         * interrupt remapping table entry targeting this vcpu.
1633         */
1634        spin_lock_irqsave(&svm->ir_list_lock, flags);
1635
1636        if (list_empty(&svm->ir_list))
1637                goto out;
1638
1639        list_for_each_entry(ir, &svm->ir_list, node) {
1640                ret = amd_iommu_update_ga(cpu, r, ir->data);
1641                if (ret)
1642                        break;
1643        }
1644out:
1645        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
1646        return ret;
1647}
1648
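    /*
     * Called when the vCPU is (re)loaded on a physical CPU: record the host
     * APIC ID in the vCPU's physical APIC ID table entry, refresh the
     * is_running bit and update the IOMMU with the new affinity.
     */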
1649static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1650{
1651        u64 entry;
1652        /* ID = 0xff (broadcast), ID > 0xff (reserved) */
1653        int h_physical_id = kvm_cpu_get_apicid(cpu);
1654        struct vcpu_svm *svm = to_svm(vcpu);
1655
1656        if (!kvm_vcpu_apicv_active(vcpu))
1657                return;
1658
1659        if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
1660                return;
1661
1662        entry = READ_ONCE(*(svm->avic_physical_id_cache));
1663        WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
1664
1665        entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
1666        entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
1667
1668        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1669        if (svm->avic_is_running)
1670                entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1671
1672        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1673        avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
1674                                        svm->avic_is_running);
1675}
1676
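    /*
     * Called when the vCPU is scheduled out: clear the is_running bit in the
     * physical APIC ID table entry and update the IOMMU accordingly.
     */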
1677static void avic_vcpu_put(struct kvm_vcpu *vcpu)
1678{
1679        u64 entry;
1680        struct vcpu_svm *svm = to_svm(vcpu);
1681
1682        if (!kvm_vcpu_apicv_active(vcpu))
1683                return;
1684
1685        entry = READ_ONCE(*(svm->avic_physical_id_cache));
1686        if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
1687                avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
1688
1689        entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
1690        WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
1691}
1692
1693/*
1694 * Called during vCPU halt/unhalt to toggle the AVIC is_running state.
1695 */
1696static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
1697{
1698        struct vcpu_svm *svm = to_svm(vcpu);
1699
1700        svm->avic_is_running = is_run;
1701        if (is_run)
1702                avic_vcpu_load(vcpu, vcpu->cpu);
1703        else
1704                avic_vcpu_put(vcpu);
1705}
1706
1707static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1708{
1709        struct vcpu_svm *svm = to_svm(vcpu);
1710        u32 dummy;
1711        u32 eax = 1;
1712
1713        svm->virt_spec_ctrl = 0;
1714        if (!init_event) {
1715                svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1716                                           MSR_IA32_APICBASE_ENABLE;
1717                if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1718                        svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1719        }
1720        init_vmcb(svm);
1721
1722        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1723        kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1724
1725        if (kvm_vcpu_apicv_active(vcpu) && !init_event)
1726                avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
1727}
1728
1729static int avic_init_vcpu(struct vcpu_svm *svm)
1730{
1731        int ret;
1732
1733        if (!kvm_vcpu_apicv_active(&svm->vcpu))
1734                return 0;
1735
1736        ret = avic_init_backing_page(&svm->vcpu);
1737        if (ret)
1738                return ret;
1739
1740        INIT_LIST_HEAD(&svm->ir_list);
1741        spin_lock_init(&svm->ir_list_lock);
1742
1743        return ret;
1744}
1745
1746static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1747{
1748        struct vcpu_svm *svm;
1749        struct page *page;
1750        struct page *msrpm_pages;
1751        struct page *hsave_page;
1752        struct page *nested_msrpm_pages;
1753        int err;
1754
1755        svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1756        if (!svm) {
1757                err = -ENOMEM;
1758                goto out;
1759        }
1760
1761        err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1762        if (err)
1763                goto free_svm;
1764
1765        err = -ENOMEM;
1766        page = alloc_page(GFP_KERNEL);
1767        if (!page)
1768                goto uninit;
1769
1770        msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1771        if (!msrpm_pages)
1772                goto free_page1;
1773
1774        nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1775        if (!nested_msrpm_pages)
1776                goto free_page2;
1777
1778        hsave_page = alloc_page(GFP_KERNEL);
1779        if (!hsave_page)
1780                goto free_page3;
1781
1782        err = avic_init_vcpu(svm);
1783        if (err)
1784                goto free_page4;
1785
1786        /* We initialize this flag to true to make sure that the is_running
1787         * bit is set the first time the vcpu is loaded.
1788         */
1789        svm->avic_is_running = true;
1790
1791        svm->nested.hsave = page_address(hsave_page);
1792
1793        svm->msrpm = page_address(msrpm_pages);
1794        svm_vcpu_init_msrpm(svm->msrpm);
1795
1796        svm->nested.msrpm = page_address(nested_msrpm_pages);
1797        svm_vcpu_init_msrpm(svm->nested.msrpm);
1798
1799        svm->vmcb = page_address(page);
1800        clear_page(svm->vmcb);
1801        svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
1802        svm->asid_generation = 0;
1803        init_vmcb(svm);
1804
1805        svm_init_osvw(&svm->vcpu);
1806
1807        return &svm->vcpu;
1808
1809free_page4:
1810        __free_page(hsave_page);
1811free_page3:
1812        __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1813free_page2:
1814        __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1815free_page1:
1816        __free_page(page);
1817uninit:
1818        kvm_vcpu_uninit(&svm->vcpu);
1819free_svm:
1820        kmem_cache_free(kvm_vcpu_cache, svm);
1821out:
1822        return ERR_PTR(err);
1823}
1824
1825static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1826{
1827        struct vcpu_svm *svm = to_svm(vcpu);
1828
1829        __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
1830        __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1831        __free_page(virt_to_page(svm->nested.hsave));
1832        __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1833        kvm_vcpu_uninit(vcpu);
1834        kmem_cache_free(kvm_vcpu_cache, svm);
1835
1836        /*
1837         * The VMCB could be recycled, causing a false negative in svm_vcpu_load;
1838         * block speculative execution.
1839         */
1840        spec_ctrl_ibpb();
1841}
1842
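    /*
     * Per-CPU load: save the host segment, LDT and user-return MSR state,
     * program the TSC ratio MSR when TSC scaling is supported, and issue an
     * IBPB if this CPU last ran a different VMCB.
     */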
1843static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1844{
1845        struct vcpu_svm *svm = to_svm(vcpu);
1846        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1847        int i;
1848
1849        if (unlikely(cpu != vcpu->cpu)) {
1850                svm->asid_generation = 0;
1851                mark_all_dirty(svm->vmcb);
1852        }
1853
1854#ifdef CONFIG_X86_64
1855        rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1856#endif
1857        savesegment(fs, svm->host.fs);
1858        savesegment(gs, svm->host.gs);
1859        svm->host.ldt = kvm_read_ldt();
1860
1861        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1862                rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1863
1864        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1865                u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1866                if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1867                        __this_cpu_write(current_tsc_ratio, tsc_ratio);
1868                        wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1869                }
1870        }
1871        /* This assumes that the kernel never uses MSR_TSC_AUX */
1872        if (static_cpu_has(X86_FEATURE_RDTSCP))
1873                wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1874
1875        if (sd->current_vmcb != svm->vmcb) {
1876                sd->current_vmcb = svm->vmcb;
1877                spec_ctrl_ibpb();
1878        }
1879
1880        avic_vcpu_load(vcpu, cpu);
1881}
1882
1883static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1884{
1885        struct vcpu_svm *svm = to_svm(vcpu);
1886        int i;
1887
1888        avic_vcpu_put(vcpu);
1889
1890        ++vcpu->stat.host_state_reload;
1891        kvm_load_ldt(svm->host.ldt);
1892#ifdef CONFIG_X86_64
1893        loadsegment(fs, svm->host.fs);
1894        wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1895        load_gs_index(svm->host.gs);
1896#else
1897#ifdef CONFIG_X86_32_LAZY_GS
1898        loadsegment(gs, svm->host.gs);
1899#endif
1900#endif
1901        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1902                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1903}
1904
1905static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
1906{
1907        avic_set_running(vcpu, false);
1908}
1909
1910static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
1911{
1912        avic_set_running(vcpu, true);
1913}
1914
1915static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1916{
1917        struct vcpu_svm *svm = to_svm(vcpu);
1918        unsigned long rflags = svm->vmcb->save.rflags;
1919
1920        if (svm->nmi_singlestep) {
1921                /* Hide our flags if they were not set by the guest */
1922                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1923                        rflags &= ~X86_EFLAGS_TF;
1924                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1925                        rflags &= ~X86_EFLAGS_RF;
1926        }
1927        return rflags;
1928}
1929
1930static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1931{
1932        if (to_svm(vcpu)->nmi_singlestep)
1933                rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1934
1935        /*
1936         * Any change of EFLAGS.VM is accompanied by a reload of SS
1937         * (caused by either a task switch or an inter-privilege IRET),
1938         * so we do not need to update the CPL here.
1939         */
1940        to_svm(vcpu)->vmcb->save.rflags = rflags;
1941}
1942
1943static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1944{
1945        switch (reg) {
1946        case VCPU_EXREG_PDPTR:
1947                BUG_ON(!npt_enabled);
1948                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1949                break;
1950        default:
1951                BUG();
1952        }
1953}
1954
1955static void svm_set_vintr(struct vcpu_svm *svm)
1956{
1957        set_intercept(svm, INTERCEPT_VINTR);
1958}
1959
1960static void svm_clear_vintr(struct vcpu_svm *svm)
1961{
1962        clr_intercept(svm, INTERCEPT_VINTR);
1963}
1964
1965static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1966{
1967        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1968
1969        switch (seg) {
1970        case VCPU_SREG_CS: return &save->cs;
1971        case VCPU_SREG_DS: return &save->ds;
1972        case VCPU_SREG_ES: return &save->es;
1973        case VCPU_SREG_FS: return &save->fs;
1974        case VCPU_SREG_GS: return &save->gs;
1975        case VCPU_SREG_SS: return &save->ss;
1976        case VCPU_SREG_TR: return &save->tr;
1977        case VCPU_SREG_LDTR: return &save->ldtr;
1978        }
1979        BUG();
1980        return NULL;
1981}
1982
1983static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1984{
1985        struct vmcb_seg *s = svm_seg(vcpu, seg);
1986
1987        return s->base;
1988}
1989
1990static void svm_get_segment(struct kvm_vcpu *vcpu,
1991                            struct kvm_segment *var, int seg)
1992{
1993        struct vmcb_seg *s = svm_seg(vcpu, seg);
1994
1995        var->base = s->base;
1996        var->limit = s->limit;
1997        var->selector = s->selector;
1998        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1999        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
2000        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
2001        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
2002        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
2003        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
2004        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
2005
2006        /*
2007         * AMD CPUs circa 2014 track the G bit for all segments except CS.
2008         * However, the SVM spec states that the G bit is not observed by the
2009         * CPU, and some VMware virtual CPUs drop the G bit for all segments.
2010         * So let's synthesize a legal G bit for all segments, this helps
2011         * running KVM nested. It also helps cross-vendor migration, because
2012         * Intel's vmentry has a check on the 'G' bit.
2013         */
2014        var->g = s->limit > 0xfffff;
2015
2016        /*
2017         * AMD's VMCB does not have an explicit unusable field, so emulate it
2018         * for cross-vendor migration purposes by reporting "not present".
2019         */
2020        var->unusable = !var->present || (var->type == 0);
2021
2022        switch (seg) {
2023        case VCPU_SREG_TR:
2024                /*
2025                 * Work around a bug where the busy flag in the tr selector
2026                 * isn't exposed
2027                 */
2028                var->type |= 0x2;
2029                break;
2030        case VCPU_SREG_DS:
2031        case VCPU_SREG_ES:
2032        case VCPU_SREG_FS:
2033        case VCPU_SREG_GS:
2034                /*
2035                 * The accessed bit must always be set in the segment
2036                 * descriptor cache; although it can be cleared in the
2037                 * descriptor itself, the cached bit always remains 1.
2038                 * Since Intel has a check on this, set it here to support
2039                 * cross-vendor migration.
2040                 */
2041                if (!var->unusable)
2042                        var->type |= 0x1;
2043                break;
2044        case VCPU_SREG_SS:
2045                /*
2046                 * On AMD CPUs sometimes the DB bit in the segment
2047                 * descriptor is left as 1, although the whole segment has
2048                 * been made unusable. Clear it here to pass an Intel VMX
2049                 * entry check when cross vendor migrating.
2050                 */
2051                if (var->unusable)
2052                        var->db = 0;
2053                var->dpl = to_svm(vcpu)->vmcb->save.cpl;
2054                break;
2055        }
2056}
2057
2058static int svm_get_cpl(struct kvm_vcpu *vcpu)
2059{
2060        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
2061
2062        return save->cpl;
2063}
2064
2065static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2066{
2067        struct vcpu_svm *svm = to_svm(vcpu);
2068
2069        dt->size = svm->vmcb->save.idtr.limit;
2070        dt->address = svm->vmcb->save.idtr.base;
2071}
2072
2073static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2074{
2075        struct vcpu_svm *svm = to_svm(vcpu);
2076
2077        svm->vmcb->save.idtr.limit = dt->size;
2078        svm->vmcb->save.idtr.base = dt->address;
2079        mark_dirty(svm->vmcb, VMCB_DT);
2080}
2081
2082static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2083{
2084        struct vcpu_svm *svm = to_svm(vcpu);
2085
2086        dt->size = svm->vmcb->save.gdtr.limit;
2087        dt->address = svm->vmcb->save.gdtr.base;
2088}
2089
2090static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2091{
2092        struct vcpu_svm *svm = to_svm(vcpu);
2093
2094        svm->vmcb->save.gdtr.limit = dt->size;
2095        svm->vmcb->save.gdtr.base = dt->address;
2096        mark_dirty(svm->vmcb, VMCB_DT);
2097}
2098
2099static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
2100{
2101}
2102
2103static void svm_decache_cr3(struct kvm_vcpu *vcpu)
2104{
2105}
2106
2107static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
2108{
2109}
2110
2111static void update_cr0_intercept(struct vcpu_svm *svm)
2112{
2113        ulong gcr0 = svm->vcpu.arch.cr0;
2114        u64 *hcr0 = &svm->vmcb->save.cr0;
2115
2116        if (!svm->vcpu.fpu_active)
2117                *hcr0 |= SVM_CR0_SELECTIVE_MASK;
2118        else
2119                *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
2120                        | (gcr0 & SVM_CR0_SELECTIVE_MASK);
2121
2122        mark_dirty(svm->vmcb, VMCB_CR);
2123
2124        if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
2125                clr_cr_intercept(svm, INTERCEPT_CR0_READ);
2126                clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
2127        } else {
2128                set_cr_intercept(svm, INTERCEPT_CR0_READ);
2129                set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
2130        }
2131}
2132
2133static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
2134{
2135        struct vcpu_svm *svm = to_svm(vcpu);
2136
2137#ifdef CONFIG_X86_64
2138        if (vcpu->arch.efer & EFER_LME) {
2139                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
2140                        vcpu->arch.efer |= EFER_LMA;
2141                        svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
2142                }
2143
2144                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
2145                        vcpu->arch.efer &= ~EFER_LMA;
2146                        svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
2147                }
2148        }
2149#endif
2150        vcpu->arch.cr0 = cr0;
2151
2152        if (!npt_enabled)
2153                cr0 |= X86_CR0_PG | X86_CR0_WP;
2154
2155        if (!vcpu->fpu_active)
2156                cr0 |= X86_CR0_TS;
2157        /*
2158         * Re-enable caching here because the QEMU BIOS does not
2159         * do it - leaving caching disabled results in some delay
2160         * at reboot.
2161         */
2162        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
2163                cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
2164        svm->vmcb->save.cr0 = cr0;
2165        mark_dirty(svm->vmcb, VMCB_CR);
2166        update_cr0_intercept(svm);
2167}
2168
2169static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2170{
2171        unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
2172        unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
2173
2174        if (cr4 & X86_CR4_VMXE)
2175                return 1;
2176
2177        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
2178                svm_flush_tlb(vcpu);
2179
2180        vcpu->arch.cr4 = cr4;
2181        if (!npt_enabled)
2182                cr4 |= X86_CR4_PAE;
2183        cr4 |= host_cr4_mce;
2184        to_svm(vcpu)->vmcb->save.cr4 = cr4;
2185        mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
2186        return 0;
2187}
2188
2189static void svm_set_segment(struct kvm_vcpu *vcpu,
2190                            struct kvm_segment *var, int seg)
2191{
2192        struct vcpu_svm *svm = to_svm(vcpu);
2193        struct vmcb_seg *s = svm_seg(vcpu, seg);
2194
2195        s->base = var->base;
2196        s->limit = var->limit;
2197        s->selector = var->selector;
2198        if (var->unusable)
2199                s->attrib = 0;
2200        else {
2201                s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
2202                s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
2203                s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
2204                s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
2205                s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
2206                s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
2207                s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
2208                s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
2209        }
2210
2211        /*
2212         * This is always accurate, except if SYSRET returned to a segment
2213         * with SS.DPL != 3.  Intel does not have this quirk, and always
2214         * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
2215         * would entail passing the CPL to userspace and back.
2216         */
2217        if (seg == VCPU_SREG_SS)
2218                svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
2219
2220        mark_dirty(svm->vmcb, VMCB_SEG);
2221}
2222
2223static void update_bp_intercept(struct kvm_vcpu *vcpu)
2224{
2225        struct vcpu_svm *svm = to_svm(vcpu);
2226
2227        clr_exception_intercept(svm, BP_VECTOR);
2228
2229        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
2230                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2231                        set_exception_intercept(svm, BP_VECTOR);
2232        } else
2233                vcpu->guest_debug = 0;
2234}
2235
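    /*
     * Assign a fresh ASID to this vCPU.  When the per-CPU ASID space is
     * exhausted, bump the generation and request a full TLB flush so that
     * stale translations from recycled ASIDs cannot be reused.
     */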
2236static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
2237{
2238        if (sd->next_asid > sd->max_asid) {
2239                ++sd->asid_generation;
2240                sd->next_asid = 1;
2241                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
2242        }
2243
2244        svm->asid_generation = sd->asid_generation;
2245        svm->vmcb->control.asid = sd->next_asid++;
2246
2247        mark_dirty(svm->vmcb, VMCB_ASID);
2248}
2249
2250static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
2251{
2252        return to_svm(vcpu)->vmcb->save.dr6;
2253}
2254
2255static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
2256{
2257        struct vcpu_svm *svm = to_svm(vcpu);
2258
2259        svm->vmcb->save.dr6 = value;
2260        mark_dirty(svm->vmcb, VMCB_DR);
2261}
2262
2263static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
2264{
2265        struct vcpu_svm *svm = to_svm(vcpu);
2266
2267        get_debugreg(vcpu->arch.db[0], 0);
2268        get_debugreg(vcpu->arch.db[1], 1);
2269        get_debugreg(vcpu->arch.db[2], 2);
2270        get_debugreg(vcpu->arch.db[3], 3);
2271        vcpu->arch.dr6 = svm_get_dr6(vcpu);
2272        vcpu->arch.dr7 = svm->vmcb->save.dr7;
2273
2274        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2275        set_dr_intercepts(svm);
2276}
2277
2278static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
2279{
2280        struct vcpu_svm *svm = to_svm(vcpu);
2281
2282        svm->vmcb->save.dr7 = value;
2283        mark_dirty(svm->vmcb, VMCB_DR);
2284}
2285
2286static int pf_interception(struct vcpu_svm *svm)
2287{
2288        u64 fault_address = svm->vmcb->control.exit_info_2;
2289        u32 error_code;
2290        int r = 1;
2291
2292        switch (svm->apf_reason) {
2293        default:
2294                error_code = svm->vmcb->control.exit_info_1;
2295
2296                trace_kvm_page_fault(fault_address, error_code);
2297                if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
2298                        kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
2299                r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
2300                        svm->vmcb->control.insn_bytes,
2301                        svm->vmcb->control.insn_len);
2302                break;
2303        case KVM_PV_REASON_PAGE_NOT_PRESENT:
2304                svm->apf_reason = 0;
2305                local_irq_disable();
2306                kvm_async_pf_task_wait(fault_address, true);
2307                local_irq_enable();
2308                break;
2309        case KVM_PV_REASON_PAGE_READY:
2310                svm->apf_reason = 0;
2311                local_irq_disable();
2312                kvm_async_pf_task_wake(fault_address);
2313                local_irq_enable();
2314                break;
2315        }
2316        return r;
2317}
2318
2319static int db_interception(struct vcpu_svm *svm)
2320{
2321        struct kvm_run *kvm_run = svm->vcpu.run;
2322
2323        if (!(svm->vcpu.guest_debug &
2324              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2325                !svm->nmi_singlestep) {
2326                kvm_queue_exception(&svm->vcpu, DB_VECTOR);
2327                return 1;
2328        }
2329
2330        if (svm->nmi_singlestep) {
2331                disable_nmi_singlestep(svm);
2332        }
2333
2334        if (svm->vcpu.guest_debug &
2335            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2336                kvm_run->exit_reason = KVM_EXIT_DEBUG;
2337                kvm_run->debug.arch.pc =
2338                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2339                kvm_run->debug.arch.exception = DB_VECTOR;
2340                return 0;
2341        }
2342
2343        return 1;
2344}
2345
2346static int bp_interception(struct vcpu_svm *svm)
2347{
2348        struct kvm_run *kvm_run = svm->vcpu.run;
2349
2350        kvm_run->exit_reason = KVM_EXIT_DEBUG;
2351        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2352        kvm_run->debug.arch.exception = BP_VECTOR;
2353        return 0;
2354}
2355
2356static int ud_interception(struct vcpu_svm *svm)
2357{
2358        int er;
2359
2360        er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
2361        if (er != EMULATE_DONE)
2362                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2363        return 1;
2364}
2365
2366static int ac_interception(struct vcpu_svm *svm)
2367{
2368        kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
2369        return 1;
2370}
2371
2372static void svm_fpu_activate(struct kvm_vcpu *vcpu)
2373{
2374        struct vcpu_svm *svm = to_svm(vcpu);
2375
2376        clr_exception_intercept(svm, NM_VECTOR);
2377
2378        svm->vcpu.fpu_active = 1;
2379        update_cr0_intercept(svm);
2380}
2381
2382static int nm_interception(struct vcpu_svm *svm)
2383{
2384        svm_fpu_activate(&svm->vcpu);
2385        return 1;
2386}
2387
2388static bool is_erratum_383(void)
2389{
2390        int err, i;
2391        u64 value;
2392
2393        if (!erratum_383_found)
2394                return false;
2395
2396        value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2397        if (err)
2398                return false;
2399
2400        /* Bit 62 may or may not be set for this mce */
2401        value &= ~(1ULL << 62);
2402
2403        if (value != 0xb600000000010015ULL)
2404                return false;
2405
2406        /* Clear MCi_STATUS registers */
2407        for (i = 0; i < 6; ++i)
2408                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2409
2410        value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2411        if (!err) {
2412                u32 low, high;
2413
2414                value &= ~(1ULL << 2);
2415                low    = lower_32_bits(value);
2416                high   = upper_32_bits(value);
2417
2418                native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2419        }
2420
2421        /* Flush tlb to evict multi-match entries */
2422        __flush_tlb_all();
2423
2424        return true;
2425}
2426
2427static void svm_handle_mce(struct vcpu_svm *svm)
2428{
2429        if (is_erratum_383()) {
2430                /*
2431                 * Erratum 383 triggered. Guest state is corrupt so kill the
2432                 * guest.
2433                 */
2434                pr_err("KVM: Guest triggered AMD Erratum 383\n");
2435
2436                kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
2437
2438                return;
2439        }
2440
2441        /*
2442         * On an #MC intercept the MCE handler is not called automatically in
2443         * the host. So do it by hand here.
2444         */
2445        asm volatile (
2446                "int $0x12\n");
2447        /* not sure if we ever come back to this point */
2448
2449        return;
2450}
2451
2452static int mc_interception(struct vcpu_svm *svm)
2453{
2454        return 1;
2455}
2456
2457static int shutdown_interception(struct vcpu_svm *svm)
2458{
2459        struct kvm_run *kvm_run = svm->vcpu.run;
2460
2461        /*
2462         * VMCB is undefined after a SHUTDOWN intercept
2463         * so reinitialize it.
2464         */
2465        clear_page(svm->vmcb);
2466        init_vmcb(svm);
2467
2468        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2469        return 0;
2470}
2471
2472static int io_interception(struct vcpu_svm *svm)
2473{
2474        struct kvm_vcpu *vcpu = &svm->vcpu;
2475        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2476        int size, in, string;
2477        unsigned port;
2478
2479        ++svm->vcpu.stat.io_exits;
2480        string = (io_info & SVM_IOIO_STR_MASK) != 0;
2481        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2482        if (string || in)
2483                return emulate_instruction(vcpu, 0) == EMULATE_DONE;
2484
2485        port = io_info >> 16;
2486        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2487        svm->next_rip = svm->vmcb->control.exit_info_2;
2488        skip_emulated_instruction(&svm->vcpu);
2489
2490        return kvm_fast_pio_out(vcpu, size, port);
2491}
2492
2493static int nmi_interception(struct vcpu_svm *svm)
2494{
2495        return 1;
2496}
2497
2498static int intr_interception(struct vcpu_svm *svm)
2499{
2500        ++svm->vcpu.stat.irq_exits;
2501        return 1;
2502}
2503
2504static int nop_on_interception(struct vcpu_svm *svm)
2505{
2506        return 1;
2507}
2508
2509static int halt_interception(struct vcpu_svm *svm)
2510{
2511        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
2512        return kvm_emulate_halt(&svm->vcpu);
2513}
2514
2515static int vmmcall_interception(struct vcpu_svm *svm)
2516{
2517        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2518        kvm_emulate_hypercall(&svm->vcpu);
2519        return 1;
2520}
2521
2522static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
2523{
2524        struct vcpu_svm *svm = to_svm(vcpu);
2525
2526        return svm->nested.nested_cr3;
2527}
2528
2529static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
2530{
2531        struct vcpu_svm *svm = to_svm(vcpu);
2532        u64 cr3 = svm->nested.nested_cr3;
2533        u64 pdpte;
2534        int ret;
2535
2536        ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
2537                                       offset_in_page(cr3) + index * 8, 8);
2538        if (ret)
2539                return 0;
2540        return pdpte;
2541}
2542
2543static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
2544                                   unsigned long root)
2545{
2546        struct vcpu_svm *svm = to_svm(vcpu);
2547
2548        svm->vmcb->control.nested_cr3 = __sme_set(root);
2549        mark_dirty(svm->vmcb, VMCB_NPT);
2550        svm_flush_tlb(vcpu);
2551}
2552
2553static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2554                                       struct x86_exception *fault)
2555{
2556        struct vcpu_svm *svm = to_svm(vcpu);
2557
2558        if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
2559                /*
2560                 * TODO: track the cause of the nested page fault, and
2561                 * correctly fill in the high bits of exit_info_1.
2562                 */
2563                svm->vmcb->control.exit_code = SVM_EXIT_NPF;
2564                svm->vmcb->control.exit_code_hi = 0;
2565                svm->vmcb->control.exit_info_1 = (1ULL << 32);
2566                svm->vmcb->control.exit_info_2 = fault->address;
2567        }
2568
2569        svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
2570        svm->vmcb->control.exit_info_1 |= fault->error_code;
2571
2572        /*
2573         * The present bit is always zero for page structure faults on real
2574         * hardware.
2575         */
2576        if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
2577                svm->vmcb->control.exit_info_1 &= ~1;
2578
2579        nested_svm_vmexit(svm);
2580}
2581
2582static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2583{
2584        WARN_ON(mmu_is_nested(vcpu));
2585        kvm_init_shadow_mmu(vcpu);
2586        vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
2587        vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
2588        vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
2589        vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
2590        vcpu->arch.mmu.shadow_root_level = get_npt_level();
2591        reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
2592        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
2593}
2594
2595static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2596{
2597        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
2598}
2599
2600static int nested_svm_check_permissions(struct vcpu_svm *svm)
2601{
2602        if (!(svm->vcpu.arch.efer & EFER_SVME)
2603            || !is_paging(&svm->vcpu)) {
2604                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2605                return 1;
2606        }
2607
2608        if (svm->vmcb->save.cpl) {
2609                kvm_inject_gp(&svm->vcpu, 0);
2610                return 1;
2611        }
2612
2613        return 0;
2614}
2615
2616static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2617                                      bool has_error_code, u32 error_code)
2618{
2619        int vmexit;
2620
2621        if (!is_guest_mode(&svm->vcpu))
2622                return 0;
2623
2624        svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
2625        svm->vmcb->control.exit_code_hi = 0;
2626        svm->vmcb->control.exit_info_1 = error_code;
2627        svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
2628
2629        vmexit = nested_svm_intercept(svm);
2630        if (vmexit == NESTED_EXIT_DONE)
2631                svm->nested.exit_required = true;
2632
2633        return vmexit;
2634}
2635
2636/* This function returns true if it is safe to enable the irq window */
2637static inline bool nested_svm_intr(struct vcpu_svm *svm)
2638{
2639        if (!is_guest_mode(&svm->vcpu))
2640                return true;
2641
2642        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2643                return true;
2644
2645        if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
2646                return false;
2647
2648        /*
2649         * If a vmexit was already requested (by an intercepted exception,
2650         * for instance), do not overwrite it with an "external interrupt"
2651         * vmexit.
2652         */
2653        if (svm->nested.exit_required)
2654                return false;
2655
2656        svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2657        svm->vmcb->control.exit_info_1 = 0;
2658        svm->vmcb->control.exit_info_2 = 0;
2659
2660        if (svm->nested.intercept & 1ULL) {
2661                /*
2662                 * The #vmexit can't be emulated here directly because this
2663                 * code path runs with irqs and preemption disabled. A
2664                 * #vmexit emulation might sleep. Only signal request for
2665                 * the #vmexit here.
2666                 */
2667                svm->nested.exit_required = true;
2668                trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2669                return false;
2670        }
2671
2672        return true;
2673}
2674
2675/* This function returns true if it is safe to enable the nmi window */
2676static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2677{
2678        if (!is_guest_mode(&svm->vcpu))
2679                return true;
2680
2681        if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2682                return true;
2683
2684        svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2685        svm->nested.exit_required = true;
2686
2687        return false;
2688}
2689
2690static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2691{
2692        struct page *page;
2693
2694        might_sleep();
2695
2696        page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
2697        if (is_error_page(page))
2698                goto error;
2699
2700        *_page = page;
2701
2702        return kmap(page);
2703
2704error:
2705        kvm_inject_gp(&svm->vcpu, 0);
2706
2707        return NULL;
2708}
2709
2710static void nested_svm_unmap(struct page *page)
2711{
2712        kunmap(page);
2713        kvm_release_page_dirty(page);
2714}
2715
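    /*
     * Check the nested guest's I/O permission bitmap to decide whether an
     * intercepted IOIO exit must be reflected to the L1 hypervisor.
     */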
2716static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2717{
2718        unsigned port, size, iopm_len;
2719        u16 val, mask;
2720        u8 start_bit;
2721        u64 gpa;
2722
2723        if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2724                return NESTED_EXIT_HOST;
2725
2726        port = svm->vmcb->control.exit_info_1 >> 16;
2727        size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
2728                SVM_IOIO_SIZE_SHIFT;
2729        gpa  = svm->nested.vmcb_iopm + (port / 8);
2730        start_bit = port % 8;
2731        iopm_len = (start_bit + size > 8) ? 2 : 1;
2732        mask = (0xf >> (4 - size)) << start_bit;
2733        val = 0;
2734
2735        if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
2736                return NESTED_EXIT_DONE;
2737
2738        return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2739}
2740
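    /*
     * Check the nested guest's MSR permission bitmap to decide whether an
     * intercepted MSR access must be reflected to the L1 hypervisor.
     */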
2741static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2742{
2743        u32 offset, msr, value;
2744        int write, mask;
2745
2746        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2747                return NESTED_EXIT_HOST;
2748
2749        msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2750        offset = svm_msrpm_offset(msr);
2751        write  = svm->vmcb->control.exit_info_1 & 1;
2752        mask   = 1 << ((2 * (msr & 0xf)) + write);
2753
2754        if (offset == MSR_INVALID)
2755                return NESTED_EXIT_DONE;
2756
2757        /* Offset is in 32-bit units but we need it in 8-bit units */
2758        offset *= 4;
2759
2760        if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
2761                return NESTED_EXIT_DONE;
2762
2763        return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2764}
2765
2766/* DB exceptions for our internal use must not cause vmexit */
2767static int nested_svm_intercept_db(struct vcpu_svm *svm)
2768{
2769        unsigned long dr6;
2770
2771        /* if we're not singlestepping, it's not ours */
2772        if (!svm->nmi_singlestep)
2773                return NESTED_EXIT_DONE;
2774
2775        /* if it's not a singlestep exception, it's not ours */
2776        if (kvm_get_dr(&svm->vcpu, 6, &dr6))
2777                return NESTED_EXIT_DONE;
2778        if (!(dr6 & DR6_BS))
2779                return NESTED_EXIT_DONE;
2780
2781        /* if the guest is singlestepping, it should get the vmexit */
2782        if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
2783                disable_nmi_singlestep(svm);
2784                return NESTED_EXIT_DONE;
2785        }
2786
2787        /* it's ours, the nested hypervisor must not see this one */
2788        return NESTED_EXIT_HOST;
2789}
2790
2791static int nested_svm_exit_special(struct vcpu_svm *svm)
2792{
2793        u32 exit_code = svm->vmcb->control.exit_code;
2794
2795        switch (exit_code) {
2796        case SVM_EXIT_INTR:
2797        case SVM_EXIT_NMI:
2798        case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2799                return NESTED_EXIT_HOST;
2800        case SVM_EXIT_NPF:
2801                /* For now we are always handling NPFs when using them */
2802                if (npt_enabled)
2803                        return NESTED_EXIT_HOST;
2804                break;
2805        case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2806                /* When we're shadowing, trap PFs, but not async PF */
2807                if (!npt_enabled && svm->apf_reason == 0)
2808                        return NESTED_EXIT_HOST;
2809                break;
2810        case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2811                nm_interception(svm);
2812                break;
2813        default:
2814                break;
2815        }
2816
2817        return NESTED_EXIT_CONTINUE;
2818}
2819
2820/*
2821 * Returns NESTED_EXIT_DONE if the nested hypervisor intercepts this #vmexit.
2822 */
2823static int nested_svm_intercept(struct vcpu_svm *svm)
2824{
2825        u32 exit_code = svm->vmcb->control.exit_code;
2826        int vmexit = NESTED_EXIT_HOST;
2827
2828        switch (exit_code) {
2829        case SVM_EXIT_MSR:
2830                vmexit = nested_svm_exit_handled_msr(svm);
2831                break;
2832        case SVM_EXIT_IOIO:
2833                vmexit = nested_svm_intercept_ioio(svm);
2834                break;
2835        case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2836                u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2837                if (svm->nested.intercept_cr & bit)
2838                        vmexit = NESTED_EXIT_DONE;
2839                break;
2840        }
2841        case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2842                u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2843                if (svm->nested.intercept_dr & bit)
2844                        vmexit = NESTED_EXIT_DONE;
2845                break;
2846        }
2847        case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2848                u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2849                if (svm->nested.intercept_exceptions & excp_bits) {
2850                        if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
2851                                vmexit = nested_svm_intercept_db(svm);
2852                        else
2853                                vmexit = NESTED_EXIT_DONE;
2854                }
2855                /* an async page fault always causes a vmexit */
2856                else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2857                         svm->apf_reason != 0)
2858                        vmexit = NESTED_EXIT_DONE;
2859                break;
2860        }
2861        case SVM_EXIT_ERR: {
2862                vmexit = NESTED_EXIT_DONE;
2863                break;
2864        }
2865        default: {
2866                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2867                if (svm->nested.intercept & exit_bits)
2868                        vmexit = NESTED_EXIT_DONE;
2869        }
2870        }
2871
2872        return vmexit;
2873}
2874
2875static int nested_svm_exit_handled(struct vcpu_svm *svm)
2876{
2877        int vmexit;
2878
2879        vmexit = nested_svm_intercept(svm);
2880
2881        if (vmexit == NESTED_EXIT_DONE)
2882                nested_svm_vmexit(svm);
2883
2884        return vmexit;
2885}
2886
2887static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2888{
2889        struct vmcb_control_area *dst  = &dst_vmcb->control;
2890        struct vmcb_control_area *from = &from_vmcb->control;
2891
2892        dst->intercept_cr         = from->intercept_cr;
2893        dst->intercept_dr         = from->intercept_dr;
2894        dst->intercept_exceptions = from->intercept_exceptions;
2895        dst->intercept            = from->intercept;
2896        dst->iopm_base_pa         = from->iopm_base_pa;
2897        dst->msrpm_base_pa        = from->msrpm_base_pa;
2898        dst->tsc_offset           = from->tsc_offset;
2899        dst->asid                 = from->asid;
2900        dst->tlb_ctl              = from->tlb_ctl;
2901        dst->int_ctl              = from->int_ctl;
2902        dst->int_vector           = from->int_vector;
2903        dst->int_state            = from->int_state;
2904        dst->exit_code            = from->exit_code;
2905        dst->exit_code_hi         = from->exit_code_hi;
2906        dst->exit_info_1          = from->exit_info_1;
2907        dst->exit_info_2          = from->exit_info_2;
2908        dst->exit_int_info        = from->exit_int_info;
2909        dst->exit_int_info_err    = from->exit_int_info_err;
2910        dst->nested_ctl           = from->nested_ctl;
2911        dst->event_inj            = from->event_inj;
2912        dst->event_inj_err        = from->event_inj_err;
2913        dst->nested_cr3           = from->nested_cr3;
2914        dst->virt_ext             = from->virt_ext;
2915}
2916
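    /*
     * Emulate a #VMEXIT from L2 to L1: copy the current guest state into the
     * nested VMCB, restore the host state saved in hsave and leave guest
     * mode.
     */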
2917static int nested_svm_vmexit(struct vcpu_svm *svm)
2918{
2919        struct vmcb *nested_vmcb;
2920        struct vmcb *hsave = svm->nested.hsave;
2921        struct vmcb *vmcb = svm->vmcb;
2922        struct page *page;
2923
2924        trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2925                                       vmcb->control.exit_info_1,
2926                                       vmcb->control.exit_info_2,
2927                                       vmcb->control.exit_int_info,
2928                                       vmcb->control.exit_int_info_err,
2929                                       KVM_ISA_SVM);
2930
2931        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2932        if (!nested_vmcb)
2933                return 1;
2934
2935        /*
2936         * No need for IBPB here, the L1 hypervisor should be running with
2937         * IBRS=1 and inserts one already when switching L2 VMs.
2938         */
2939
2940        /* Exit Guest-Mode */
2941        leave_guest_mode(&svm->vcpu);
2942        svm->nested.vmcb = 0;
2943
2944        /* Give the current vmcb to the guest */
2945        disable_gif(svm);
2946
2947        nested_vmcb->save.es     = vmcb->save.es;
2948        nested_vmcb->save.cs     = vmcb->save.cs;
2949        nested_vmcb->save.ss     = vmcb->save.ss;
2950        nested_vmcb->save.ds     = vmcb->save.ds;
2951        nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2952        nested_vmcb->save.idtr   = vmcb->save.idtr;
2953        nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2954        nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2955        nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2956        nested_vmcb->save.cr2    = vmcb->save.cr2;
2957        nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2958        nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2959        nested_vmcb->save.rip    = vmcb->save.rip;
2960        nested_vmcb->save.rsp    = vmcb->save.rsp;
2961        nested_vmcb->save.rax    = vmcb->save.rax;
2962        nested_vmcb->save.dr7    = vmcb->save.dr7;
2963        nested_vmcb->save.dr6    = vmcb->save.dr6;
2964        nested_vmcb->save.cpl    = vmcb->save.cpl;
2965
2966        nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2967        nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2968        nested_vmcb->control.int_state         = vmcb->control.int_state;
2969        nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2970        nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2971        nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2972        nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2973        nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2974        nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2975
2976        if (svm->nrips_enabled)
2977                nested_vmcb->control.next_rip  = vmcb->control.next_rip;
2978
2979        /*
2980         * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2981         * to make sure that we do not lose injected events. So check event_inj
2982         * here and copy it to exit_int_info if it is valid.
2983         * Exit_int_info and event_inj can't both be valid because the case
2984         * below only happens on a VMRUN instruction intercept which has
2985         * no valid exit_int_info set.
2986         */
2987        if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2988                struct vmcb_control_area *nc = &nested_vmcb->control;
2989
2990                nc->exit_int_info     = vmcb->control.event_inj;
2991                nc->exit_int_info_err = vmcb->control.event_inj_err;
2992        }
2993
2994        nested_vmcb->control.tlb_ctl           = 0;
2995        nested_vmcb->control.event_inj         = 0;
2996        nested_vmcb->control.event_inj_err     = 0;
2997
2998        /* We always set V_INTR_MASKING and remember the old value in hflags */
2999        if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
3000                nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
3001
3002        /* Restore the original control entries */
3003        copy_vmcb_control_area(vmcb, hsave);
3004
3005        kvm_clear_exception_queue(&svm->vcpu);
3006        kvm_clear_interrupt_queue(&svm->vcpu);
3007
3008        svm->nested.nested_cr3 = 0;
3009
3010        /* Restore selected save entries */
3011        svm->vmcb->save.es = hsave->save.es;
3012        svm->vmcb->save.cs = hsave->save.cs;
3013        svm->vmcb->save.ss = hsave->save.ss;
3014        svm->vmcb->save.ds = hsave->save.ds;
3015        svm->vmcb->save.gdtr = hsave->save.gdtr;
3016        svm->vmcb->save.idtr = hsave->save.idtr;
3017        kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
3018        svm_set_efer(&svm->vcpu, hsave->save.efer);
3019        svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
3020        svm_set_cr4(&svm->vcpu, hsave->save.cr4);
3021        if (npt_enabled) {
3022                svm->vmcb->save.cr3 = hsave->save.cr3;
3023                svm->vcpu.arch.cr3 = hsave->save.cr3;
3024        } else {
3025                (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
3026        }
3027        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
3028        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
3029        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
3030        svm->vmcb->save.dr7 = 0;
3031        svm->vmcb->save.cpl = 0;
3032        svm->vmcb->control.exit_int_info = 0;
3033
3034        mark_all_dirty(svm->vmcb);
3035
3036        nested_svm_unmap(page);
3037
3038        nested_svm_uninit_mmu_context(&svm->vcpu);
3039        kvm_mmu_reset_context(&svm->vcpu);
3040        kvm_mmu_load(&svm->vcpu);
3041
3042        return 0;
3043}
3044
3045static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
3046{
3047        /*
3048         * This function merges the msr permission bitmaps of kvm and the
3049         * nested vmcb. It is optimized in that it only merges the parts where
3050         * the kvm msr permission bitmap may contain zero bits
3051         */
3052        int i;
3053
3054        if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
3055                return true;
3056
3057        for (i = 0; i < MSRPM_OFFSETS; i++) {
3058                u32 value, p;
3059                u64 offset;
3060
3061                if (msrpm_offsets[i] == 0xffffffff)
3062                        break;
3063
3064                p      = msrpm_offsets[i];
3065                offset = svm->nested.vmcb_msrpm + (p * 4);
3066
3067                if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
3068                        return false;
3069
3070                svm->nested.msrpm[p] = svm->msrpm[p] | value;
3071        }
3072
3073        svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
3074
3075        return true;
3076}
3077
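    /*
     * Minimal sanity checks on a guest-provided VMCB before VMRUN is
     * emulated: VMRUN must be intercepted, the ASID must be non-zero, and
     * nested paging may only be requested when NPT is enabled on the host.
     */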
3078static bool nested_vmcb_checks(struct vmcb *vmcb)
3079{
3080        if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
3081                return false;
3082
3083        if (vmcb->control.asid == 0)
3084                return false;
3085
3086        if (vmcb->control.nested_ctl && !npt_enabled)
3087                return false;
3088
3089        return true;
3090}
3091
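    /*
     * Switch the vCPU into nested guest mode: load the L2 state from the
     * nested VMCB, cache its intercepts, merge them with the host intercepts
     * and finally set GIF so the nested guest can run.
     */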
3092static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
3093                                 struct vmcb *nested_vmcb, struct page *page)
3094{
3095
3096        /*
3097         * No need for IBPB here, since the nested VM is less privileged.  The
3098         * L1 hypervisor inserts one already when switching L2 VMs.
3099         */
3100        if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
3101                svm->vcpu.arch.hflags |= HF_HIF_MASK;
3102        else
3103                svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
3104
3105        if (nested_vmcb->control.nested_ctl) {
3106                kvm_mmu_unload(&svm->vcpu);
3107                svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
3108                nested_svm_init_mmu_context(&svm->vcpu);
3109        }
3110
3111        /* Load the nested guest state */
3112        svm->vmcb->save.es = nested_vmcb->save.es;
3113        svm->vmcb->save.cs = nested_vmcb->save.cs;
3114        svm->vmcb->save.ss = nested_vmcb->save.ss;
3115        svm->vmcb->save.ds = nested_vmcb->save.ds;
3116        svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
3117        svm->vmcb->save.idtr = nested_vmcb->save.idtr;
3118        kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
3119        svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
3120        svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
3121        svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
3122        if (npt_enabled) {
3123                svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
3124                svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
3125        } else
3126                (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
3127
3128        /* Guest paging mode is active - reset mmu */
3129        kvm_mmu_reset_context(&svm->vcpu);
3130
3131        svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
3132        kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
3133        kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
3134        kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
3135
3136        /* In case we don't even reach vcpu_run, the fields are not updated */
3137        svm->vmcb->save.rax = nested_vmcb->save.rax;
3138        svm->vmcb->save.rsp = nested_vmcb->save.rsp;
3139        svm->vmcb->save.rip = nested_vmcb->save.rip;
3140        svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
3141        svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
3142        svm->vmcb->save.cpl = nested_vmcb->save.cpl;
3143
3144        svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
3145        svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
3146
3147        /* cache intercepts */
3148        svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
3149        svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
3150        svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
3151        svm->nested.intercept            = nested_vmcb->control.intercept;
3152
3153        svm_flush_tlb(&svm->vcpu);
3154        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
3155        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
3156                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
3157        else
3158                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
3159
3160        if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
3161                /* We only want the cr8 intercept bits of the guest */
3162                clr_cr_intercept(svm, INTERCEPT_CR8_READ);
3163                clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3164        }
3165
3166        /* We don't want to see VMMCALLs from a nested guest */
3167        clr_intercept(svm, INTERCEPT_VMMCALL);
3168
3169        svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
3170        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
3171        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
3172        svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
3173        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
3174        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
3175
3176        nested_svm_unmap(page);
3177
3178        /* Enter Guest-Mode */
3179        enter_guest_mode(&svm->vcpu);
3180
3181        /*
3182         * Merge guest and host intercepts - must be called with the vcpu
3183         * in guest-mode to take effect here
3184         */
3185        recalc_intercepts(svm);
3186
3187        svm->nested.vmcb = vmcb_gpa;
3188
3189        enable_gif(svm);
3190
3191        mark_all_dirty(svm->vmcb);
3192}
3193
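/*
 * Emulate VMRUN for the L1 guest: map the nested VMCB whose physical
 * address is in RAX, sanity-check it, save the current (L1) state into
 * the host-save area and switch the vcpu to the nested guest state via
 * enter_svm_guest_mode().  Returns false if the nested VMCB cannot be
 * mapped or fails its consistency checks.
 */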
3194static bool nested_svm_vmrun(struct vcpu_svm *svm)
3195{
3196        struct vmcb *nested_vmcb;
3197        struct vmcb *hsave = svm->nested.hsave;
3198        struct vmcb *vmcb = svm->vmcb;
3199        struct page *page;
3200        u64 vmcb_gpa;
3201
3202        vmcb_gpa = svm->vmcb->save.rax;
3203
3204        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3205        if (!nested_vmcb)
3206                return false;
3207
3208        if (!nested_vmcb_checks(nested_vmcb)) {
3209                nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
3210                nested_vmcb->control.exit_code_hi = 0;
3211                nested_vmcb->control.exit_info_1  = 0;
3212                nested_vmcb->control.exit_info_2  = 0;
3213
3214                nested_svm_unmap(page);
3215
3216                return false;
3217        }
3218
3219        trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
3220                               nested_vmcb->save.rip,
3221                               nested_vmcb->control.int_ctl,
3222                               nested_vmcb->control.event_inj,
3223                               nested_vmcb->control.nested_ctl);
3224
3225        trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
3226                                    nested_vmcb->control.intercept_cr >> 16,
3227                                    nested_vmcb->control.intercept_exceptions,
3228                                    nested_vmcb->control.intercept);
3229
3230        /* Clear internal status */
3231        kvm_clear_exception_queue(&svm->vcpu);
3232        kvm_clear_interrupt_queue(&svm->vcpu);
3233
3234        /*
3235         * Save the host state from the current vmcb, so we don't need to
3236         * pick which fields to save; we can restore everything when a #VMEXIT occurs
3237         */
3238        hsave->save.es     = vmcb->save.es;
3239        hsave->save.cs     = vmcb->save.cs;
3240        hsave->save.ss     = vmcb->save.ss;
3241        hsave->save.ds     = vmcb->save.ds;
3242        hsave->save.gdtr   = vmcb->save.gdtr;
3243        hsave->save.idtr   = vmcb->save.idtr;
3244        hsave->save.efer   = svm->vcpu.arch.efer;
3245        hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
3246        hsave->save.cr4    = svm->vcpu.arch.cr4;
3247        hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
3248        hsave->save.rip    = kvm_rip_read(&svm->vcpu);
3249        hsave->save.rsp    = vmcb->save.rsp;
3250        hsave->save.rax    = vmcb->save.rax;
3251        if (npt_enabled)
3252                hsave->save.cr3    = vmcb->save.cr3;
3253        else
3254                hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
3255
3256        copy_vmcb_control_area(hsave, vmcb);
3257
3258        enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
3259
3260        return true;
3261}
3262
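/*
 * Copy the state handled by VMLOAD/VMSAVE: FS, GS, TR and LDTR plus
 * KERNEL_GS_BASE, STAR/LSTAR/CSTAR/SFMASK and the SYSENTER MSRs.  Used
 * by both the VMLOAD and VMSAVE intercepts below.
 */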
3263static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
3264{
3265        to_vmcb->save.fs = from_vmcb->save.fs;
3266        to_vmcb->save.gs = from_vmcb->save.gs;
3267        to_vmcb->save.tr = from_vmcb->save.tr;
3268        to_vmcb->save.ldtr = from_vmcb->save.ldtr;
3269        to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
3270        to_vmcb->save.star = from_vmcb->save.star;
3271        to_vmcb->save.lstar = from_vmcb->save.lstar;
3272        to_vmcb->save.cstar = from_vmcb->save.cstar;
3273        to_vmcb->save.sfmask = from_vmcb->save.sfmask;
3274        to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
3275        to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
3276        to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
3277}
3278
3279static int vmload_interception(struct vcpu_svm *svm)
3280{
3281        struct vmcb *nested_vmcb;
3282        struct page *page;
3283
3284        if (nested_svm_check_permissions(svm))
3285                return 1;
3286
3287        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3288        if (!nested_vmcb)
3289                return 1;
3290
3291        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3292        skip_emulated_instruction(&svm->vcpu);
3293
3294        nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
3295        nested_svm_unmap(page);
3296
3297        return 1;
3298}
3299
3300static int vmsave_interception(struct vcpu_svm *svm)
3301{
3302        struct vmcb *nested_vmcb;
3303        struct page *page;
3304
3305        if (nested_svm_check_permissions(svm))
3306                return 1;
3307
3308        nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3309        if (!nested_vmcb)
3310                return 1;
3311
3312        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3313        skip_emulated_instruction(&svm->vcpu);
3314
3315        nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
3316        nested_svm_unmap(page);
3317
3318        return 1;
3319}
3320
3321static int vmrun_interception(struct vcpu_svm *svm)
3322{
3323        if (nested_svm_check_permissions(svm))
3324                return 1;
3325
3326        /* Save rip after vmrun instruction */
3327        kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
3328
3329        if (!nested_svm_vmrun(svm))
3330                return 1;
3331
3332        if (!nested_svm_vmrun_msrpm(svm))
3333                goto failed;
3334
3335        return 1;
3336
3337failed:
3338
3339        svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
3340        svm->vmcb->control.exit_code_hi = 0;
3341        svm->vmcb->control.exit_info_1  = 0;
3342        svm->vmcb->control.exit_info_2  = 0;
3343
3344        nested_svm_vmexit(svm);
3345
3346        return 1;
3347}
3348
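/*
 * STGI and CLGI toggle the guest's Global Interrupt Flag.  While GIF
 * is clear no interrupts are delivered, so the CLGI handler also tears
 * down any pending virtual interrupt request.
 */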
3349static int stgi_interception(struct vcpu_svm *svm)
3350{
3351        if (nested_svm_check_permissions(svm))
3352                return 1;
3353
3354        /*
3355         * If VGIF is enabled, the STGI intercept is only added to
3356         * detect the opening of the NMI window; remove it now.
3357         */
3358        if (vgif_enabled(svm))
3359                clr_intercept(svm, INTERCEPT_STGI);
3360
3361        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3362        skip_emulated_instruction(&svm->vcpu);
3363        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3364
3365        enable_gif(svm);
3366
3367        return 1;
3368}
3369
3370static int clgi_interception(struct vcpu_svm *svm)
3371{
3372        if (nested_svm_check_permissions(svm))
3373                return 1;
3374
3375        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3376        skip_emulated_instruction(&svm->vcpu);
3377
3378        disable_gif(svm);
3379
3380        /* After a CLGI no interrupts should be delivered to the guest */
3381        if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
3382                svm_clear_vintr(svm);
3383                svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3384                mark_dirty(svm->vmcb, VMCB_INTR);
3385        }
3386
3387        return 1;
3388}
3389
3390static int invlpga_interception(struct vcpu_svm *svm)
3391{
3392        struct kvm_vcpu *vcpu = &svm->vcpu;
3393
3394        trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
3395                          kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3396
3397        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
3398        kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3399
3400        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3401        skip_emulated_instruction(&svm->vcpu);
3402        return 1;
3403}
3404
3405static int skinit_interception(struct vcpu_svm *svm)
3406{
3407        trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3408
3409        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3410        return 1;
3411}
3412
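/*
 * XSETBV is intercepted so the new XCR value can be validated by
 * kvm_set_xcr(); the instruction is only skipped when the write is
 * accepted.
 */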
3413static int xsetbv_interception(struct vcpu_svm *svm)
3414{
3415        u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
3416        u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3417
3418        if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
3419                svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3420                skip_emulated_instruction(&svm->vcpu);
3421        }
3422
3423        return 1;
3424}
3425
3426static int task_switch_interception(struct vcpu_svm *svm)
3427{
3428        u16 tss_selector;
3429        int reason;
3430        int int_type = svm->vmcb->control.exit_int_info &
3431                SVM_EXITINTINFO_TYPE_MASK;
3432        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
3433        uint32_t type =
3434                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
3435        uint32_t idt_v =
3436                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
3437        bool has_error_code = false;
3438        u32 error_code = 0;
3439
3440        tss_selector = (u16)svm->vmcb->control.exit_info_1;
3441
3442        if (svm->vmcb->control.exit_info_2 &
3443            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
3444                reason = TASK_SWITCH_IRET;
3445        else if (svm->vmcb->control.exit_info_2 &
3446                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
3447                reason = TASK_SWITCH_JMP;
3448        else if (idt_v)
3449                reason = TASK_SWITCH_GATE;
3450        else
3451                reason = TASK_SWITCH_CALL;
3452
3453        if (reason == TASK_SWITCH_GATE) {
3454                switch (type) {
3455                case SVM_EXITINTINFO_TYPE_NMI:
3456                        svm->vcpu.arch.nmi_injected = false;
3457                        break;
3458                case SVM_EXITINTINFO_TYPE_EXEPT:
3459                        if (svm->vmcb->control.exit_info_2 &
3460                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
3461                                has_error_code = true;
3462                                error_code =
3463                                        (u32)svm->vmcb->control.exit_info_2;
3464                        }
3465                        kvm_clear_exception_queue(&svm->vcpu);
3466                        break;
3467                case SVM_EXITINTINFO_TYPE_INTR:
3468                        kvm_clear_interrupt_queue(&svm->vcpu);
3469                        break;
3470                default:
3471                        break;
3472                }
3473        }
3474
3475        if (reason != TASK_SWITCH_GATE ||
3476            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
3477            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
3478             (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
3479                skip_emulated_instruction(&svm->vcpu);
3480
3481        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
3482                int_vec = -1;
3483
3484        if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
3485                                has_error_code, error_code) == EMULATE_FAIL) {
3486                svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3487                svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3488                svm->vcpu.run->internal.ndata = 0;
3489                return 0;
3490        }
3491        return 1;
3492}
3493
3494static int cpuid_interception(struct vcpu_svm *svm)
3495{
3496        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3497        kvm_emulate_cpuid(&svm->vcpu);
3498        return 1;
3499}
3500
3501static int iret_interception(struct vcpu_svm *svm)
3502{
3503        ++svm->vcpu.stat.nmi_window_exits;
3504        clr_intercept(svm, INTERCEPT_IRET);
3505        svm->vcpu.arch.hflags |= HF_IRET_MASK;
3506        svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
3507        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3508        return 1;
3509}
3510
3511static int invlpg_interception(struct vcpu_svm *svm)
3512{
3513        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3514                return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3515
3516        kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
3517        skip_emulated_instruction(&svm->vcpu);
3518        return 1;
3519}
3520
3521static int emulate_on_interception(struct vcpu_svm *svm)
3522{
3523        return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3524}
3525
3526static int rdpmc_interception(struct vcpu_svm *svm)
3527{
3528        int err;
3529
3530        if (!static_cpu_has(X86_FEATURE_NRIPS))
3531                return emulate_on_interception(svm);
3532
3533        err = kvm_rdpmc(&svm->vcpu);
3534        kvm_complete_insn_gp(&svm->vcpu, err);
3535
3536        return 1;
3537}
3538
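/*
 * For a nested guest with the selective CR0 write intercept enabled,
 * decide whether this CR0 write must be reflected to L1.  The
 * selective intercept only fires when bits outside
 * SVM_CR0_SELECTIVE_MASK change, so those bits are masked off before
 * comparing the old and new values.
 */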
3539static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
3540                                            unsigned long val)
3541{
3542        unsigned long cr0 = svm->vcpu.arch.cr0;
3543        bool ret = false;
3544        u64 intercept;
3545
3546        intercept = svm->nested.intercept;
3547
3548        if (!is_guest_mode(&svm->vcpu) ||
3549            (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
3550                return false;
3551
3552        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
3553        val &= ~SVM_CR0_SELECTIVE_MASK;
3554
3555        if (cr0 ^ val) {
3556                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
3557                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
3558        }
3559
3560        return ret;
3561}
3562
3563#define CR_VALID (1ULL << 63)
3564
3565static int cr_interception(struct vcpu_svm *svm)
3566{
3567        int reg, cr;
3568        unsigned long val;
3569        int err;
3570
3571        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3572                return emulate_on_interception(svm);
3573
3574        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
3575                return emulate_on_interception(svm);
3576
3577        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3578        cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
3579
3580        err = 0;
3581        if (cr >= 16) { /* mov to cr */
3582                cr -= 16;
3583                val = kvm_register_read(&svm->vcpu, reg);
3584                switch (cr) {
3585                case 0:
3586                        if (!check_selective_cr0_intercepted(svm, val))
3587                                err = kvm_set_cr0(&svm->vcpu, val);
3588                        else
3589                                return 1;
3590
3591                        break;
3592                case 3:
3593                        err = kvm_set_cr3(&svm->vcpu, val);
3594                        break;
3595                case 4:
3596                        err = kvm_set_cr4(&svm->vcpu, val);
3597                        break;
3598                case 8:
3599                        err = kvm_set_cr8(&svm->vcpu, val);
3600                        break;
3601                default:
3602                        WARN(1, "unhandled write to CR%d", cr);
3603                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3604                        return 1;
3605                }
3606        } else { /* mov from cr */
3607                switch (cr) {
3608                case 0:
3609                        val = kvm_read_cr0(&svm->vcpu);
3610                        break;
3611                case 2:
3612                        val = svm->vcpu.arch.cr2;
3613                        break;
3614                case 3:
3615                        val = kvm_read_cr3(&svm->vcpu);
3616                        break;
3617                case 4:
3618                        val = kvm_read_cr4(&svm->vcpu);
3619                        break;
3620                case 8:
3621                        val = kvm_get_cr8(&svm->vcpu);
3622                        break;
3623                default:
3624                        WARN(1, "unhandled read from CR%d", cr);
3625                        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3626                        return 1;
3627                }
3628                kvm_register_write(&svm->vcpu, reg, val);
3629        }
3630        kvm_complete_insn_gp(&svm->vcpu, err);
3631
3632        return 1;
3633}
3634
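/*
 * Debug register accesses.  Once userspace stops debugging the guest
 * the DR intercepts are dropped and the debug registers are reloaded
 * lazily (KVM_DEBUGREG_WONT_EXIT); otherwise the access is decoded
 * from exit_info_1 when decode assists are available.
 */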
3635static int dr_interception(struct vcpu_svm *svm)
3636{
3637        int reg, dr;
3638        unsigned long val;
3639
3640        if (svm->vcpu.guest_debug == 0) {
3641                /*
3642                 * No more DR vmexits; force a reload of the debug registers
3643                 * and reenter on this instruction.  The next vmexit will
3644                 * retrieve the full state of the debug registers.
3645                 */
3646                clr_dr_intercepts(svm);
3647                svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
3648                return 1;
3649        }
3650
3651        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
3652                return emulate_on_interception(svm);
3653
3654        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3655        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
3656
3657        if (dr >= 16) { /* mov to DRn */
3658                if (!kvm_require_dr(&svm->vcpu, dr - 16))
3659                        return 1;
3660                val = kvm_register_read(&svm->vcpu, reg);
3661                kvm_set_dr(&svm->vcpu, dr - 16, val);
3662        } else {
3663                if (!kvm_require_dr(&svm->vcpu, dr))
3664                        return 1;
3665                kvm_get_dr(&svm->vcpu, dr, &val);
3666                kvm_register_write(&svm->vcpu, reg, val);
3667        }
3668
3669        skip_emulated_instruction(&svm->vcpu);
3670
3671        return 1;
3672}
3673
3674static int cr8_write_interception(struct vcpu_svm *svm)
3675{
3676        struct kvm_run *kvm_run = svm->vcpu.run;
3677        int r;
3678
3679        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
3680        /* instruction emulation calls kvm_set_cr8() */
3681        r = cr_interception(svm);
3682        if (lapic_in_kernel(&svm->vcpu))
3683                return r;
3684        if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
3685                return r;
3686        kvm_run->exit_reason = KVM_EXIT_SET_TPR;
3687        return 0;
3688}
3689
3690static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3691{
3692        struct vcpu_svm *svm = to_svm(vcpu);
3693
3694        switch (msr_info->index) {
3695        case MSR_IA32_TSC: {
3696                msr_info->data = svm->vmcb->control.tsc_offset +
3697                        kvm_scale_tsc(vcpu, rdtsc());
3698
3699                break;
3700        }
3701        case MSR_STAR:
3702                msr_info->data = svm->vmcb->save.star;
3703                break;
3704#ifdef CONFIG_X86_64
3705        case MSR_LSTAR:
3706                msr_info->data = svm->vmcb->save.lstar;
3707                break;
3708        case MSR_CSTAR:
3709                msr_info->data = svm->vmcb->save.cstar;
3710                break;
3711        case MSR_KERNEL_GS_BASE:
3712                msr_info->data = svm->vmcb->save.kernel_gs_base;
3713                break;
3714        case MSR_SYSCALL_MASK:
3715                msr_info->data = svm->vmcb->save.sfmask;
3716                break;
3717#endif
3718        case MSR_IA32_SYSENTER_CS:
3719                msr_info->data = svm->vmcb->save.sysenter_cs;
3720                break;
3721        case MSR_IA32_SYSENTER_EIP:
3722                msr_info->data = svm->sysenter_eip;
3723                break;
3724        case MSR_IA32_SYSENTER_ESP:
3725                msr_info->data = svm->sysenter_esp;
3726                break;
3727        case MSR_TSC_AUX:
3728                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3729                        return 1;
3730                msr_info->data = svm->tsc_aux;
3731                break;
3732        /*
3733         * Nobody will change the following 5 values in the VMCB so we can
3734         * safely return them on rdmsr. They will always be 0 until LBRV is
3735         * implemented.
3736         */
3737        case MSR_IA32_DEBUGCTLMSR:
3738                msr_info->data = svm->vmcb->save.dbgctl;
3739                break;
3740        case MSR_IA32_LASTBRANCHFROMIP:
3741                msr_info->data = svm->vmcb->save.br_from;
3742                break;
3743        case MSR_IA32_LASTBRANCHTOIP:
3744                msr_info->data = svm->vmcb->save.br_to;
3745                break;
3746        case MSR_IA32_LASTINTFROMIP:
3747                msr_info->data = svm->vmcb->save.last_excp_from;
3748                break;
3749        case MSR_IA32_LASTINTTOIP:
3750                msr_info->data = svm->vmcb->save.last_excp_to;
3751                break;
3752        case MSR_VM_HSAVE_PA:
3753                msr_info->data = svm->nested.hsave_msr;
3754                break;
3755        case MSR_VM_CR:
3756                msr_info->data = svm->nested.vm_cr_msr;
3757                break;
3758        case MSR_IA32_SPEC_CTRL:
3759                msr_info->data = svm->spec_ctrl;
3760                break;
3761        case MSR_AMD64_VIRT_SPEC_CTRL:
3762                if (!msr_info->host_initiated &&
3763                    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3764                        return 1;
3765
3766                msr_info->data = svm->virt_spec_ctrl;
3767                break;
3768        case MSR_IA32_UCODE_REV:
3769                msr_info->data = 0x01000065;
3770                break;
3771        default:
3772                return kvm_get_msr_common(vcpu, msr_info);
3773        }
3774        return 0;
3775}
3776
3777static int rdmsr_interception(struct vcpu_svm *svm)
3778{
3779        u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3780        struct msr_data msr_info;
3781
3782        msr_info.index = ecx;
3783        msr_info.host_initiated = false;
3784        if (svm_get_msr(&svm->vcpu, &msr_info)) {
3785                trace_kvm_msr_read_ex(ecx);
3786                kvm_inject_gp(&svm->vcpu, 0);
3787        } else {
3788                trace_kvm_msr_read(ecx, msr_info.data);
3789
3790                kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
3791                                   msr_info.data & 0xffffffff);
3792                kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
3793                                   msr_info.data >> 32);
3794                svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3795                skip_emulated_instruction(&svm->vcpu);
3796        }
3797        return 1;
3798}
3799
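/*
 * Writes to MSR_VM_CR.  Only bits in SVM_VM_CR_VALID_MASK may be
 * modified, the LOCK and DIS bits become read-only once SVM_DIS is
 * set, and an attempt to disable SVM while EFER.SVME is still set is
 * rejected.
 */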
3800static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3801{
3802        struct vcpu_svm *svm = to_svm(vcpu);
3803        int svm_dis, chg_mask;
3804
3805        if (data & ~SVM_VM_CR_VALID_MASK)
3806                return 1;
3807
3808        chg_mask = SVM_VM_CR_VALID_MASK;
3809
3810        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3811                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3812
3813        svm->nested.vm_cr_msr &= ~chg_mask;
3814        svm->nested.vm_cr_msr |= (data & chg_mask);
3815
3816        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3817
3818        /* check for svm_disable while efer.svme is set */
3819        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3820                return 1;
3821
3822        return 0;
3823}
3824
3825static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3826{
3827        struct vcpu_svm *svm = to_svm(vcpu);
3828
3829        u32 ecx = msr->index;
3830        u64 data = msr->data;
3831        switch (ecx) {
3832        case MSR_IA32_CR_PAT:
3833                if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3834                        return 1;
3835                vcpu->arch.pat = data;
3836                svm->vmcb->save.g_pat = data;
3837                mark_dirty(svm->vmcb, VMCB_NPT);
3838                break;
3839        case MSR_IA32_TSC:
3840                kvm_write_tsc(vcpu, msr);
3841                break;
3842        case MSR_IA32_PRED_CMD:
3843                if (!msr->host_initiated &&
3844                    !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
3845                        return 1;
3846
3847                if (data & ~PRED_CMD_IBPB)
3848                        return 1;
3849
3850                if (!data)
3851                        break;
3852
3853                wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
3854                if (is_guest_mode(vcpu))
3855                        break;
3856                set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
3857                break;
3858        case MSR_AMD64_VIRT_SPEC_CTRL:
3859                if (!msr->host_initiated &&
3860                    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3861                        return 1;
3862
3863                if (data & ~SPEC_CTRL_SSBD)
3864                        return 1;
3865
3866                svm->virt_spec_ctrl = data;
3867                break;
3868        case MSR_STAR:
3869                svm->vmcb->save.star = data;
3870                break;
3871#ifdef CONFIG_X86_64
3872        case MSR_LSTAR:
3873                svm->vmcb->save.lstar = data;
3874                break;
3875        case MSR_CSTAR:
3876                svm->vmcb->save.cstar = data;
3877                break;
3878        case MSR_KERNEL_GS_BASE:
3879                svm->vmcb->save.kernel_gs_base = data;
3880                break;
3881        case MSR_SYSCALL_MASK:
3882                svm->vmcb->save.sfmask = data;
3883                break;
3884#endif
3885        case MSR_IA32_SYSENTER_CS:
3886                svm->vmcb->save.sysenter_cs = data;
3887                break;
3888        case MSR_IA32_SYSENTER_EIP:
3889                svm->sysenter_eip = data;
3890                svm->vmcb->save.sysenter_eip = data;
3891                break;
3892        case MSR_IA32_SYSENTER_ESP:
3893                svm->sysenter_esp = data;
3894                svm->vmcb->save.sysenter_esp = data;
3895                break;
3896        case MSR_TSC_AUX:
3897                if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3898                        return 1;
3899
3900                /*
3901                 * This is rare, so we update the MSR here instead of using
3902                 * direct_access_msrs.  Doing that would require a rdmsr in
3903                 * svm_vcpu_put.
3904                 */
3905                svm->tsc_aux = data;
3906                wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
3907                break;
3908        case MSR_IA32_DEBUGCTLMSR:
3909                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3910                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3911                                    __func__, data);
3912                        break;
3913                }
3914                if (data & DEBUGCTL_RESERVED_BITS)
3915                        return 1;
3916
3917                svm->vmcb->save.dbgctl = data;
3918                mark_dirty(svm->vmcb, VMCB_LBR);
3919                if (data & (1ULL<<0))
3920                        svm_enable_lbrv(svm);
3921                else
3922                        svm_disable_lbrv(svm);
3923                break;
3924        case MSR_VM_HSAVE_PA:
3925                svm->nested.hsave_msr = data;
3926                break;
3927        case MSR_VM_CR:
3928                return svm_set_vm_cr(vcpu, data);
3929        case MSR_VM_IGNNE:
3930                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3931                break;
3932        case MSR_IA32_SPEC_CTRL:
3933                svm->spec_ctrl = data;
3934                break;
3935        case MSR_IA32_APICBASE:
3936                if (kvm_vcpu_apicv_active(vcpu))
3937                        avic_update_vapic_bar(to_svm(vcpu), data);
3938                /* Fall through */
3939        default:
3940                return kvm_set_msr_common(vcpu, msr);
3941        }
3942        return 0;
3943}
3944
3945static int wrmsr_interception(struct vcpu_svm *svm)
3946{
3947        struct msr_data msr;
3948        u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3949        u64 data = kvm_read_edx_eax(&svm->vcpu);
3950
3951        msr.data = data;
3952        msr.index = ecx;
3953        msr.host_initiated = false;
3954
3955        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3956        if (kvm_set_msr(&svm->vcpu, &msr)) {
3957                trace_kvm_msr_write_ex(ecx, data);
3958                kvm_inject_gp(&svm->vcpu, 0);
3959        } else {
3960                trace_kvm_msr_write(ecx, data);
3961                skip_emulated_instruction(&svm->vcpu);
3962        }
3963        return 1;
3964}
3965
3966static int msr_interception(struct vcpu_svm *svm)
3967{
3968        if (svm->vmcb->control.exit_info_1)
3969                return wrmsr_interception(svm);
3970        else
3971                return rdmsr_interception(svm);
3972}
3973
3974static int interrupt_window_interception(struct vcpu_svm *svm)
3975{
3976        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3977        svm_clear_vintr(svm);
3978        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3979        mark_dirty(svm->vmcb, VMCB_INTR);
3980        ++svm->vcpu.stat.irq_window_exits;
3981        return 1;
3982}
3983
3984static int pause_interception(struct vcpu_svm *svm)
3985{
3986        struct kvm_vcpu *vcpu = &svm->vcpu;
3987
3988        if (pause_filter_thresh)
3989                grow_ple_window(vcpu);
3990
3991        kvm_vcpu_on_spin(vcpu);
3992        return 1;
3993}
3994
3995static int nop_interception(struct vcpu_svm *svm)
3996{
3997        skip_emulated_instruction(&(svm->vcpu));
3998        return 1;
3999}
4000
4001static int monitor_interception(struct vcpu_svm *svm)
4002{
4003        printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
4004        return nop_interception(svm);
4005}
4006
4007static int mwait_interception(struct vcpu_svm *svm)
4008{
4009        printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
4010        return nop_interception(svm);
4011}
4012
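/*
 * AVIC delivers most fixed, edge-triggered IPIs entirely in hardware.
 * When it cannot complete a delivery it raises
 * SVM_EXIT_AVIC_INCOMPLETE_IPI with one of the cause codes below in
 * the upper 32 bits of exit_info_2.
 */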
4013enum avic_ipi_failure_cause {
4014        AVIC_IPI_FAILURE_INVALID_INT_TYPE,
4015        AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
4016        AVIC_IPI_FAILURE_INVALID_TARGET,
4017        AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
4018};
4019
4020static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
4021{
4022        u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
4023        u32 icrl = svm->vmcb->control.exit_info_1;
4024        u32 id = svm->vmcb->control.exit_info_2 >> 32;
4025        u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
4026        struct kvm_lapic *apic = svm->vcpu.arch.apic;
4027
4028        trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
4029
4030        switch (id) {
4031        case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
4032                /*
4033                 * AVIC hardware handles the generation of
4034                 * IPIs when the specified Message Type is Fixed
4035                 * (also known as fixed delivery mode) and
4036                 * the Trigger Mode is edge-triggered. The hardware
4037                 * also supports self and broadcast delivery modes
4038                 * specified via the Destination Shorthand (DSH)
4039                 * field of the ICRL. Logical and physical APIC ID
4040                 * formats are supported. All other IPI types cause
4041                 * a #VMEXIT, which needs to be emulated.
4042                 */
4043                kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
4044                kvm_lapic_reg_write(apic, APIC_ICR, icrl);
4045                break;
4046        case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
4047                int i;
4048                struct kvm_vcpu *vcpu;
4049                struct kvm *kvm = svm->vcpu.kvm;
4050                struct kvm_lapic *apic = svm->vcpu.arch.apic;
4051
4052                /*
4053                 * At this point, we expect that the AVIC HW has already
4054                 * set the appropriate IRR bits on the valid target
4055                 * vcpus. So, we just need to kick the appropriate vcpu.
4056                 */
4057                kvm_for_each_vcpu(i, vcpu, kvm) {
4058                        bool m = kvm_apic_match_dest(vcpu, apic,
4059                                                     icrl & KVM_APIC_SHORT_MASK,
4060                                                     GET_APIC_DEST_FIELD(icrh),
4061                                                     icrl & KVM_APIC_DEST_MASK);
4062
4063                        if (m && !avic_vcpu_is_running(vcpu))
4064                                kvm_vcpu_wake_up(vcpu);
4065                }
4066                break;
4067        }
4068        case AVIC_IPI_FAILURE_INVALID_TARGET:
4069                break;
4070        case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
4071                WARN_ONCE(1, "Invalid backing page\n");
4072                break;
4073        default:
4074                pr_err("Unknown IPI interception\n");
4075        }
4076
4077        return 1;
4078}
4079
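/*
 * Look up the AVIC logical APIC ID table entry for a given LDR value.
 * In flat mode the index is the bit position of the logical ID; in
 * cluster mode it is built from the cluster number and the APIC number
 * within the cluster.
 */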
4080static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
4081{
4082        struct kvm_arch *vm_data = &vcpu->kvm->arch;
4083        int index;
4084        u32 *logical_apic_id_table;
4085        int dlid = GET_APIC_LOGICAL_ID(ldr);
4086
4087        if (!dlid)
4088                return NULL;
4089
4090        if (flat) { /* flat */
4091                index = ffs(dlid) - 1;
4092                if (index > 7)
4093                        return NULL;
4094        } else { /* cluster */
4095                int cluster = (dlid & 0xf0) >> 4;
4096                int apic = ffs(dlid & 0x0f) - 1;
4097
4098                if ((apic < 0) || (apic > 7) ||
4099                    (cluster >= 0xf))
4100                        return NULL;
4101                index = (cluster << 2) + apic;
4102        }
4103
4104        logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page);
4105
4106        return &logical_apic_id_table[index];
4107}
4108
4109static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
4110                          bool valid)
4111{
4112        bool flat;
4113        u32 *entry, new_entry;
4114
4115        flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
4116        entry = avic_get_logical_id_entry(vcpu, ldr, flat);
4117        if (!entry)
4118                return -EINVAL;
4119
4120        new_entry = READ_ONCE(*entry);
4121        new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
4122        new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
4123        if (valid)
4124                new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
4125        else
4126                new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
4127        WRITE_ONCE(*entry, new_entry);
4128
4129        return 0;
4130}
4131
4132static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
4133{
4134        int ret;
4135        struct vcpu_svm *svm = to_svm(vcpu);
4136        u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
4137
4138        if (!ldr)
4139                return 1;
4140
4141        ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
4142        if (ret && svm->ldr_reg) {
4143                avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
4144                svm->ldr_reg = 0;
4145        } else {
4146                svm->ldr_reg = ldr;
4147        }
4148        return ret;
4149}
4150
4151static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
4152{
4153        u64 *old, *new;
4154        struct vcpu_svm *svm = to_svm(vcpu);
4155        u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
4156        u32 id = (apic_id_reg >> 24) & 0xff;
4157
4158        if (vcpu->vcpu_id == id)
4159                return 0;
4160
4161        old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
4162        new = avic_get_physical_id_entry(vcpu, id);
4163        if (!new || !old)
4164                return 1;
4165
4166        /* We need to move the physical_id_entry to the new offset */
4167        *new = *old;
4168        *old = 0ULL;
4169        to_svm(vcpu)->avic_physical_id_cache = new;
4170
4171        /*
4172         * Also update the guest physical APIC ID in the logical
4173         * APIC ID table entry if the LDR has already been set up.
4174         */
4175        if (svm->ldr_reg)
4176                avic_handle_ldr_update(vcpu);
4177
4178        return 0;
4179}
4180
4181static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
4182{
4183        struct vcpu_svm *svm = to_svm(vcpu);
4184        struct kvm_arch *vm_data = &vcpu->kvm->arch;
4185        u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
4186        u32 mod = (dfr >> 28) & 0xf;
4187
4188        /*
4189         * We assume that all local APICs are using the same type.
4190         * If this changes, we need to flush the AVIC logical
4191         * APIC ID table.
4192         */
4193        if (vm_data->ldr_mode == mod)
4194                return 0;
4195
4196        clear_page(page_address(vm_data->avic_logical_id_table_page));
4197        vm_data->ldr_mode = mod;
4198
4199        if (svm->ldr_reg)
4200                avic_handle_ldr_update(vcpu);
4201        return 0;
4202}
4203
4204static int avic_unaccel_trap_write(struct vcpu_svm *svm)
4205{
4206        struct kvm_lapic *apic = svm->vcpu.arch.apic;
4207        u32 offset = svm->vmcb->control.exit_info_1 &
4208                                AVIC_UNACCEL_ACCESS_OFFSET_MASK;
4209
4210        switch (offset) {
4211        case APIC_ID:
4212                if (avic_handle_apic_id_update(&svm->vcpu))
4213                        return 0;
4214                break;
4215        case APIC_LDR:
4216                if (avic_handle_ldr_update(&svm->vcpu))
4217                        return 0;
4218                break;
4219        case APIC_DFR:
4220                avic_handle_dfr_update(&svm->vcpu);
4221                break;
4222        default:
4223                break;
4224        }
4225
4226        kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
4227
4228        return 1;
4229}
4230
4231static bool is_avic_unaccelerated_access_trap(u32 offset)
4232{
4233        bool ret = false;
4234
4235        switch (offset) {
4236        case APIC_ID:
4237        case APIC_EOI:
4238        case APIC_RRR:
4239        case APIC_LDR:
4240        case APIC_DFR:
4241        case APIC_SPIV:
4242        case APIC_ESR:
4243        case APIC_ICR:
4244        case APIC_LVTT:
4245        case APIC_LVTTHMR:
4246        case APIC_LVTPC:
4247        case APIC_LVT0:
4248        case APIC_LVT1:
4249        case APIC_LVTERR:
4250        case APIC_TMICT:
4251        case APIC_TDCR:
4252                ret = true;
4253                break;
4254        default:
4255                break;
4256        }
4257        return ret;
4258}
4259
4260static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
4261{
4262        int ret = 0;
4263        u32 offset = svm->vmcb->control.exit_info_1 &
4264                     AVIC_UNACCEL_ACCESS_OFFSET_MASK;
4265        u32 vector = svm->vmcb->control.exit_info_2 &
4266                     AVIC_UNACCEL_ACCESS_VECTOR_MASK;
4267        bool write = (svm->vmcb->control.exit_info_1 >> 32) &
4268                     AVIC_UNACCEL_ACCESS_WRITE_MASK;
4269        bool trap = is_avic_unaccelerated_access_trap(offset);
4270
4271        trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
4272                                            trap, write, vector);
4273        if (trap) {
4274                /* Handling Trap */
4275                WARN_ONCE(!write, "svm: Handling trap read.\n");
4276                ret = avic_unaccel_trap_write(svm);
4277        } else {
4278                /* Handling Fault */
4279                ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
4280        }
4281
4282        return ret;
4283}
4284
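/*
 * Exit handler dispatch table, indexed by SVM exit code.  Exit codes
 * without a handler are treated as unexpected by handle_exit() and
 * result in a #UD being injected into the guest.
 */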
4285static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
4286        [SVM_EXIT_READ_CR0]                     = cr_interception,
4287        [SVM_EXIT_READ_CR3]                     = cr_interception,
4288        [SVM_EXIT_READ_CR4]                     = cr_interception,
4289        [SVM_EXIT_READ_CR8]                     = cr_interception,
4290        [SVM_EXIT_CR0_SEL_WRITE]                = emulate_on_interception,
4291        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
4292        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
4293        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
4294        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
4295        [SVM_EXIT_READ_DR0]                     = dr_interception,
4296        [SVM_EXIT_READ_DR1]                     = dr_interception,
4297        [SVM_EXIT_READ_DR2]                     = dr_interception,
4298        [SVM_EXIT_READ_DR3]                     = dr_interception,
4299        [SVM_EXIT_READ_DR4]                     = dr_interception,
4300        [SVM_EXIT_READ_DR5]                     = dr_interception,
4301        [SVM_EXIT_READ_DR6]                     = dr_interception,
4302        [SVM_EXIT_READ_DR7]                     = dr_interception,
4303        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
4304        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
4305        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
4306        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
4307        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
4308        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
4309        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
4310        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
4311        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
4312        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
4313        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
4314        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
4315        [SVM_EXIT_EXCP_BASE + NM_VECTOR]        = nm_interception,
4316        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
4317        [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
4318        [SVM_EXIT_INTR]                         = intr_interception,
4319        [SVM_EXIT_NMI]                          = nmi_interception,
4320        [SVM_EXIT_SMI]                          = nop_on_interception,
4321        [SVM_EXIT_INIT]                         = nop_on_interception,
4322        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
4323        [SVM_EXIT_RDPMC]                        = rdpmc_interception,
4324        [SVM_EXIT_CPUID]                        = cpuid_interception,
4325        [SVM_EXIT_IRET]                         = iret_interception,
4326        [SVM_EXIT_INVD]                         = emulate_on_interception,
4327        [SVM_EXIT_PAUSE]                        = pause_interception,
4328        [SVM_EXIT_HLT]                          = halt_interception,
4329        [SVM_EXIT_INVLPG]                       = invlpg_interception,
4330        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
4331        [SVM_EXIT_IOIO]                         = io_interception,
4332        [SVM_EXIT_MSR]                          = msr_interception,
4333        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
4334        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
4335        [SVM_EXIT_VMRUN]                        = vmrun_interception,
4336        [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
4337        [SVM_EXIT_VMLOAD]                       = vmload_interception,
4338        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
4339        [SVM_EXIT_STGI]                         = stgi_interception,
4340        [SVM_EXIT_CLGI]                         = clgi_interception,
4341        [SVM_EXIT_SKINIT]                       = skinit_interception,
4342        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
4343        [SVM_EXIT_MONITOR]                      = monitor_interception,
4344        [SVM_EXIT_MWAIT]                        = mwait_interception,
4345        [SVM_EXIT_XSETBV]                       = xsetbv_interception,
4346        [SVM_EXIT_NPF]                          = pf_interception,
4347        [SVM_EXIT_RSM]                          = emulate_on_interception,
4348        [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
4349        [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
4350};
4351
4352static void dump_vmcb(struct kvm_vcpu *vcpu)
4353{
4354        struct vcpu_svm *svm = to_svm(vcpu);
4355        struct vmcb_control_area *control = &svm->vmcb->control;
4356        struct vmcb_save_area *save = &svm->vmcb->save;
4357
4358        pr_err("VMCB Control Area:\n");
4359        pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
4360        pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
4361        pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
4362        pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
4363        pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
4364        pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
4365        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
4366        pr_err("%-20s%d\n", "pause filter threshold:",
4367               control->pause_filter_thresh);
4368        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
4369        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
4370        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
4371        pr_err("%-20s%d\n", "asid:", control->asid);
4372        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
4373        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
4374        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
4375        pr_err("%-20s%08x\n", "int_state:", control->int_state);
4376        pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
4377        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
4378        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
4379        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
4380        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
4381        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
4382        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
4383        pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
4384        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
4385        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
4386        pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
4387        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
4388        pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
4389        pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
4390        pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
4391        pr_err("VMCB State Save Area:\n");
4392        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4393               "es:",
4394               save->es.selector, save->es.attrib,
4395               save->es.limit, save->es.base);
4396        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4397               "cs:",
4398               save->cs.selector, save->cs.attrib,
4399               save->cs.limit, save->cs.base);
4400        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4401               "ss:",
4402               save->ss.selector, save->ss.attrib,
4403               save->ss.limit, save->ss.base);
4404        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4405               "ds:",
4406               save->ds.selector, save->ds.attrib,
4407               save->ds.limit, save->ds.base);
4408        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4409               "fs:",
4410               save->fs.selector, save->fs.attrib,
4411               save->fs.limit, save->fs.base);
4412        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4413               "gs:",
4414               save->gs.selector, save->gs.attrib,
4415               save->gs.limit, save->gs.base);
4416        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4417               "gdtr:",
4418               save->gdtr.selector, save->gdtr.attrib,
4419               save->gdtr.limit, save->gdtr.base);
4420        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4421               "ldtr:",
4422               save->ldtr.selector, save->ldtr.attrib,
4423               save->ldtr.limit, save->ldtr.base);
4424        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4425               "idtr:",
4426               save->idtr.selector, save->idtr.attrib,
4427               save->idtr.limit, save->idtr.base);
4428        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4429               "tr:",
4430               save->tr.selector, save->tr.attrib,
4431               save->tr.limit, save->tr.base);
4432        pr_err("cpl:            %d                efer:         %016llx\n",
4433                save->cpl, save->efer);
4434        pr_err("%-15s %016llx %-13s %016llx\n",
4435               "cr0:", save->cr0, "cr2:", save->cr2);
4436        pr_err("%-15s %016llx %-13s %016llx\n",
4437               "cr3:", save->cr3, "cr4:", save->cr4);
4438        pr_err("%-15s %016llx %-13s %016llx\n",
4439               "dr6:", save->dr6, "dr7:", save->dr7);
4440        pr_err("%-15s %016llx %-13s %016llx\n",
4441               "rip:", save->rip, "rflags:", save->rflags);
4442        pr_err("%-15s %016llx %-13s %016llx\n",
4443               "rsp:", save->rsp, "rax:", save->rax);
4444        pr_err("%-15s %016llx %-13s %016llx\n",
4445               "star:", save->star, "lstar:", save->lstar);
4446        pr_err("%-15s %016llx %-13s %016llx\n",
4447               "cstar:", save->cstar, "sfmask:", save->sfmask);
4448        pr_err("%-15s %016llx %-13s %016llx\n",
4449               "kernel_gs_base:", save->kernel_gs_base,
4450               "sysenter_cs:", save->sysenter_cs);
4451        pr_err("%-15s %016llx %-13s %016llx\n",
4452               "sysenter_esp:", save->sysenter_esp,
4453               "sysenter_eip:", save->sysenter_eip);
4454        pr_err("%-15s %016llx %-13s %016llx\n",
4455               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
4456        pr_err("%-15s %016llx %-13s %016llx\n",
4457               "br_from:", save->br_from, "br_to:", save->br_to);
4458        pr_err("%-15s %016llx %-13s %016llx\n",
4459               "excp_from:", save->last_excp_from,
4460               "excp_to:", save->last_excp_to);
4461}
4462
4463static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
4464{
4465        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
4466
4467        *info1 = control->exit_info_1;
4468        *info2 = control->exit_info_2;
4469}
4470
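/*
 * Top-level #VMEXIT handler.  Exits that occurred in a nested (L2)
 * guest are first offered to the L1 hypervisor via
 * nested_svm_exit_special() and nested_svm_exit_handled(); everything
 * the host must handle itself is dispatched through
 * svm_exit_handlers[] above.
 */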
4471static int handle_exit(struct kvm_vcpu *vcpu)
4472{
4473        struct vcpu_svm *svm = to_svm(vcpu);
4474        struct kvm_run *kvm_run = vcpu->run;
4475        u32 exit_code = svm->vmcb->control.exit_code;
4476
4477        trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
4478
4479        if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
4480                vcpu->arch.cr0 = svm->vmcb->save.cr0;
4481        if (npt_enabled)
4482                vcpu->arch.cr3 = svm->vmcb->save.cr3;
4483
4484        if (unlikely(svm->nested.exit_required)) {
4485                nested_svm_vmexit(svm);
4486                svm->nested.exit_required = false;
4487
4488                return 1;
4489        }
4490
4491        if (is_guest_mode(vcpu)) {
4492                int vmexit;
4493
4494                trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
4495                                        svm->vmcb->control.exit_info_1,
4496                                        svm->vmcb->control.exit_info_2,
4497                                        svm->vmcb->control.exit_int_info,
4498                                        svm->vmcb->control.exit_int_info_err,
4499                                        KVM_ISA_SVM);
4500
4501                vmexit = nested_svm_exit_special(svm);
4502
4503                if (vmexit == NESTED_EXIT_CONTINUE)
4504                        vmexit = nested_svm_exit_handled(svm);
4505
4506                if (vmexit == NESTED_EXIT_DONE)
4507                        return 1;
4508        }
4509
4510        svm_complete_interrupts(svm);
4511
4512        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
4513                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4514                kvm_run->fail_entry.hardware_entry_failure_reason
4515                        = svm->vmcb->control.exit_code;
4516                pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
4517                dump_vmcb(vcpu);
4518                return 0;
4519        }
4520
4521        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
4522            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
4523            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
4524            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
4525                printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
4526                       "exit_code 0x%x\n",
4527                       __func__, svm->vmcb->control.exit_int_info,
4528                       exit_code);
4529
4530        if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
4531            || !svm_exit_handlers[exit_code]) {
4532                WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
4533                kvm_queue_exception(vcpu, UD_VECTOR);
4534                return 1;
4535        }
4536
4537        return svm_exit_handlers[exit_code](svm);
4538}
4539
4540static void reload_tss(struct kvm_vcpu *vcpu)
4541{
4542        int cpu = raw_smp_processor_id();
4543
4544        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4545        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
4546        load_TR_desc();
4547}
4548
4549static void pre_svm_run(struct vcpu_svm *svm)
4550{
4551        int cpu = raw_smp_processor_id();
4552
4553        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
4554
4555        /* FIXME: handle wraparound of asid_generation */
4556        if (svm->asid_generation != sd->asid_generation)
4557                new_asid(svm, sd);
4558}
4559
4560static void svm_inject_nmi(struct kvm_vcpu *vcpu)
4561{
4562        struct vcpu_svm *svm = to_svm(vcpu);
4563
4564        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
4565        vcpu->arch.hflags |= HF_NMI_MASK;
4566        set_intercept(svm, INTERCEPT_IRET);
4567        ++vcpu->stat.nmi_injections;
4568}
4569
4570static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
4571{
4572        struct vmcb_control_area *control;
4573
4574        /* The following fields are ignored when AVIC is enabled */
4575        control = &svm->vmcb->control;
4576        control->int_vector = irq;
4577        control->int_ctl &= ~V_INTR_PRIO_MASK;
4578        control->int_ctl |= V_IRQ_MASK |
4579                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
4580        mark_dirty(svm->vmcb, VMCB_INTR);
4581}
4582
4583static void svm_set_irq(struct kvm_vcpu *vcpu)
4584{
4585        struct vcpu_svm *svm = to_svm(vcpu);
4586
4587        BUG_ON(!(gif_set(svm)));
4588
4589        trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
4590        ++vcpu->stat.irq_injections;
4591
4592        svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
4593                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
4594}
4595
4596static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
4597{
4598        return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
4599}
4600
4601static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
4602{
4603        struct vcpu_svm *svm = to_svm(vcpu);
4604
4605        if (svm_nested_virtualize_tpr(vcpu) ||
4606            kvm_vcpu_apicv_active(vcpu))
4607                return;
4608
4609        clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4610
4611        if (irr == -1)
4612                return;
4613
4614        if (tpr >= irr)
4615                set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
4616}
4617
4618static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
4619{
4620        return;
4621}
4622
4623static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
4624{
4625        return avic && irqchip_split(vcpu->kvm);
4626}
4627
4628static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
4629{
4630}
4631
4632static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
4633{
4634}
4635
4636/* Note: Currently only used by Hyper-V. */
4637static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4638{
4639        struct vcpu_svm *svm = to_svm(vcpu);
4640        struct vmcb *vmcb = svm->vmcb;
4641
4642        if (!kvm_vcpu_apicv_active(&svm->vcpu))
4643                return;
4644
4645        vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
4646        mark_dirty(vmcb, VMCB_INTR);
4647}
4648
4649static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
4650{
4651        return;
4652}
4653
4654static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
4655{
4656        kvm_lapic_set_irr(vec, vcpu->arch.apic);
4657        smp_mb__after_atomic();
4658
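        /*
         * If the target vCPU is currently running, write its physical APIC ID
         * to the AVIC doorbell MSR so the hardware notices the IRR update we
         * just made; otherwise wake the vCPU so it evaluates the new interrupt
         * on its next VMRUN.
         */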
4659        if (avic_vcpu_is_running(vcpu))
4660                wrmsrl(SVM_AVIC_DOORBELL,
4661                       kvm_cpu_get_apicid(vcpu->cpu));
4662        else
4663                kvm_vcpu_wake_up(vcpu);
4664}
4665
4666static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
4667{
4668        unsigned long flags;
4669        struct amd_svm_iommu_ir *cur;
4670
4671        spin_lock_irqsave(&svm->ir_list_lock, flags);
4672        list_for_each_entry(cur, &svm->ir_list, node) {
4673                if (cur->data != pi->ir_data)
4674                        continue;
4675                list_del(&cur->node);
4676                kfree(cur);
4677                break;
4678        }
4679        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
4680}
4681
4682static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
4683{
4684        int ret = 0;
4685        unsigned long flags;
4686        struct amd_svm_iommu_ir *ir;
4687
4688        /**
4689         * In some cases, the existing IRTE is updated and re-set,
4690         * so we need to check here if it has already been added
4691         * to the ir_list.
4692         */
4693        if (pi->ir_data && (pi->prev_ga_tag != 0)) {
4694                struct kvm *kvm = svm->vcpu.kvm;
4695                u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
4696                struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
4697                struct vcpu_svm *prev_svm;
4698
4699                if (!prev_vcpu) {
4700                        ret = -EINVAL;
4701                        goto out;
4702                }
4703
4704                prev_svm = to_svm(prev_vcpu);
4705                svm_ir_list_del(prev_svm, pi);
4706        }
4707
4708        /**
4709         * Allocate a new amd_svm_iommu_ir entry, which will be
4710         * added to the per-vcpu ir_list.
4711         */
4712        ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
4713        if (!ir) {
4714                ret = -ENOMEM;
4715                goto out;
4716        }
4717        ir->data = pi->ir_data;
4718
4719        spin_lock_irqsave(&svm->ir_list_lock, flags);
4720        list_add(&ir->node, &svm->ir_list);
4721        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
4722out:
4723        return ret;
4724}
4725
4726/**
4727 * Note:
4728 * The HW cannot post multicast/broadcast
4729 * interrupts to a vCPU, so we still use legacy interrupt
4730 * remapping for these kinds of interrupts.
4731 *
4732 * For lowest-priority interrupts, we only support
4733 * those with a single CPU as the destination, e.g. the user
4734 * configures the interrupt via /proc/irq or uses
4735 * irqbalance to make the interrupt single-CPU.
4736 */
4737static int
4738get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
4739                 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
4740{
4741        struct kvm_lapic_irq irq;
4742        struct kvm_vcpu *vcpu = NULL;
4743
4744        kvm_set_msi_irq(kvm, e, &irq);
4745
4746        if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
4747                pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
4748                         __func__, irq.vector);
4749                return -1;
4750        }
4751
4752        pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
4753                 irq.vector);
4754        *svm = to_svm(vcpu);
4755        vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
4756        vcpu_info->vector = irq.vector;
4757
4758        return 0;
4759}
4760
4761/*
4762 * svm_update_pi_irte - set IRTE for Posted-Interrupts
4763 *
4764 * @kvm: kvm
4765 * @host_irq: host irq of the interrupt
4766 * @guest_irq: gsi of the interrupt
4767 * @set: set or unset PI
4768 * returns 0 on success, < 0 on failure
4769 */
4770static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
4771                              uint32_t guest_irq, bool set)
4772{
4773        struct kvm_kernel_irq_routing_entry *e;
4774        struct kvm_irq_routing_table *irq_rt;
4775        int idx, ret = -EINVAL;
4776
4777        if (!kvm_arch_has_assigned_device(kvm) ||
4778            !irq_remapping_cap(IRQ_POSTING_CAP))
4779                return 0;
4780
4781        pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
4782                 __func__, host_irq, guest_irq, set);
4783
4784        idx = srcu_read_lock(&kvm->irq_srcu);
4785        irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
4786        WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
4787
4788        hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
4789                struct vcpu_data vcpu_info;
4790                struct vcpu_svm *svm = NULL;
4791
4792                if (e->type != KVM_IRQ_ROUTING_MSI)
4793                        continue;
4794
4795                /**
4796                 * Here, we set up legacy mode in the following cases:
4797                 * 1. When the interrupt cannot be targeted to a specific vcpu.
4798                 * 2. When unsetting the posted interrupt.
4799                 * 3. When APIC virtualization is disabled for the vcpu.
4800                 */
4801                if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
4802                    kvm_vcpu_apicv_active(&svm->vcpu)) {
4803                        struct amd_iommu_pi_data pi;
4804
4805                        /* Try to enable guest_mode in IRTE */
4806                        pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
4807                                            AVIC_HPA_MASK);
4808                        pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
4809                                                     svm->vcpu.vcpu_id);
4810                        pi.is_guest_mode = true;
4811                        pi.vcpu_data = &vcpu_info;
4812                        ret = irq_set_vcpu_affinity(host_irq, &pi);
4813
4814                        /**
4815                         * Here, we have successfully set up vcpu affinity in
4816                         * IOMMU guest mode. Now, store the posted-interrupt
4817                         * information in the per-vcpu ir_list so that we can
4818                         * reference it directly when we update the vcpu
4819                         * scheduling information in the IOMMU irte.
4820                         */
4821                        if (!ret && pi.is_guest_mode)
4822                                svm_ir_list_add(svm, &pi);
4823                } else {
4824                        /* Use legacy mode in IRTE */
4825                        struct amd_iommu_pi_data pi;
4826
4827                        /**
4828                         * Here, pi is used to:
4829                         * - Tell IOMMU to use legacy mode for this interrupt.
4830                         * - Retrieve ga_tag of prior interrupt remapping data.
4831                         */
4832                        pi.is_guest_mode = false;
4833                        ret = irq_set_vcpu_affinity(host_irq, &pi);
4834
4835                        /**
4836                         * Check whether the posted interrupt was previously
4837                         * set up in guest_mode by checking whether a ga_tag
4838                         * was cached. If so, we need to clean up the per-vcpu
4839                         * ir_list.
4840                         */
4841                        if (!ret && pi.prev_ga_tag) {
4842                                int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
4843                                struct kvm_vcpu *vcpu;
4844
4845                                vcpu = kvm_get_vcpu_by_id(kvm, id);
4846                                if (vcpu)
4847                                        svm_ir_list_del(to_svm(vcpu), &pi);
4848                        }
4849                }
4850
4851                if (!ret && svm) {
4852                        trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
4853                                                 host_irq, e->gsi,
4854                                                 vcpu_info.vector,
4855                                                 vcpu_info.pi_desc_addr, set);
4856                }
4857
4858                if (ret < 0) {
4859                        pr_err("%s: failed to update PI IRTE\n", __func__);
4860                        goto out;
4861                }
4862        }
4863
4864        ret = 0;
4865out:
4866        srcu_read_unlock(&kvm->irq_srcu, idx);
4867        return ret;
4868}
4869
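/*
 * NMI injection is allowed only when the vCPU is not in an interrupt shadow,
 * NMIs are not already masked (HF_NMI_MASK) and GIF is set; nested_svm_nmi()
 * additionally handles an L1 hypervisor that intercepts NMIs (by requesting a
 * nested #VMEXIT instead).
 */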
4870static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
4871{
4872        struct vcpu_svm *svm = to_svm(vcpu);
4873        struct vmcb *vmcb = svm->vmcb;
4874        int ret;
4875        ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
4876              !(svm->vcpu.arch.hflags & HF_NMI_MASK);
4877        ret = ret && gif_set(svm) && nested_svm_nmi(svm);
4878
4879        return ret;
4880}
4881
4882static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
4883{
4884        struct vcpu_svm *svm = to_svm(vcpu);
4885
4886        return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
4887}
4888
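/*
 * While NMIs are masked we intercept IRET so that we can detect when the
 * guest leaves its NMI handler and NMIs can be unmasked again (see
 * svm_inject_nmi() and svm_complete_interrupts()).
 */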
4889static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4890{
4891        struct vcpu_svm *svm = to_svm(vcpu);
4892
4893        if (masked) {
4894                svm->vcpu.arch.hflags |= HF_NMI_MASK;
4895                set_intercept(svm, INTERCEPT_IRET);
4896        } else {
4897                svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
4898                clr_intercept(svm, INTERCEPT_IRET);
4899        }
4900}
4901
4902static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
4903{
4904        struct vcpu_svm *svm = to_svm(vcpu);
4905        struct vmcb *vmcb = svm->vmcb;
4906        int ret;
4907
4908        if (!gif_set(svm) ||
4909             (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
4910                return 0;
4911
4912        ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
4913
4914        if (is_guest_mode(vcpu))
4915                return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
4916
4917        return ret;
4918}
4919
4920static void enable_irq_window(struct kvm_vcpu *vcpu)
4921{
4922        struct vcpu_svm *svm = to_svm(vcpu);
4923
4924        if (kvm_vcpu_apicv_active(vcpu))
4925                return;
4926
4927        /*
4928         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
4929         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
4930         * get that intercept, this function will be called again and we'll
4931         * get the vintr intercept then. However, if the vGIF feature is
4932         * enabled, the STGI interception will not occur. Enable the irq
4933         * window under the assumption that the hardware will set the GIF.
4934         */
4935        if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
4936                svm_set_vintr(svm);
4937                svm_inject_irq(svm, 0x0);
4938        }
4939}
4940
4941static void enable_nmi_window(struct kvm_vcpu *vcpu)
4942{
4943        struct vcpu_svm *svm = to_svm(vcpu);
4944
4945        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
4946            == HF_NMI_MASK)
4947                return; /* IRET will cause a vm exit */
4948
4949        if (!gif_set(svm)) {
4950                if (vgif_enabled(svm))
4951                        set_intercept(svm, INTERCEPT_STGI);
4952                return; /* STGI will cause a vm exit */
4953        }
4954
4955        if (svm->nested.exit_required)
4956                return; /* we're not going to run the guest yet */
4957
4958        /*
4959         * Something prevents the NMI from being injected. Single-step over the
4960         * possible problem (IRET, exception injection or interrupt shadow).
4961         */
4962        svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
4963        svm->nmi_singlestep = true;
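        /*
         * Setting TF makes the guest take a #DB after the next instruction;
         * the #DB intercept sees nmi_singlestep and retries NMI injection.
         */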
4964        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
4965}
4966
4967static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
4968{
4969        return 0;
4970}
4971
4972static void svm_flush_tlb(struct kvm_vcpu *vcpu)
4973{
4974        struct vcpu_svm *svm = to_svm(vcpu);
4975
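        /*
         * Flush this guest's TLB entries: use a targeted FLUSH_ASID if the
         * CPU supports it, otherwise force a new ASID (and thus a clean TLB
         * context) to be assigned on the next VMRUN.
         */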
4976        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
4977                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4978        else
4979                svm->asid_generation--;
4980}
4981
4982static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
4983{
4984}
4985
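/*
 * V_TPR in the VMCB's int_ctl shadows the guest's CR8/TPR while the guest
 * runs: sync_cr8_to_lapic() copies the value the guest wrote back into the
 * local APIC after a #VMEXIT, and sync_lapic_to_cr8() seeds V_TPR from the
 * APIC before the next VMRUN.
 */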
4986static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
4987{
4988        struct vcpu_svm *svm = to_svm(vcpu);
4989
4990        if (svm_nested_virtualize_tpr(vcpu))
4991                return;
4992
4993        if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
4994                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
4995                kvm_set_cr8(vcpu, cr8);
4996        }
4997}
4998
4999static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
5000{
5001        struct vcpu_svm *svm = to_svm(vcpu);
5002        u64 cr8;
5003
5004        if (svm_nested_virtualize_tpr(vcpu) ||
5005            kvm_vcpu_apicv_active(vcpu))
5006                return;
5007
5008        cr8 = kvm_get_cr8(vcpu);
5009        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
5010        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
5011}
5012
5013static void svm_complete_interrupts(struct vcpu_svm *svm)
5014{
5015        u8 vector;
5016        int type;
5017        u32 exitintinfo = svm->vmcb->control.exit_int_info;
5018        unsigned int3_injected = svm->int3_injected;
5019
5020        svm->int3_injected = 0;
5021
5022        /*
5023         * If we've made progress since setting HF_IRET_MASK, we've
5024         * executed an IRET and can allow NMI injection.
5025         */
5026        if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
5027            && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
5028                svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
5029                kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
5030        }
5031
5032        svm->vcpu.arch.nmi_injected = false;
5033        kvm_clear_exception_queue(&svm->vcpu);
5034        kvm_clear_interrupt_queue(&svm->vcpu);
5035
5036        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
5037                return;
5038
5039        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
5040
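        /*
         * EXITINTINFO uses the EVENTINJ layout: vector in bits 7:0, event
         * type in bits 10:8, error-code-valid in bit 11 and valid in bit 31.
         */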
5041        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
5042        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
5043
5044        switch (type) {
5045        case SVM_EXITINTINFO_TYPE_NMI:
5046                svm->vcpu.arch.nmi_injected = true;
5047                break;
5048        case SVM_EXITINTINFO_TYPE_EXEPT:
5049                /*
5050                 * In case of software exceptions, do not reinject the vector,
5051                 * but re-execute the instruction instead. Rewind RIP first
5052                 * if we emulated INT3 before.
5053                 */
5054                if (kvm_exception_is_soft(vector)) {
5055                        if (vector == BP_VECTOR && int3_injected &&
5056                            kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
5057                                kvm_rip_write(&svm->vcpu,
5058                                              kvm_rip_read(&svm->vcpu) -
5059                                              int3_injected);
5060                        break;
5061                }
5062                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
5063                        u32 err = svm->vmcb->control.exit_int_info_err;
5064                        kvm_requeue_exception_e(&svm->vcpu, vector, err);
5065
5066                } else
5067                        kvm_requeue_exception(&svm->vcpu, vector);
5068                break;
5069        case SVM_EXITINTINFO_TYPE_INTR:
5070                kvm_queue_interrupt(&svm->vcpu, vector, false);
5071                break;
5072        default:
5073                break;
5074        }
5075}
5076
5077static void svm_cancel_injection(struct kvm_vcpu *vcpu)
5078{
5079        struct vcpu_svm *svm = to_svm(vcpu);
5080        struct vmcb_control_area *control = &svm->vmcb->control;
5081
5082        control->exit_int_info = control->event_inj;
5083        control->exit_int_info_err = control->event_inj_err;
5084        control->event_inj = 0;
5085        svm_complete_interrupts(svm);
5086}
5087
5088static void svm_vcpu_run(struct kvm_vcpu *vcpu)
5089{
5090        struct vcpu_svm *svm = to_svm(vcpu);
5091
5092        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
5093        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
5094        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
5095
5096        /*
5097         * A vmexit emulation is required before the vcpu can be executed
5098         * again.
5099         */
5100        if (unlikely(svm->nested.exit_required))
5101                return;
5102
5103        /*
5104         * Disable singlestep if we're injecting an interrupt/exception.
5105         * We don't want our modified rflags to be pushed on the stack where
5106         * we might not be able to easily reset them when we disable NMI
5107         * singlestep later.
5108         */
5109        if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
5110                /*
5111                 * Event injection happens before external interrupts cause a
5112                 * vmexit and interrupts are disabled here, so smp_send_reschedule
5113                 * is enough to force an immediate vmexit.
5114                 */
5115                disable_nmi_singlestep(svm);
5116                smp_send_reschedule(vcpu->cpu);
5117        }
5118
5119        pre_svm_run(svm);
5120
5121        sync_lapic_to_cr8(vcpu);
5122
5123        svm->vmcb->save.cr2 = vcpu->arch.cr2;
5124
5125        clgi();
5126
5127        local_irq_enable();
5128
5129        x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
5130
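        /*
         * Hand-written world switch: VMRUN switches only the state held in
         * the VMCB (RAX/RSP/RIP among it), so the remaining guest GPRs are
         * loaded and saved by hand around VMLOAD/VMRUN/VMSAVE.  RAX holds
         * the VMCB physical address across VMRUN.
         */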
5131        asm volatile (
5132                "push %%" _ASM_BP "; \n\t"
5133                "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
5134                "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
5135                "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
5136                "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
5137                "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
5138                "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
5139#ifdef CONFIG_X86_64
5140                "mov %c[r8](%[svm]),  %%r8  \n\t"
5141                "mov %c[r9](%[svm]),  %%r9  \n\t"
5142                "mov %c[r10](%[svm]), %%r10 \n\t"
5143                "mov %c[r11](%[svm]), %%r11 \n\t"
5144                "mov %c[r12](%[svm]), %%r12 \n\t"
5145                "mov %c[r13](%[svm]), %%r13 \n\t"
5146                "mov %c[r14](%[svm]), %%r14 \n\t"
5147                "mov %c[r15](%[svm]), %%r15 \n\t"
5148#endif
5149
5150                /* Enter guest mode */
5151                "push %%" _ASM_AX " \n\t"
5152                "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
5153                __ex(SVM_VMLOAD) "\n\t"
5154                __ex(SVM_VMRUN) "\n\t"
5155                __ex(SVM_VMSAVE) "\n\t"
5156                "pop %%" _ASM_AX " \n\t"
5157
5158                /* Save guest registers, load host registers */
5159                "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
5160                "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
5161                "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
5162                "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
5163                "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
5164                "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
5165#ifdef CONFIG_X86_64
5166                "mov %%r8,  %c[r8](%[svm]) \n\t"
5167                "mov %%r9,  %c[r9](%[svm]) \n\t"
5168                "mov %%r10, %c[r10](%[svm]) \n\t"
5169                "mov %%r11, %c[r11](%[svm]) \n\t"
5170                "mov %%r12, %c[r12](%[svm]) \n\t"
5171                "mov %%r13, %c[r13](%[svm]) \n\t"
5172                "mov %%r14, %c[r14](%[svm]) \n\t"
5173                "mov %%r15, %c[r15](%[svm]) \n\t"
5174#endif
5175                /*
5176                 * Clear host registers marked as clobbered to prevent
5177                 * speculative use.
5178                 */
5179                "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
5180                "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
5181                "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
5182                "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
5183                "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
5184#ifdef CONFIG_X86_64
5185                "xor %%r8, %%r8 \n\t"
5186                "xor %%r9, %%r9 \n\t"
5187                "xor %%r10, %%r10 \n\t"
5188                "xor %%r11, %%r11 \n\t"
5189                "xor %%r12, %%r12 \n\t"
5190                "xor %%r13, %%r13 \n\t"
5191                "xor %%r14, %%r14 \n\t"
5192                "xor %%r15, %%r15 \n\t"
5193#endif
5194                "pop %%" _ASM_BP
5195                :
5196                : [svm]"a"(svm),
5197                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
5198                  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
5199                  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
5200                  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
5201                  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
5202                  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
5203                  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
5204#ifdef CONFIG_X86_64
5205                  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
5206                  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
5207                  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
5208                  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
5209                  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
5210                  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
5211                  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
5212                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
5213#endif
5214                : "cc", "memory"
5215#ifdef CONFIG_X86_64
5216                , "rbx", "rcx", "rdx", "rsi", "rdi"
5217                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
5218#else
5219                , "ebx", "ecx", "edx", "esi", "edi"
5220#endif
5221                );
5222
5223#ifdef CONFIG_X86_64
5224        native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
5225#else
5226        loadsegment(fs, svm->host.fs);
5227#ifndef CONFIG_X86_32_LAZY_GS
5228        loadsegment(gs, svm->host.gs);
5229#endif
5230#endif
5231
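        /*
         * Pick up the guest's SPEC_CTRL value and switch the MSR back to the
         * host's value before any further host code (and its indirect
         * branches) runs.
         */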
5232        if (cpu_has_spec_ctrl())
5233                svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
5234        x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
5235
5236        /* Eliminate branch target predictions from guest mode */
5237        fill_RSB();
5238
5239        reload_tss(vcpu);
5240
5241        local_irq_disable();
5242
5243        vcpu->arch.cr2 = svm->vmcb->save.cr2;
5244        vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
5245        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
5246        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
5247
5248        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
5249                kvm_before_handle_nmi(&svm->vcpu);
5250
5251        stgi();
5252
5253        /* Any pending NMI will happen here */
5254
5255        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
5256                kvm_after_handle_nmi(&svm->vcpu);
5257
5258        sync_cr8_to_lapic(vcpu);
5259
5260        svm->next_rip = 0;
5261
5262        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
5263
5264        /* If the exit was due to a #PF, check for an async page fault reason. */
5265        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
5266                svm->apf_reason = kvm_read_and_reset_pf_reason();
5267
5268        if (npt_enabled) {
5269                vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
5270                vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
5271        }
5272
5273        /*
5274         * We need to handle MC intercepts here before the vcpu has a chance to
5275         * change the physical cpu.
5276         */
5277        if (unlikely(svm->vmcb->control.exit_code ==
5278                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
5279                svm_handle_mce(svm);
5280
5281        mark_all_clean(svm->vmcb);
5282}
5283
5284static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
5285{
5286        struct vcpu_svm *svm = to_svm(vcpu);
5287
5288        svm->vmcb->save.cr3 = __sme_set(root);
5289        mark_dirty(svm->vmcb, VMCB_CR);
5290        svm_flush_tlb(vcpu);
5291}
5292
5293static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
5294{
5295        struct vcpu_svm *svm = to_svm(vcpu);
5296
5297        svm->vmcb->control.nested_cr3 = __sme_set(root);
5298        mark_dirty(svm->vmcb, VMCB_NPT);
5299
5300        /* Also sync guest cr3 here in case we live migrate */
5301        svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
5302        mark_dirty(svm->vmcb, VMCB_CR);
5303
5304        svm_flush_tlb(vcpu);
5305}
5306
5307static int is_disabled(void)
5308{
5309        u64 vm_cr;
5310
5311        rdmsrl(MSR_VM_CR, vm_cr);
5312        if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
5313                return 1;
5314
5315        return 0;
5316}
5317
5318static void
5319svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5320{
5321        /*
5322         * Patch in the VMMCALL instruction:
5323         */
5324        hypercall[0] = 0x0f;
5325        hypercall[1] = 0x01;
5326        hypercall[2] = 0xd9;
5327}
5328
5329static void svm_check_processor_compat(void *rtn)
5330{
5331        *(int *)rtn = 0;
5332}
5333
5334static bool svm_cpu_has_accelerated_tpr(void)
5335{
5336        return false;
5337}
5338
5339static bool svm_has_emulated_msr(int index)
5340{
5341        return true;
5342}
5343
5344static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
5345{
5346        return 0;
5347}
5348
5349static void svm_cpuid_update(struct kvm_vcpu *vcpu)
5350{
5351        struct vcpu_svm *svm = to_svm(vcpu);
5352
5353        /* Update nrips enabled cache */
5354        svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
5355
5356        if (!kvm_vcpu_apicv_active(vcpu))
5357                return;
5358
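        /* AVIC does not support x2APIC mode, so hide X2APIC from the guest. */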
5359        guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
5360}
5361
5362static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
5363{
5364        switch (func) {
5365        case 0x1:
5366                if (avic)
5367                        entry->ecx &= ~bit(X86_FEATURE_X2APIC);
5368                break;
5369        case 0x80000001:
5370                if (nested)
5371                        entry->ecx |= (1 << 2); /* Set SVM bit */
5372                break;
5373        case 0x8000000A:
5374                entry->eax = 1; /* SVM revision 1 */
5375                entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
5376                                   ASID emulation to nested SVM */
5377                entry->ecx = 0; /* Reserved */
5378                entry->edx = 0; /* By default, do not support any
5379                                   additional features */
5380
5381                /* Support next_rip if host supports it */
5382                if (boot_cpu_has(X86_FEATURE_NRIPS))
5383                        entry->edx |= SVM_FEATURE_NRIP;
5384
5385                /* Support NPT for the guest if enabled */
5386                if (npt_enabled)
5387                        entry->edx |= SVM_FEATURE_NPT;
5388
5389                break;
5390        }
5391}
5392
5393static int svm_get_lpage_level(void)
5394{
5395        return PT_PDPE_LEVEL;
5396}
5397
5398static bool svm_rdtscp_supported(void)
5399{
5400        return boot_cpu_has(X86_FEATURE_RDTSCP);
5401}
5402
5403static bool svm_invpcid_supported(void)
5404{
5405        return false;
5406}
5407
5408static bool svm_mpx_supported(void)
5409{
5410        return false;
5411}
5412
5413static bool svm_has_wbinvd_exit(void)
5414{
5415        return true;
5416}
5417
5418static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
5419{
5420        struct vcpu_svm *svm = to_svm(vcpu);
5421
5422        set_exception_intercept(svm, NM_VECTOR);
5423        update_cr0_intercept(svm);
5424}
5425
5426#define PRE_EX(exit)  { .exit_code = (exit), \
5427                        .stage = X86_ICPT_PRE_EXCEPT, }
5428#define POST_EX(exit) { .exit_code = (exit), \
5429                        .stage = X86_ICPT_POST_EXCEPT, }
5430#define POST_MEM(exit) { .exit_code = (exit), \
5431                        .stage = X86_ICPT_POST_MEMACCESS, }
5432
5433static const struct __x86_intercept {
5434        u32 exit_code;
5435        enum x86_intercept_stage stage;
5436} x86_intercept_map[] = {
5437        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
5438        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
5439        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
5440        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
5441        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
5442        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
5443        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
5444        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
5445        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
5446        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
5447        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
5448        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
5449        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
5450        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
5451        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
5452        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
5453        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
5454        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
5455        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
5456        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
5457        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
5458        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
5459        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
5460        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
5461        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
5462        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
5463        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
5464        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
5465        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
5466        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
5467        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
5468        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
5469        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
5470        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
5471        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
5472        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
5473        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
5474        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
5475        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
5476        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
5477        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
5478        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
5479        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
5480        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
5481        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
5482        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
5483};
5484
5485#undef PRE_EX
5486#undef POST_EX
5487#undef POST_MEM
5488
5489static int svm_check_intercept(struct kvm_vcpu *vcpu,
5490                               struct x86_instruction_info *info,
5491                               enum x86_intercept_stage stage)
5492{
5493        struct vcpu_svm *svm = to_svm(vcpu);
5494        int vmexit, ret = X86EMUL_CONTINUE;
5495        struct __x86_intercept icpt_info;
5496        struct vmcb *vmcb = svm->vmcb;
5497
5498        if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
5499                goto out;
5500
5501        icpt_info = x86_intercept_map[info->intercept];
5502
5503        if (stage != icpt_info.stage)
5504                goto out;
5505
5506        switch (icpt_info.exit_code) {
5507        case SVM_EXIT_READ_CR0:
5508                if (info->intercept == x86_intercept_cr_read)
5509                        icpt_info.exit_code += info->modrm_reg;
5510                break;
5511        case SVM_EXIT_WRITE_CR0: {
5512                unsigned long cr0, val;
5513                u64 intercept;
5514
5515                if (info->intercept == x86_intercept_cr_write)
5516                        icpt_info.exit_code += info->modrm_reg;
5517
5518                if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
5519                    info->intercept == x86_intercept_clts)
5520                        break;
5521
5522                intercept = svm->nested.intercept;
5523
5524                if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
5525                        break;
5526
5527                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
5528                val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
5529
5530                if (info->intercept == x86_intercept_lmsw) {
5531                        cr0 &= 0xfUL;
5532                        val &= 0xfUL;
5533                        /* lmsw can't clear PE - catch this here */
5534                        if (cr0 & X86_CR0_PE)
5535                                val |= X86_CR0_PE;
5536                }
5537
5538                if (cr0 ^ val)
5539                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
5540
5541                break;
5542        }
5543        case SVM_EXIT_READ_DR0:
5544        case SVM_EXIT_WRITE_DR0:
5545                icpt_info.exit_code += info->modrm_reg;
5546                break;
5547        case SVM_EXIT_MSR:
5548                if (info->intercept == x86_intercept_wrmsr)
5549                        vmcb->control.exit_info_1 = 1;
5550                else
5551                        vmcb->control.exit_info_1 = 0;
5552                break;
5553        case SVM_EXIT_PAUSE:
5554                /*
5555                 * We only get this intercept for NOP; PAUSE is encoded as
5556                 * REP NOP, so check the REP prefix here to tell them apart.
5557                 */
5558                if (info->rep_prefix != REPE_PREFIX)
5559                        goto out;
                break;
5560        case SVM_EXIT_IOIO: {
5561                u64 exit_info;
5562                u32 bytes;
5563
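                /*
                 * Build the IOIO EXITINFO1 the nested hypervisor expects:
                 * port number in bits 31:16 plus the type (IN), string, REP
                 * and size bits.
                 */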
5564                if (info->intercept == x86_intercept_in ||
5565                    info->intercept == x86_intercept_ins) {
5566                        exit_info = ((info->src_val & 0xffff) << 16) |
5567                                SVM_IOIO_TYPE_MASK;
5568                        bytes = info->dst_bytes;
5569                } else {
5570                        exit_info = (info->dst_val & 0xffff) << 16;
5571                        bytes = info->src_bytes;
5572                }
5573
5574                if (info->intercept == x86_intercept_outs ||
5575                    info->intercept == x86_intercept_ins)
5576                        exit_info |= SVM_IOIO_STR_MASK;
5577
5578                if (info->rep_prefix)
5579                        exit_info |= SVM_IOIO_REP_MASK;
5580
5581                bytes = min(bytes, 4u);
5582
5583                exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
5584
5585                exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
5586
5587                vmcb->control.exit_info_1 = exit_info;
5588                vmcb->control.exit_info_2 = info->next_rip;
5589
5590                break;
5591        }
5592        default:
5593                break;
5594        }
5595
5596        /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
5597        if (static_cpu_has(X86_FEATURE_NRIPS))
5598                vmcb->control.next_rip  = info->next_rip;
5599        vmcb->control.exit_code = icpt_info.exit_code;
5600        vmexit = nested_svm_exit_handled(svm);
5601
5602        ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
5603                                           : X86EMUL_CONTINUE;
5604
5605out:
5606        return ret;
5607}
5608
5609static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
5610{
5611        local_irq_enable();
5612}
5613
5614static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
5615{
5616        if (pause_filter_thresh)
5617                shrink_ple_window(vcpu);
5618}
5619
5620static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
5621{
5622        if (avic_handle_apic_id_update(vcpu) != 0)
5623                return;
5624        if (avic_handle_dfr_update(vcpu) != 0)
5625                return;
5626        avic_handle_ldr_update(vcpu);
5627}
5628
5629static int svm_smi_allowed(struct kvm_vcpu *vcpu)
5630{
5631        struct vcpu_svm *svm = to_svm(vcpu);
5632
5633        /* Per APM Vol.2 15.22.2 "Response to SMI" */
5634        if (!gif_set(svm))
5635                return 0;
5636
5637        if (is_guest_mode(&svm->vcpu) &&
5638            svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
5639                /* TODO: Might need to set exit_info_1 and exit_info_2 here */
5640                svm->vmcb->control.exit_code = SVM_EXIT_SMI;
5641                svm->nested.exit_required = true;
5642                return 0;
5643        }
5644
5645        return 1;
5646}
5647
5648static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
5649{
5650        struct vcpu_svm *svm = to_svm(vcpu);
5651        int ret;
5652
5653        if (is_guest_mode(vcpu)) {
5654                /* FED8h - SVM Guest */
5655                put_smstate(u64, smstate, 0x7ed8, 1);
5656                /* FEE0h - SVM Guest VMCB Physical Address */
5657                put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
5658
5659                svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
5660                svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
5661                svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
5662
5663                ret = nested_svm_vmexit(svm);
5664                if (ret)
5665                        return ret;
5666        }
5667        return 0;
5668}
5669
5670static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
5671{
5672        struct vcpu_svm *svm = to_svm(vcpu);
5673        struct vmcb *nested_vmcb;
5674        struct page *page;
5675        struct {
5676                u64 guest;
5677                u64 vmcb;
5678        } svm_state_save;
5679        int ret;
5680
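        /*
         * Read back the state that svm_pre_enter_smm() stashed in the SMM
         * state-save area (FED8h: in-guest flag, FEE0h: nested VMCB address)
         * and re-enter guest mode if the vCPU was in a nested guest when the
         * SMI arrived.
         */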
5681        ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
5682                                  sizeof(svm_state_save));
5683        if (ret)
5684                return ret;
5685
5686        if (svm_state_save.guest) {
5687                vcpu->arch.hflags &= ~HF_SMM_MASK;
5688                nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
5689                if (nested_vmcb)
5690                        enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
5691                else
5692                        ret = 1;
5693                vcpu->arch.hflags |= HF_SMM_MASK;
5694        }
5695        return ret;
5696}
5697
5698static int enable_smi_window(struct kvm_vcpu *vcpu)
5699{
5700        struct vcpu_svm *svm = to_svm(vcpu);
5701
5702        if (!gif_set(svm)) {
5703                if (vgif_enabled(svm))
5704                        set_intercept(svm, INTERCEPT_STGI);
5705                /* STGI will cause a vm exit */
5706                return 1;
5707        }
5708        return 0;
5709}
5710
5711static struct kvm_x86_ops svm_x86_ops = {
5712        .cpu_has_kvm_support = has_svm,
5713        .disabled_by_bios = is_disabled,
5714        .hardware_setup = svm_hardware_setup,
5715        .hardware_unsetup = svm_hardware_unsetup,
5716        .check_processor_compatibility = svm_check_processor_compat,
5717        .hardware_enable = svm_hardware_enable,
5718        .hardware_disable = svm_hardware_disable,
5719        .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
5720        .has_emulated_msr = svm_has_emulated_msr,
5721
5722        .vcpu_create = svm_create_vcpu,
5723        .vcpu_free = svm_free_vcpu,
5724        .vcpu_reset = svm_vcpu_reset,
5725
5726        .vm_init = avic_vm_init,
5727        .vm_destroy = avic_vm_destroy,
5728
5729        .prepare_guest_switch = svm_prepare_guest_switch,
5730        .vcpu_load = svm_vcpu_load,
5731        .vcpu_put = svm_vcpu_put,
5732        .vcpu_blocking = svm_vcpu_blocking,
5733        .vcpu_unblocking = svm_vcpu_unblocking,
5734
5735        .update_bp_intercept = update_bp_intercept,
5736        .get_msr = svm_get_msr,
5737        .set_msr = svm_set_msr,
5738        .get_segment_base = svm_get_segment_base,
5739        .get_segment = svm_get_segment,
5740        .set_segment = svm_set_segment,
5741        .get_cpl = svm_get_cpl,
5742        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
5743        .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
5744        .decache_cr3 = svm_decache_cr3,
5745        .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
5746        .set_cr0 = svm_set_cr0,
5747        .set_cr3 = svm_set_cr3,
5748        .set_cr4 = svm_set_cr4,
5749        .set_efer = svm_set_efer,
5750        .get_idt = svm_get_idt,
5751        .set_idt = svm_set_idt,
5752        .get_gdt = svm_get_gdt,
5753        .set_gdt = svm_set_gdt,
5754        .get_dr6 = svm_get_dr6,
5755        .set_dr6 = svm_set_dr6,
5756        .set_dr7 = svm_set_dr7,
5757        .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
5758        .cache_reg = svm_cache_reg,
5759        .get_rflags = svm_get_rflags,
5760        .set_rflags = svm_set_rflags,
5761
5762        .fpu_activate = svm_fpu_activate,
5763        .fpu_deactivate = svm_fpu_deactivate,
5764
5765        .tlb_flush = svm_flush_tlb,
5766
5767        .run = svm_vcpu_run,
5768        .handle_exit = handle_exit,
5769        .skip_emulated_instruction = skip_emulated_instruction,
5770        .set_interrupt_shadow = svm_set_interrupt_shadow,
5771        .get_interrupt_shadow = svm_get_interrupt_shadow,
5772        .patch_hypercall = svm_patch_hypercall,
5773        .set_irq = svm_set_irq,
5774        .set_nmi = svm_inject_nmi,
5775        .queue_exception = svm_queue_exception,
5776        .cancel_injection = svm_cancel_injection,
5777        .interrupt_allowed = svm_interrupt_allowed,
5778        .nmi_allowed = svm_nmi_allowed,
5779        .get_nmi_mask = svm_get_nmi_mask,
5780        .set_nmi_mask = svm_set_nmi_mask,
5781        .enable_nmi_window = enable_nmi_window,
5782        .enable_irq_window = enable_irq_window,
5783        .update_cr8_intercept = update_cr8_intercept,
5784        .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
5785        .get_enable_apicv = svm_get_enable_apicv,
5786        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
5787        .load_eoi_exitmap = svm_load_eoi_exitmap,
5788        .hwapic_irr_update = svm_hwapic_irr_update,
5789        .hwapic_isr_update = svm_hwapic_isr_update,
5790        .apicv_post_state_restore = avic_post_state_restore,
5791
5792        .set_tss_addr = svm_set_tss_addr,
5793        .get_tdp_level = get_npt_level,
5794        .get_mt_mask = svm_get_mt_mask,
5795
5796        .get_exit_info = svm_get_exit_info,
5797
5798        .get_lpage_level = svm_get_lpage_level,
5799
5800        .cpuid_update = svm_cpuid_update,
5801
5802        .rdtscp_supported = svm_rdtscp_supported,
5803        .invpcid_supported = svm_invpcid_supported,
5804        .mpx_supported = svm_mpx_supported,
5805
5806        .set_supported_cpuid = svm_set_supported_cpuid,
5807
5808        .has_wbinvd_exit = svm_has_wbinvd_exit,
5809
5810        .write_tsc_offset = svm_write_tsc_offset,
5811
5812        .set_tdp_cr3 = set_tdp_cr3,
5813
5814        .check_intercept = svm_check_intercept,
5815        .handle_external_intr = svm_handle_external_intr,
5816
5817        .sched_in = svm_sched_in,
5818
5819        .pmu_ops = &amd_pmu_ops,
5820        .deliver_posted_interrupt = svm_deliver_avic_intr,
5821        .update_pi_irte = svm_update_pi_irte,
5822
5823        .smi_allowed = svm_smi_allowed,
5824        .pre_enter_smm = svm_pre_enter_smm,
5825        .pre_leave_smm = svm_pre_leave_smm,
5826        .enable_smi_window = enable_smi_window,
5827};
5828
5829static int __init svm_init(void)
5830{
5831        return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
5832                        __alignof__(struct vcpu_svm), THIS_MODULE);
5833}
5834
5835static void __exit svm_exit(void)
5836{
5837        kvm_exit();
5838}
5839
5840module_init(svm_init)
5841module_exit(svm_exit)
5842