linux/arch/x86/kvm/svm/svm.c
   1#define pr_fmt(fmt) "SVM: " fmt
   2
   3#include <linux/kvm_host.h>
   4
   5#include "irq.h"
   6#include "mmu.h"
   7#include "kvm_cache_regs.h"
   8#include "x86.h"
   9#include "cpuid.h"
  10#include "pmu.h"
  11
  12#include <linux/module.h>
  13#include <linux/mod_devicetable.h>
  14#include <linux/kernel.h>
  15#include <linux/vmalloc.h>
  16#include <linux/highmem.h>
  17#include <linux/amd-iommu.h>
  18#include <linux/sched.h>
  19#include <linux/trace_events.h>
  20#include <linux/slab.h>
  21#include <linux/hashtable.h>
  22#include <linux/objtool.h>
  23#include <linux/psp-sev.h>
  24#include <linux/file.h>
  25#include <linux/pagemap.h>
  26#include <linux/swap.h>
  27#include <linux/rwsem.h>
  28
  29#include <asm/apic.h>
  30#include <asm/perf_event.h>
  31#include <asm/tlbflush.h>
  32#include <asm/desc.h>
  33#include <asm/debugreg.h>
  34#include <asm/kvm_para.h>
  35#include <asm/irq_remapping.h>
  36#include <asm/spec-ctrl.h>
  37#include <asm/cpu_device_id.h>
  38#include <asm/traps.h>
  39#include <asm/fpu/api.h>
  40
  41#include <asm/virtext.h>
  42#include "trace.h"
  43
  44#include "svm.h"
  45#include "svm_ops.h"
  46
  47#include "kvm_onhyperv.h"
  48#include "svm_onhyperv.h"
  49
  50MODULE_AUTHOR("Qumranet");
  51MODULE_LICENSE("GPL");
  52
  53#ifdef MODULE
  54static const struct x86_cpu_id svm_cpu_id[] = {
  55        X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  56        {}
  57};
  58MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  59#endif
  60
  61#define SEG_TYPE_LDT 2
  62#define SEG_TYPE_BUSY_TSS16 3
  63
  64#define SVM_FEATURE_LBRV           (1 <<  1)
  65#define SVM_FEATURE_SVML           (1 <<  2)
  66#define SVM_FEATURE_TSC_RATE       (1 <<  4)
  67#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
  68#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
  69#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  70#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  71
  72#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
  73
  74#define TSC_RATIO_RSVD          0xffffff0000000000ULL
  75#define TSC_RATIO_MIN           0x0000000000000001ULL
  76#define TSC_RATIO_MAX           0x000000ffffffffffULL
  77
  78static bool erratum_383_found __read_mostly;
  79
  80u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  81
  82/*
  83 * Set osvw_len to a higher value when updated Revision Guides
  84 * are published and we know what the new status bits are.
  85 */
  86static uint64_t osvw_len = 4, osvw_status;
  87
  88static DEFINE_PER_CPU(u64, current_tsc_ratio);
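    /*
     * MSR_AMD64_TSC_RATIO is an 8.32 fixed-point multiplier (integer part in
     * bits 39:32, fraction in bits 31:0), so the default below is 1.0, i.e.
     * the guest TSC ticks at the host rate.
     */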
  89#define TSC_RATIO_DEFAULT       0x0100000000ULL
  90
  91static const struct svm_direct_access_msrs {
  92        u32 index;   /* Index of the MSR */
  93        bool always; /* True if intercept is initially cleared */
  94} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  95        { .index = MSR_STAR,                            .always = true  },
  96        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
  97        { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
  98        { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
  99#ifdef CONFIG_X86_64
 100        { .index = MSR_GS_BASE,                         .always = true  },
 101        { .index = MSR_FS_BASE,                         .always = true  },
 102        { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
 103        { .index = MSR_LSTAR,                           .always = true  },
 104        { .index = MSR_CSTAR,                           .always = true  },
 105        { .index = MSR_SYSCALL_MASK,                    .always = true  },
 106#endif
 107        { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
 108        { .index = MSR_IA32_PRED_CMD,                   .always = false },
 109        { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
 110        { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
 111        { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
 112        { .index = MSR_IA32_LASTINTTOIP,                .always = false },
 113        { .index = MSR_EFER,                            .always = false },
 114        { .index = MSR_IA32_CR_PAT,                     .always = false },
 115        { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
 116        { .index = MSR_INVALID,                         .always = false },
 117};
 118
 119/*
 120 * These two parameters are used to configure the controls for Pause-Loop Exiting:
 121 * pause_filter_count: On processors that support Pause filtering (indicated
 122 *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 123 *      count value. On VMRUN this value is loaded into an internal counter.
 124 *      Each time a pause instruction is executed, this counter is decremented
 125 *      until it reaches zero, at which time a #VMEXIT is generated if pause
 126 *      intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 127 *      Intercept Filtering for more details.
 128 *      This also indicates whether PLE logic is enabled.
 129 *
 130 * pause_filter_thresh: In addition, some processor families support advanced
 131 *      pause filtering (indicated by CPUID Fn8000_000A_EDX), which bounds
 132 *      the amount of time a guest is allowed to execute in a pause loop.
 133 *      In this mode, a 16-bit pause filter threshold field is added in the
 134 *      VMCB. The threshold value is a cycle count that is used to reset the
 135 *      pause counter. As with simple pause filtering, VMRUN loads the pause
 136 *      count value from VMCB into an internal counter. Then, on each pause
 137 *      instruction the hardware checks the elapsed number of cycles since
 138 *      the most recent pause instruction against the pause filter threshold.
 139 *      If the elapsed cycle count is greater than the pause filter threshold,
 140 *      then the internal pause count is reloaded from the VMCB and execution
 141 *      continues. If the elapsed cycle count is less than the pause filter
 142 *      threshold, then the internal pause count is decremented. If the count
 143 *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 144 *      triggered. If advanced pause filtering is supported and pause filter
 145 *      threshold field is set to zero, the filter will operate in the simpler,
 146 *      count only mode.
 147 */
 148
 149static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 150module_param(pause_filter_thresh, ushort, 0444);
 151
 152static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 153module_param(pause_filter_count, ushort, 0444);
 154
 155/* Default doubles per-vcpu window every exit. */
 156static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 157module_param(pause_filter_count_grow, ushort, 0444);
 158
 159/* Default resets per-vcpu window every exit to pause_filter_count. */
 160static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 161module_param(pause_filter_count_shrink, ushort, 0444);
 162
 163/* Default is to compute the maximum so we can never overflow. */
 164static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 165module_param(pause_filter_count_max, ushort, 0444);
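
    /*
     * Illustrative sketch only, not used by the driver: a software model of
     * the simple and advanced pause filtering behaviour described in the
     * comment above.  The names below are made up for illustration; real
     * hardware keeps this state internally and KVM merely programs the
     * pause_filter_count/pause_filter_thresh fields of the VMCB.
     */
    struct ple_filter_model {
            u16 filter_count;       /* reload value (VMCB pause_filter_count) */
            u16 filter_thresh;      /* cycle threshold (VMCB pause_filter_thresh) */
            int internal_count;     /* internal counter, loaded on VMRUN */
            u64 last_pause_cycles;  /* cycle count at the most recent PAUSE */
    };

    /* Model one PAUSE; returns true if it would #VMEXIT (intercept enabled). */
    static __maybe_unused bool ple_model_pause(struct ple_filter_model *m,
                                               u64 now_cycles)
    {
            bool vmexit = false;

            if (!m->filter_thresh) {
                    /* Simple mode: count PAUSEs down to zero. */
                    vmexit = --m->internal_count <= 0;
            } else if (now_cycles - m->last_pause_cycles > m->filter_thresh) {
                    /* Long gap since the last PAUSE: reload and keep running. */
                    m->internal_count = m->filter_count;
            } else {
                    /* Tight pause loop: decrement, exit once the count goes negative. */
                    vmexit = --m->internal_count < 0;
            }

            m->last_pause_cycles = now_cycles;
            return vmexit;
    }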
 166
 167/*
 168 * Use nested page tables by default.  Note, NPT may get forced off by
 169 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 170 */
 171bool npt_enabled = true;
 172module_param_named(npt, npt_enabled, bool, 0444);
 173
 174/* allow nested virtualization in KVM/SVM */
 175static int nested = true;
 176module_param(nested, int, S_IRUGO);
 177
 178/* enable/disable Next RIP Save */
 179static int nrips = true;
 180module_param(nrips, int, 0444);
 181
 182/* enable/disable Virtual VMLOAD VMSAVE */
 183static int vls = true;
 184module_param(vls, int, 0444);
 185
 186/* enable/disable Virtual GIF */
 187static int vgif = true;
 188module_param(vgif, int, 0444);
 189
 190/*
 191 * enable / disable AVIC.  Because the defaults differ for APICv
 192 * support between VMX and SVM we cannot use module_param_named.
 193 */
 194static bool avic;
 195module_param(avic, bool, 0444);
 196
 197bool __read_mostly dump_invalid_vmcb;
 198module_param(dump_invalid_vmcb, bool, 0644);
 199
 200
 201bool intercept_smi = true;
 202module_param(intercept_smi, bool, 0444);
 203
 204
 205static bool svm_gp_erratum_intercept = true;
 206
 207static u8 rsm_ins_bytes[] = "\x0f\xaa";
 208
 209static unsigned long iopm_base;
 210
 211struct kvm_ldttss_desc {
 212        u16 limit0;
 213        u16 base0;
 214        unsigned base1:8, type:5, dpl:2, p:1;
 215        unsigned limit1:4, zero0:3, g:1, base2:8;
 216        u32 base3;
 217        u32 zero1;
 218} __attribute__((packed));
 219
 220DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 221
 222/*
 223 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 224 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 225 *
 226 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 227 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 228 */
 229static int tsc_aux_uret_slot __read_mostly = -1;
 230
 231static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 232
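    /*
     * Each 2K byte range of the MSR permission map holds two intercept bits
     * (read and write) per MSR, hence 2048 * 8 / 2 = 8192 MSRs per range.
     */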
 233#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 234#define MSRS_RANGE_SIZE 2048
 235#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 236
 237u32 svm_msrpm_offset(u32 msr)
 238{
 239        u32 offset;
 240        int i;
 241
 242        for (i = 0; i < NUM_MSR_MAPS; i++) {
 243                if (msr < msrpm_ranges[i] ||
 244                    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 245                        continue;
 246
 247                offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 248                offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 249
 250                /* Now we have the u8 offset - but need the u32 offset */
 251                return offset / 4;
 252        }
 253
 254        /* MSR not in any range */
 255        return MSR_INVALID;
 256}
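
    /*
     * Worked example, for illustration only: MSR_STAR (0xc0000081) falls in
     * msrpm_ranges[1] (base 0xc0000000), so its byte offset is
     * 0x81 / 4 + 1 * 2048 = 2080 and the u32 offset returned above is
     * 2080 / 4 = 520.  Its read/write intercept bits then live at bit
     * positions 2 * (0x81 & 0x0f) = 2 and 3 of that u32, as computed in
     * set_msr_interception_bitmap() below.
     */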
 257
 258#define MAX_INST_SIZE 15
 259
 260static int get_max_npt_level(void)
 261{
 262#ifdef CONFIG_X86_64
 263        return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 264#else
 265        return PT32E_ROOT_LEVEL;
 266#endif
 267}
 268
 269int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 270{
 271        struct vcpu_svm *svm = to_svm(vcpu);
 272        u64 old_efer = vcpu->arch.efer;
 273        vcpu->arch.efer = efer;
 274
 275        if (!npt_enabled) {
 276                /* Shadow paging assumes NX to be available.  */
 277                efer |= EFER_NX;
 278
 279                if (!(efer & EFER_LMA))
 280                        efer &= ~EFER_LME;
 281        }
 282
 283        if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 284                if (!(efer & EFER_SVME)) {
 285                        svm_leave_nested(svm);
 286                        svm_set_gif(svm, true);
 287                        /* #GP intercept is still needed for vmware backdoor */
 288                        if (!enable_vmware_backdoor)
 289                                clr_exception_intercept(svm, GP_VECTOR);
 290
 291                        /*
 292                         * Free the nested guest state, unless we are in SMM.
 293                         * In this case we will return to the nested guest
 294                         * as soon as we leave SMM.
 295                         */
 296                        if (!is_smm(vcpu))
 297                                svm_free_nested(svm);
 298
 299                } else {
 300                        int ret = svm_allocate_nested(svm);
 301
 302                        if (ret) {
 303                                vcpu->arch.efer = old_efer;
 304                                return ret;
 305                        }
 306
 307                        if (svm_gp_erratum_intercept)
 308                                set_exception_intercept(svm, GP_VECTOR);
 309                }
 310        }
 311
 312        svm->vmcb->save.efer = efer | EFER_SVME;
 313        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 314        return 0;
 315}
 316
 317static int is_external_interrupt(u32 info)
 318{
 319        info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 320        return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 321}
 322
 323static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 324{
 325        struct vcpu_svm *svm = to_svm(vcpu);
 326        u32 ret = 0;
 327
 328        if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 329                ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 330        return ret;
 331}
 332
 333static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 334{
 335        struct vcpu_svm *svm = to_svm(vcpu);
 336
 337        if (mask == 0)
 338                svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 339        else
 340                svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 341
 342}
 343
 344static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 345{
 346        struct vcpu_svm *svm = to_svm(vcpu);
 347
 348        /*
 349         * SEV-ES does not expose the next RIP. The RIP update is controlled by
 350         * the type of exit and the #VC handler in the guest.
 351         */
 352        if (sev_es_guest(vcpu->kvm))
 353                goto done;
 354
 355        if (nrips && svm->vmcb->control.next_rip != 0) {
 356                WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 357                svm->next_rip = svm->vmcb->control.next_rip;
 358        }
 359
 360        if (!svm->next_rip) {
 361                if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 362                        return 0;
 363        } else {
 364                kvm_rip_write(vcpu, svm->next_rip);
 365        }
 366
 367done:
 368        svm_set_interrupt_shadow(vcpu, 0);
 369
 370        return 1;
 371}
 372
 373static void svm_queue_exception(struct kvm_vcpu *vcpu)
 374{
 375        struct vcpu_svm *svm = to_svm(vcpu);
 376        unsigned nr = vcpu->arch.exception.nr;
 377        bool has_error_code = vcpu->arch.exception.has_error_code;
 378        u32 error_code = vcpu->arch.exception.error_code;
 379
 380        kvm_deliver_exception_payload(vcpu);
 381
 382        if (nr == BP_VECTOR && !nrips) {
 383                unsigned long rip, old_rip = kvm_rip_read(vcpu);
 384
 385                /*
 386                 * For guest debugging where we have to reinject #BP if some
 387                 * INT3 is guest-owned:
 388                 * Emulate nRIP by moving RIP forward. Will fail if injection
 389                 * raises a fault that is not intercepted. Still better than
 390                 * failing in all cases.
 391                 */
 392                (void)skip_emulated_instruction(vcpu);
 393                rip = kvm_rip_read(vcpu);
 394                svm->int3_rip = rip + svm->vmcb->save.cs.base;
 395                svm->int3_injected = rip - old_rip;
 396        }
 397
 398        svm->vmcb->control.event_inj = nr
 399                | SVM_EVTINJ_VALID
 400                | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 401                | SVM_EVTINJ_TYPE_EXEPT;
 402        svm->vmcb->control.event_inj_err = error_code;
 403}
 404
 405static void svm_init_erratum_383(void)
 406{
 407        u32 low, high;
 408        int err;
 409        u64 val;
 410
 411        if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 412                return;
 413
 414        /* Use _safe variants to not break nested virtualization */
 415        val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 416        if (err)
 417                return;
 418
 419        val |= (1ULL << 47);
 420
 421        low  = lower_32_bits(val);
 422        high = upper_32_bits(val);
 423
 424        native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 425
 426        erratum_383_found = true;
 427}
 428
 429static void svm_init_osvw(struct kvm_vcpu *vcpu)
 430{
 431        /*
 432         * Guests should see errata 400 and 415 as fixed (assuming that
 433         * HLT and IO instructions are intercepted).
 434         */
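            /*
             * The ~6ULL below clears OSVW status bits 1 and 2, i.e. the bits
             * covering the errata 400 and 415 mentioned above.
             */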
 435        vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 436        vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 437
 438        /*
 439         * By increasing VCPU's osvw.length to 3 we are telling the guest that
 440         * all osvw.status bits inside that length, including bit 0 (which is
 441         * reserved for erratum 298), are valid. However, if host processor's
 442         * osvw_len is 0 then osvw_status[0] carries no information. We need to
 443         * be conservative here and therefore we tell the guest that erratum 298
 444         * is present (because we really don't know).
 445         */
 446        if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 447                vcpu->arch.osvw.status |= 1;
 448}
 449
 450static int has_svm(void)
 451{
 452        const char *msg;
 453
 454        if (!cpu_has_svm(&msg)) {
 455                printk(KERN_INFO "has_svm: %s\n", msg);
 456                return 0;
 457        }
 458
 459        if (sev_active()) {
 460                pr_info("KVM is unsupported when running as an SEV guest\n");
 461                return 0;
 462        }
 463
 464        return 1;
 465}
 466
 467static void svm_hardware_disable(void)
 468{
 469        /* Make sure we clean up behind us */
 470        if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
 471                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 472
 473        cpu_svm_disable();
 474
 475        amd_pmu_disable_virt();
 476}
 477
 478static int svm_hardware_enable(void)
 479{
 480
 481        struct svm_cpu_data *sd;
 482        uint64_t efer;
 483        struct desc_struct *gdt;
 484        int me = raw_smp_processor_id();
 485
 486        rdmsrl(MSR_EFER, efer);
 487        if (efer & EFER_SVME)
 488                return -EBUSY;
 489
 490        if (!has_svm()) {
 491                pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 492                return -EINVAL;
 493        }
 494        sd = per_cpu(svm_data, me);
 495        if (!sd) {
 496                pr_err("%s: svm_data is NULL on %d\n", __func__, me);
 497                return -EINVAL;
 498        }
 499
 500        sd->asid_generation = 1;
 501        sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 502        sd->next_asid = sd->max_asid + 1;
 503        sd->min_asid = max_sev_asid + 1;
 504
 505        gdt = get_current_gdt_rw();
 506        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 507
 508        wrmsrl(MSR_EFER, efer | EFER_SVME);
 509
 510        wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
 511
 512        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 513                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 514                __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
 515        }
 516
 517
 518        /*
 519         * Get OSVW bits.
 520         *
 521         * Note that it is possible to have a system with mixed processor
 522         * revisions and therefore different OSVW bits. If bits are not the same
 523         * on different processors then choose the worst case (i.e. if erratum
 524         * is present on one processor and not on another then assume that the
 525         * erratum is present everywhere).
 526         */
 527        if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 528                uint64_t len, status = 0;
 529                int err;
 530
 531                len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 532                if (!err)
 533                        status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 534                                                      &err);
 535
 536                if (err)
 537                        osvw_status = osvw_len = 0;
 538                else {
 539                        if (len < osvw_len)
 540                                osvw_len = len;
 541                        osvw_status |= status;
 542                        osvw_status &= (1ULL << osvw_len) - 1;
 543                }
 544        } else
 545                osvw_status = osvw_len = 0;
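
            /*
             * Worked example, for illustration: if one CPU reports an OSVW
             * length of 3 with status 0b010 and another reports length 2 with
             * status 0b001, the merged result is osvw_len = 2 and
             * osvw_status = 0b11, i.e. both errata are treated as present and
             * only status bits below the shorter length are trusted.
             */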
 546
 547        svm_init_erratum_383();
 548
 549        amd_pmu_enable_virt();
 550
 551        return 0;
 552}
 553
 554static void svm_cpu_uninit(int cpu)
 555{
 556        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 557
 558        if (!sd)
 559                return;
 560
 561        per_cpu(svm_data, cpu) = NULL;
 562        kfree(sd->sev_vmcbs);
 563        __free_page(sd->save_area);
 564        kfree(sd);
 565}
 566
 567static int svm_cpu_init(int cpu)
 568{
 569        struct svm_cpu_data *sd;
 570        int ret = -ENOMEM;
 571
 572        sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 573        if (!sd)
 574                return ret;
 575        sd->cpu = cpu;
 576        sd->save_area = alloc_page(GFP_KERNEL);
 577        if (!sd->save_area)
 578                goto free_cpu_data;
 579
 580        clear_page(page_address(sd->save_area));
 581
 582        ret = sev_cpu_init(sd);
 583        if (ret)
 584                goto free_save_area;
 585
 586        per_cpu(svm_data, cpu) = sd;
 587
 588        return 0;
 589
 590free_save_area:
 591        __free_page(sd->save_area);
 592free_cpu_data:
 593        kfree(sd);
 594        return ret;
 595
 596}
 597
 598static int direct_access_msr_slot(u32 msr)
 599{
 600        u32 i;
 601
 602        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 603                if (direct_access_msrs[i].index == msr)
 604                        return i;
 605
 606        return -ENOENT;
 607}
 608
 609static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 610                                     int write)
 611{
 612        struct vcpu_svm *svm = to_svm(vcpu);
 613        int slot = direct_access_msr_slot(msr);
 614
 615        if (slot == -ENOENT)
 616                return;
 617
 618        /* Set the shadow bitmaps to the desired intercept states */
 619        if (read)
 620                set_bit(slot, svm->shadow_msr_intercept.read);
 621        else
 622                clear_bit(slot, svm->shadow_msr_intercept.read);
 623
 624        if (write)
 625                set_bit(slot, svm->shadow_msr_intercept.write);
 626        else
 627                clear_bit(slot, svm->shadow_msr_intercept.write);
 628}
 629
 630static bool valid_msr_intercept(u32 index)
 631{
 632        return direct_access_msr_slot(index) != -ENOENT;
 633}
 634
 635static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 636{
 637        u8 bit_write;
 638        unsigned long tmp;
 639        u32 offset;
 640        u32 *msrpm;
 641
 642        msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 643                                      to_svm(vcpu)->msrpm;
 644
 645        offset    = svm_msrpm_offset(msr);
 646        bit_write = 2 * (msr & 0x0f) + 1;
 647        tmp       = msrpm[offset];
 648
 649        BUG_ON(offset == MSR_INVALID);
 650
 651        return !!test_bit(bit_write,  &tmp);
 652}
 653
 654static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 655                                        u32 msr, int read, int write)
 656{
 657        u8 bit_read, bit_write;
 658        unsigned long tmp;
 659        u32 offset;
 660
 661        /*
 662         * If this warning triggers, extend the direct_access_msrs list at the
 663         * beginning of the file.
 664         */
 665        WARN_ON(!valid_msr_intercept(msr));
 666
 667        /* Enforce interception of MSRs not allowed by the MSR filter */
 668        if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 669                read = 0;
 670
 671        if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 672                write = 0;
 673
 674        offset    = svm_msrpm_offset(msr);
 675        bit_read  = 2 * (msr & 0x0f);
 676        bit_write = 2 * (msr & 0x0f) + 1;
 677        tmp       = msrpm[offset];
 678
 679        BUG_ON(offset == MSR_INVALID);
 680
 681        read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 682        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 683
 684        msrpm[offset] = tmp;
 685
 686        svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 687
 688}
 689
 690void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 691                          int read, int write)
 692{
 693        set_shadow_msr_intercept(vcpu, msr, read, write);
 694        set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 695}
 696
 697u32 *svm_vcpu_alloc_msrpm(void)
 698{
 699        unsigned int order = get_order(MSRPM_SIZE);
 700        struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 701        u32 *msrpm;
 702
 703        if (!pages)
 704                return NULL;
 705
 706        msrpm = page_address(pages);
 707        memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 708
 709        return msrpm;
 710}
 711
 712void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 713{
 714        int i;
 715
 716        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 717                if (!direct_access_msrs[i].always)
 718                        continue;
 719                set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 720        }
 721}
 722
 723
 724void svm_vcpu_free_msrpm(u32 *msrpm)
 725{
 726        __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 727}
 728
 729static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 730{
 731        struct vcpu_svm *svm = to_svm(vcpu);
 732        u32 i;
 733
 734        /*
 735         * Set intercept permissions for all direct access MSRs again. They
 736         * will automatically get filtered through the MSR filter, so we are
 737         * back in sync after this.
 738         */
 739        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 740                u32 msr = direct_access_msrs[i].index;
 741                u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 742                u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 743
 744                set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 745        }
 746}
 747
 748static void add_msr_offset(u32 offset)
 749{
 750        int i;
 751
 752        for (i = 0; i < MSRPM_OFFSETS; ++i) {
 753
 754                /* Offset already in list? */
 755                if (msrpm_offsets[i] == offset)
 756                        return;
 757
 758                /* Slot used by another offset? */
 759                if (msrpm_offsets[i] != MSR_INVALID)
 760                        continue;
 761
 762                /* Add offset to list */
 763                msrpm_offsets[i] = offset;
 764
 765                return;
 766        }
 767
 768        /*
 769         * If this BUG triggers, the msrpm_offsets table has overflowed. Just
 770         * increase MSRPM_OFFSETS in this case.
 771         */
 772        BUG();
 773}
 774
 775static void init_msrpm_offsets(void)
 776{
 777        int i;
 778
 779        memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 780
 781        for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 782                u32 offset;
 783
 784                offset = svm_msrpm_offset(direct_access_msrs[i].index);
 785                BUG_ON(offset == MSR_INVALID);
 786
 787                add_msr_offset(offset);
 788        }
 789}
 790
 791static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 792{
 793        struct vcpu_svm *svm = to_svm(vcpu);
 794
 795        svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
 796        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 797        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 798        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 799        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 800}
 801
 802static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
 803{
 804        struct vcpu_svm *svm = to_svm(vcpu);
 805
 806        svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
 807        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 808        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 809        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 810        set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 811}
 812
 813void disable_nmi_singlestep(struct vcpu_svm *svm)
 814{
 815        svm->nmi_singlestep = false;
 816
 817        if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
 818                /* Clear our flags if they were not set by the guest */
 819                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
 820                        svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
 821                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
 822                        svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
 823        }
 824}
 825
 826static void grow_ple_window(struct kvm_vcpu *vcpu)
 827{
 828        struct vcpu_svm *svm = to_svm(vcpu);
 829        struct vmcb_control_area *control = &svm->vmcb->control;
 830        int old = control->pause_filter_count;
 831
 832        control->pause_filter_count = __grow_ple_window(old,
 833                                                        pause_filter_count,
 834                                                        pause_filter_count_grow,
 835                                                        pause_filter_count_max);
 836
 837        if (control->pause_filter_count != old) {
 838                vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 839                trace_kvm_ple_window_update(vcpu->vcpu_id,
 840                                            control->pause_filter_count, old);
 841        }
 842}
 843
 844static void shrink_ple_window(struct kvm_vcpu *vcpu)
 845{
 846        struct vcpu_svm *svm = to_svm(vcpu);
 847        struct vmcb_control_area *control = &svm->vmcb->control;
 848        int old = control->pause_filter_count;
 849
 850        control->pause_filter_count =
 851                                __shrink_ple_window(old,
 852                                                    pause_filter_count,
 853                                                    pause_filter_count_shrink,
 854                                                    pause_filter_count);
 855        if (control->pause_filter_count != old) {
 856                vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 857                trace_kvm_ple_window_update(vcpu->vcpu_id,
 858                                            control->pause_filter_count, old);
 859        }
 860}
 861
 862/*
 863 * The default MMIO mask is a single bit (excluding the present bit),
 864 * which could conflict with the memory encryption bit. Check for
 865 * memory encryption support and override the default MMIO mask if
 866 * memory encryption is enabled.
 867 */
 868static __init void svm_adjust_mmio_mask(void)
 869{
 870        unsigned int enc_bit, mask_bit;
 871        u64 msr, mask;
 872
 873        /* If there is no memory encryption support, use existing mask */
 874        if (cpuid_eax(0x80000000) < 0x8000001f)
 875                return;
 876
 877        /* If memory encryption is not enabled, use existing mask */
 878        rdmsrl(MSR_AMD64_SYSCFG, msr);
 879        if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
 880                return;
 881
 882        enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
 883        mask_bit = boot_cpu_data.x86_phys_bits;
 884
 885        /* Increment the mask bit if it is the same as the encryption bit */
 886        if (enc_bit == mask_bit)
 887                mask_bit++;
 888
 889        /*
 890         * If the mask bit location is below 52, then some bits above the
 891         * physical addressing limit will always be reserved, so use the
 892         * rsvd_bits() function to generate the mask. This mask, along with
 893         * the present bit, will be used to generate a page fault with
 894         * PFER.RSV = 1.
 895         *
 896         * If the mask bit location is 52 (or above), then clear the mask.
 897         */
 898        mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
 899
 900        kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
 901}
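
    /*
     * Example with assumed values: on a part with 48 physical address bits
     * and the encryption bit at position 47, enc_bit != mask_bit, so the MMIO
     * mask becomes rsvd_bits(48, 51) | PT_PRESENT_MASK and MMIO accesses
     * fault with PFER.RSV = 1.  If the two bits coincided, mask_bit would be
     * bumped past the encryption bit before the same computation.
     */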
 902
 903static void svm_hardware_teardown(void)
 904{
 905        int cpu;
 906
 907        sev_hardware_teardown();
 908
 909        for_each_possible_cpu(cpu)
 910                svm_cpu_uninit(cpu);
 911
 912        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
 913        get_order(IOPM_SIZE));
 914        iopm_base = 0;
 915}
 916
 917static __init void svm_set_cpu_caps(void)
 918{
 919        kvm_set_cpu_caps();
 920
 921        supported_xss = 0;
 922
 923        /* CPUID 0x80000001 and 0x8000000A (SVM features) */
 924        if (nested) {
 925                kvm_cpu_cap_set(X86_FEATURE_SVM);
 926
 927                if (nrips)
 928                        kvm_cpu_cap_set(X86_FEATURE_NRIPS);
 929
 930                if (npt_enabled)
 931                        kvm_cpu_cap_set(X86_FEATURE_NPT);
 932
 933                /* Nested VM can receive #VMEXIT instead of triggering #GP */
 934                kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
 935        }
 936
 937        /* CPUID 0x80000008 */
 938        if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
 939            boot_cpu_has(X86_FEATURE_AMD_SSBD))
 940                kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
 941
 942        /* CPUID 0x8000001F (SME/SEV features) */
 943        sev_set_cpu_caps();
 944}
 945
 946static __init int svm_hardware_setup(void)
 947{
 948        int cpu;
 949        struct page *iopm_pages;
 950        void *iopm_va;
 951        int r;
 952        unsigned int order = get_order(IOPM_SIZE);
 953
 954        /*
 955         * NX is required for shadow paging and for NPT if the NX huge pages
 956         * mitigation is enabled.
 957         */
 958        if (!boot_cpu_has(X86_FEATURE_NX)) {
 959                pr_err_ratelimited("NX (Execute Disable) not supported\n");
 960                return -EOPNOTSUPP;
 961        }
 962        kvm_enable_efer_bits(EFER_NX);
 963
 964        iopm_pages = alloc_pages(GFP_KERNEL, order);
 965
 966        if (!iopm_pages)
 967                return -ENOMEM;
 968
 969        iopm_va = page_address(iopm_pages);
 970        memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
 971        iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 972
 973        init_msrpm_offsets();
 974
 975        supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 976
 977        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
 978                kvm_enable_efer_bits(EFER_FFXSR);
 979
 980        if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 981                kvm_has_tsc_control = true;
 982                kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
 983                kvm_tsc_scaling_ratio_frac_bits = 32;
 984        }
 985
 986        tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
 987
 988        /* Check for pause filtering support */
 989        if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
 990                pause_filter_count = 0;
 991                pause_filter_thresh = 0;
 992        } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
 993                pause_filter_thresh = 0;
 994        }
 995
 996        if (nested) {
 997                printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
 998                kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 999        }
1000
1001        /*
1002         * KVM's MMU doesn't support using 2-level paging for itself, and thus
1003         * NPT isn't supported if the host is using 2-level paging since host
1004         * CR4 is unchanged on VMRUN.
1005         */
1006        if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
1007                npt_enabled = false;
1008
1009        if (!boot_cpu_has(X86_FEATURE_NPT))
1010                npt_enabled = false;
1011
1012        /* Force VM NPT level equal to the host's max NPT level */
1013        kvm_configure_mmu(npt_enabled, get_max_npt_level(),
1014                          get_max_npt_level(), PG_LEVEL_1G);
1015        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
1016
1017        /* Note, SEV setup consumes npt_enabled. */
1018        sev_hardware_setup();
1019
1020        svm_hv_hardware_setup();
1021
1022        svm_adjust_mmio_mask();
1023
1024        for_each_possible_cpu(cpu) {
1025                r = svm_cpu_init(cpu);
1026                if (r)
1027                        goto err;
1028        }
1029
1030        if (nrips) {
1031                if (!boot_cpu_has(X86_FEATURE_NRIPS))
1032                        nrips = false;
1033        }
1034
1035        enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
1036
1037        if (enable_apicv) {
1038                pr_info("AVIC enabled\n");
1039
1040                amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1041        }
1042
1043        if (vls) {
1044                if (!npt_enabled ||
1045                    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
1046                    !IS_ENABLED(CONFIG_X86_64)) {
1047                        vls = false;
1048                } else {
1049                        pr_info("Virtual VMLOAD VMSAVE supported\n");
1050                }
1051        }
1052
1053        if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
1054                svm_gp_erratum_intercept = false;
1055
1056        if (vgif) {
1057                if (!boot_cpu_has(X86_FEATURE_VGIF))
1058                        vgif = false;
1059                else
1060                        pr_info("Virtual GIF supported\n");
1061        }
1062
1063        svm_set_cpu_caps();
1064
1065        /*
1066         * It seems that on AMD processors the PTE's accessed bit is
1067         * set by the CPU hardware before the NPF vmexit.
1068         * This is not expected behaviour and our tests fail because
1069         * of it.
1070         * A workaround here is to disable support for
1071         * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
1072         * In this case userspace can query the KVM_CAP_SMALLER_MAXPHYADDR
1073         * extension to learn whether there is support and decide how to
1074         * handle it.
1075         * If future AMD CPU models change the behaviour described above,
1076         * this variable can be changed accordingly.
1077         */
1078        allow_smaller_maxphyaddr = !npt_enabled;
1079
1080        return 0;
1081
1082err:
1083        svm_hardware_teardown();
1084        return r;
1085}
1086
1087static void init_seg(struct vmcb_seg *seg)
1088{
1089        seg->selector = 0;
1090        seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1091                      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1092        seg->limit = 0xffff;
1093        seg->base = 0;
1094}
1095
1096static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1097{
1098        seg->selector = 0;
1099        seg->attrib = SVM_SELECTOR_P_MASK | type;
1100        seg->limit = 0xffff;
1101        seg->base = 0;
1102}
1103
1104static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1105{
1106        struct vcpu_svm *svm = to_svm(vcpu);
1107
1108        return svm->nested.ctl.tsc_offset;
1109}
1110
1111static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1112{
1113        return kvm_default_tsc_scaling_ratio;
1114}
1115
1116static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1117{
1118        struct vcpu_svm *svm = to_svm(vcpu);
1119
1120        svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1121        svm->vmcb->control.tsc_offset = offset;
1122        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1123}
1124
1125static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1126{
1127        wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
1128}
1129
1130/* Evaluate instruction intercepts that depend on guest CPUID features. */
1131static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1132                                              struct vcpu_svm *svm)
1133{
1134        /*
1135         * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1136         * roots, or if INVPCID is disabled in the guest to inject #UD.
1137         */
1138        if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1139                if (!npt_enabled ||
1140                    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1141                        svm_set_intercept(svm, INTERCEPT_INVPCID);
1142                else
1143                        svm_clr_intercept(svm, INTERCEPT_INVPCID);
1144        }
1145
1146        if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1147                if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1148                        svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1149                else
1150                        svm_set_intercept(svm, INTERCEPT_RDTSCP);
1151        }
1152}
1153
1154static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1155{
1156        struct vcpu_svm *svm = to_svm(vcpu);
1157
1158        if (guest_cpuid_is_intel(vcpu)) {
1159                /*
1160                 * We must intercept SYSENTER_EIP and SYSENTER_ESP
1161                 * accesses because the processor only stores 32 bits.
1162                 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1163                 */
1164                svm_set_intercept(svm, INTERCEPT_VMLOAD);
1165                svm_set_intercept(svm, INTERCEPT_VMSAVE);
1166                svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1167
1168                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1169                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1170        } else {
1171                /*
1172                 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1173                 * in VMCB and clear intercepts to avoid #VMEXIT.
1174                 */
1175                if (vls) {
1176                        svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1177                        svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1178                        svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1179                }
1180                /* No need to intercept these MSRs */
1181                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1182                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1183        }
1184}
1185
1186static void init_vmcb(struct kvm_vcpu *vcpu)
1187{
1188        struct vcpu_svm *svm = to_svm(vcpu);
1189        struct vmcb_control_area *control = &svm->vmcb->control;
1190        struct vmcb_save_area *save = &svm->vmcb->save;
1191
1192        svm_set_intercept(svm, INTERCEPT_CR0_READ);
1193        svm_set_intercept(svm, INTERCEPT_CR3_READ);
1194        svm_set_intercept(svm, INTERCEPT_CR4_READ);
1195        svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1196        svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1197        svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1198        if (!kvm_vcpu_apicv_active(vcpu))
1199                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1200
1201        set_dr_intercepts(svm);
1202
1203        set_exception_intercept(svm, PF_VECTOR);
1204        set_exception_intercept(svm, UD_VECTOR);
1205        set_exception_intercept(svm, MC_VECTOR);
1206        set_exception_intercept(svm, AC_VECTOR);
1207        set_exception_intercept(svm, DB_VECTOR);
1208        /*
1209         * Guest access to VMware backdoor ports could legitimately
1210         * trigger #GP because of TSS I/O permission bitmap.
1211         * We intercept those #GP and allow access to them anyway
1212         * as VMware does.
1213         */
1214        if (enable_vmware_backdoor)
1215                set_exception_intercept(svm, GP_VECTOR);
1216
1217        svm_set_intercept(svm, INTERCEPT_INTR);
1218        svm_set_intercept(svm, INTERCEPT_NMI);
1219
1220        if (intercept_smi)
1221                svm_set_intercept(svm, INTERCEPT_SMI);
1222
1223        svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1224        svm_set_intercept(svm, INTERCEPT_RDPMC);
1225        svm_set_intercept(svm, INTERCEPT_CPUID);
1226        svm_set_intercept(svm, INTERCEPT_INVD);
1227        svm_set_intercept(svm, INTERCEPT_INVLPG);
1228        svm_set_intercept(svm, INTERCEPT_INVLPGA);
1229        svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1230        svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1231        svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1232        svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1233        svm_set_intercept(svm, INTERCEPT_VMRUN);
1234        svm_set_intercept(svm, INTERCEPT_VMMCALL);
1235        svm_set_intercept(svm, INTERCEPT_VMLOAD);
1236        svm_set_intercept(svm, INTERCEPT_VMSAVE);
1237        svm_set_intercept(svm, INTERCEPT_STGI);
1238        svm_set_intercept(svm, INTERCEPT_CLGI);
1239        svm_set_intercept(svm, INTERCEPT_SKINIT);
1240        svm_set_intercept(svm, INTERCEPT_WBINVD);
1241        svm_set_intercept(svm, INTERCEPT_XSETBV);
1242        svm_set_intercept(svm, INTERCEPT_RDPRU);
1243        svm_set_intercept(svm, INTERCEPT_RSM);
1244
1245        if (!kvm_mwait_in_guest(vcpu->kvm)) {
1246                svm_set_intercept(svm, INTERCEPT_MONITOR);
1247                svm_set_intercept(svm, INTERCEPT_MWAIT);
1248        }
1249
1250        if (!kvm_hlt_in_guest(vcpu->kvm))
1251                svm_set_intercept(svm, INTERCEPT_HLT);
1252
1253        control->iopm_base_pa = __sme_set(iopm_base);
1254        control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1255        control->int_ctl = V_INTR_MASKING_MASK;
1256
1257        init_seg(&save->es);
1258        init_seg(&save->ss);
1259        init_seg(&save->ds);
1260        init_seg(&save->fs);
1261        init_seg(&save->gs);
1262
1263        save->cs.selector = 0xf000;
1264        save->cs.base = 0xffff0000;
1265        /* Executable/Readable Code Segment */
1266        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1267                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1268        save->cs.limit = 0xffff;
1269
1270        save->gdtr.base = 0;
1271        save->gdtr.limit = 0xffff;
1272        save->idtr.base = 0;
1273        save->idtr.limit = 0xffff;
1274
1275        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1276        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1277
1278        if (npt_enabled) {
1279                /* Setup VMCB for Nested Paging */
1280                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1281                svm_clr_intercept(svm, INTERCEPT_INVLPG);
1282                clr_exception_intercept(svm, PF_VECTOR);
1283                svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1284                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1285                save->g_pat = vcpu->arch.pat;
1286                save->cr3 = 0;
1287        }
1288        svm->current_vmcb->asid_generation = 0;
1289        svm->asid = 0;
1290
1291        svm->nested.vmcb12_gpa = INVALID_GPA;
1292        svm->nested.last_vmcb12_gpa = INVALID_GPA;
1293
1294        if (!kvm_pause_in_guest(vcpu->kvm)) {
1295                control->pause_filter_count = pause_filter_count;
1296                if (pause_filter_thresh)
1297                        control->pause_filter_thresh = pause_filter_thresh;
1298                svm_set_intercept(svm, INTERCEPT_PAUSE);
1299        } else {
1300                svm_clr_intercept(svm, INTERCEPT_PAUSE);
1301        }
1302
1303        svm_recalc_instruction_intercepts(vcpu, svm);
1304
1305        /*
1306         * If the host supports V_SPEC_CTRL then disable the interception
1307         * of MSR_IA32_SPEC_CTRL.
1308         */
1309        if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1310                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1311
1312        if (kvm_vcpu_apicv_active(vcpu))
1313                avic_init_vmcb(svm);
1314
1315        if (vgif) {
1316                svm_clr_intercept(svm, INTERCEPT_STGI);
1317                svm_clr_intercept(svm, INTERCEPT_CLGI);
1318                svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1319        }
1320
1321        if (sev_guest(vcpu->kvm)) {
1322                svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
1323                clr_exception_intercept(svm, UD_VECTOR);
1324
1325                if (sev_es_guest(vcpu->kvm)) {
1326                        /* Perform SEV-ES specific VMCB updates */
1327                        sev_es_init_vmcb(svm);
1328                }
1329        }
1330
1331        svm_hv_init_vmcb(svm->vmcb);
1332        init_vmcb_after_set_cpuid(vcpu);
1333
1334        vmcb_mark_all_dirty(svm->vmcb);
1335
1336        enable_gif(svm);
1337
1338}
1339
1340static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1341{
1342        struct vcpu_svm *svm = to_svm(vcpu);
1343
1344        svm->spec_ctrl = 0;
1345        svm->virt_spec_ctrl = 0;
1346
1347        init_vmcb(vcpu);
1348}
1349
1350void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1351{
1352        svm->current_vmcb = target_vmcb;
1353        svm->vmcb = target_vmcb->ptr;
1354}
1355
1356static int svm_create_vcpu(struct kvm_vcpu *vcpu)
1357{
1358        struct vcpu_svm *svm;
1359        struct page *vmcb01_page;
1360        struct page *vmsa_page = NULL;
1361        int err;
1362
1363        BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1364        svm = to_svm(vcpu);
1365
1366        err = -ENOMEM;
1367        vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1368        if (!vmcb01_page)
1369                goto out;
1370
1371        if (sev_es_guest(vcpu->kvm)) {
1372                /*
1373                 * SEV-ES guests require a separate VMSA page used to contain
1374                 * the encrypted register state of the guest.
1375                 */
1376                vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1377                if (!vmsa_page)
1378                        goto error_free_vmcb_page;
1379
1380                /*
1381                 * SEV-ES guests maintain an encrypted version of their FPU
1382                 * state which is restored and saved on VMRUN and VMEXIT.
1383                 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1384                 * do xsave/xrstor on it.
1385                 */
1386                fpstate_set_confidential(&vcpu->arch.guest_fpu);
1387        }
1388
1389        err = avic_init_vcpu(svm);
1390        if (err)
1391                goto error_free_vmsa_page;
1392
1393        /* We initialize this flag to true to make sure that the is_running
1394         * bit is set the first time the vcpu is loaded.
1395         */
1396        if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
1397                svm->avic_is_running = true;
1398
1399        svm->msrpm = svm_vcpu_alloc_msrpm();
1400        if (!svm->msrpm) {
1401                err = -ENOMEM;
1402                goto error_free_vmsa_page;
1403        }
1404
1405        svm->vmcb01.ptr = page_address(vmcb01_page);
1406        svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1407
1408        if (vmsa_page)
1409                svm->sev_es.vmsa = page_address(vmsa_page);
1410
1411        svm->guest_state_loaded = false;
1412
1413        svm_switch_vmcb(svm, &svm->vmcb01);
1414        init_vmcb(vcpu);
1415
1416        svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1417
1418        svm_init_osvw(vcpu);
1419        vcpu->arch.microcode_version = 0x01000065;
1420
1421        if (sev_es_guest(vcpu->kvm))
1422                /* Perform SEV-ES specific VMCB creation updates */
1423                sev_es_create_vcpu(svm);
1424
1425        return 0;
1426
1427error_free_vmsa_page:
1428        if (vmsa_page)
1429                __free_page(vmsa_page);
1430error_free_vmcb_page:
1431        __free_page(vmcb01_page);
1432out:
1433        return err;
1434}
1435
1436static void svm_clear_current_vmcb(struct vmcb *vmcb)
1437{
1438        int i;
1439
1440        for_each_online_cpu(i)
1441                cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1442}
1443
1444static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1445{
1446        struct vcpu_svm *svm = to_svm(vcpu);
1447
1448        /*
1449         * The vmcb page can be recycled, causing a false negative in
1450         * svm_vcpu_load(). So, ensure that no logical CPU has this
1451         * vmcb page recorded as its current vmcb.
1452         */
1453        svm_clear_current_vmcb(svm->vmcb);
1454
1455        svm_free_nested(svm);
1456
1457        sev_free_vcpu(vcpu);
1458
1459        __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1460        __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1461}
1462
1463static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1464{
1465        struct vcpu_svm *svm = to_svm(vcpu);
1466        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
1467
1468        if (sev_es_guest(vcpu->kvm))
1469                sev_es_unmap_ghcb(svm);
1470
1471        if (svm->guest_state_loaded)
1472                return;
1473
1474        /*
1475         * Save additional host state that will be restored on VMEXIT (sev-es)
1476         * or subsequent vmload of host save area.
1477         */
1478        if (sev_es_guest(vcpu->kvm)) {
1479                sev_es_prepare_guest_switch(svm, vcpu->cpu);
1480        } else {
1481                vmsave(__sme_page_pa(sd->save_area));
1482        }
1483
1484        if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1485                u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1486                if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1487                        __this_cpu_write(current_tsc_ratio, tsc_ratio);
1488                        wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1489                }
1490        }
1491
1492        if (likely(tsc_aux_uret_slot >= 0))
1493                kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1494
1495        svm->guest_state_loaded = true;
1496}
1497
1498static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1499{
1500        to_svm(vcpu)->guest_state_loaded = false;
1501}
1502
1503static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1504{
1505        struct vcpu_svm *svm = to_svm(vcpu);
1506        struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1507
1508        if (sd->current_vmcb != svm->vmcb) {
1509                sd->current_vmcb = svm->vmcb;
1510                indirect_branch_prediction_barrier();
1511        }
1512        if (kvm_vcpu_apicv_active(vcpu))
1513                avic_vcpu_load(vcpu, cpu);
1514}
1515
1516static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1517{
1518        if (kvm_vcpu_apicv_active(vcpu))
1519                avic_vcpu_put(vcpu);
1520
1521        svm_prepare_host_switch(vcpu);
1522
1523        ++vcpu->stat.host_state_reload;
1524}
1525
1526static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1527{
1528        struct vcpu_svm *svm = to_svm(vcpu);
1529        unsigned long rflags = svm->vmcb->save.rflags;
1530
1531        if (svm->nmi_singlestep) {
1532                /* Hide our flags if they were not set by the guest */
1533                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1534                        rflags &= ~X86_EFLAGS_TF;
1535                if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1536                        rflags &= ~X86_EFLAGS_RF;
1537        }
1538        return rflags;
1539}
1540
1541static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1542{
1543        if (to_svm(vcpu)->nmi_singlestep)
1544                rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1545
1546       /*
1547        * Any change of EFLAGS.VM is accompanied by a reload of SS
1548        * (caused by either a task switch or an inter-privilege IRET),
1549        * so we do not need to update the CPL here.
1550        */
1551        to_svm(vcpu)->vmcb->save.rflags = rflags;
1552}
1553
1554static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1555{
1556        switch (reg) {
1557        case VCPU_EXREG_PDPTR:
1558                BUG_ON(!npt_enabled);
1559                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1560                break;
1561        default:
1562                KVM_BUG_ON(1, vcpu->kvm);
1563        }
1564}
1565
1566static void svm_set_vintr(struct vcpu_svm *svm)
1567{
1568        struct vmcb_control_area *control;
1569
1570        /*
1571         * The following fields are ignored when AVIC is enabled
1572         */
1573        WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
1574
1575        svm_set_intercept(svm, INTERCEPT_VINTR);
1576
1577        /*
1578         * This is just a dummy VINTR to actually cause a vmexit to happen.
1579         * Actual injection of virtual interrupts happens through EVENTINJ.
1580         */
1581        control = &svm->vmcb->control;
1582        control->int_vector = 0x0;
1583        control->int_ctl &= ~V_INTR_PRIO_MASK;
1584        control->int_ctl |= V_IRQ_MASK |
1585                ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1586        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1587}
1588
1589static void svm_clear_vintr(struct vcpu_svm *svm)
1590{
1591        svm_clr_intercept(svm, INTERCEPT_VINTR);
1592
1593        /* Drop int_ctl fields related to VINTR injection.  */
1594        svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1595        if (is_guest_mode(&svm->vcpu)) {
1596                svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1597
1598                WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1599                        (svm->nested.ctl.int_ctl & V_TPR_MASK));
1600
1601                svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1602                        V_IRQ_INJECTION_BITS_MASK;
1603
1604                svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1605        }
1606
1607        vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1608}
1609
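/*
 * FS, GS, TR and LDTR are part of the state swapped by VMLOAD/VMSAVE and
 * are therefore always tracked in vmcb01; the remaining segments live in
 * whichever VMCB is current (vmcb01, or vmcb02 when running nested).
 */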
1610static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1611{
1612        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1613        struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1614
1615        switch (seg) {
1616        case VCPU_SREG_CS: return &save->cs;
1617        case VCPU_SREG_DS: return &save->ds;
1618        case VCPU_SREG_ES: return &save->es;
1619        case VCPU_SREG_FS: return &save01->fs;
1620        case VCPU_SREG_GS: return &save01->gs;
1621        case VCPU_SREG_SS: return &save->ss;
1622        case VCPU_SREG_TR: return &save01->tr;
1623        case VCPU_SREG_LDTR: return &save01->ldtr;
1624        }
1625        BUG();
1626        return NULL;
1627}
1628
1629static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1630{
1631        struct vmcb_seg *s = svm_seg(vcpu, seg);
1632
1633        return s->base;
1634}
1635
1636static void svm_get_segment(struct kvm_vcpu *vcpu,
1637                            struct kvm_segment *var, int seg)
1638{
1639        struct vmcb_seg *s = svm_seg(vcpu, seg);
1640
1641        var->base = s->base;
1642        var->limit = s->limit;
1643        var->selector = s->selector;
1644        var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1645        var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1646        var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1647        var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1648        var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1649        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1650        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1651
1652        /*
1653         * AMD CPUs circa 2014 track the G bit for all segments except CS.
1654         * However, the SVM spec states that the G bit is not observed by the
1655         * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1656         * So let's synthesize a legal G bit for all segments; this helps
1657         * running KVM nested. It also helps cross-vendor migration, because
1658         * Intel's vmentry has a check on the 'G' bit.
1659         */
1660        var->g = s->limit > 0xfffff;
1661
1662        /*
1663         * AMD's VMCB has no explicit unusable field, so for cross-vendor
1664         * migration purposes treat "not present" segments as unusable.
1665         */
1666        var->unusable = !var->present;
1667
1668        switch (seg) {
1669        case VCPU_SREG_TR:
1670                /*
1671                 * Work around a bug where the busy flag in the tr selector
1672                 * isn't exposed
1673                 */
1674                var->type |= 0x2;
1675                break;
1676        case VCPU_SREG_DS:
1677        case VCPU_SREG_ES:
1678        case VCPU_SREG_FS:
1679        case VCPU_SREG_GS:
1680                /*
1681                 * The accessed bit must always be set in the segment
1682                 * descriptor cache: even if it is cleared in the
1683                 * descriptor itself, the cached bit remains 1. Since
1684                 * Intel checks this on VM-entry, set it here to support
1685                 * cross-vendor migration.
1686                 */
1687                if (!var->unusable)
1688                        var->type |= 0x1;
1689                break;
1690        case VCPU_SREG_SS:
1691                /*
1692                 * On AMD CPUs sometimes the DB bit in the segment
1693                 * descriptor is left as 1, although the whole segment has
1694                 * been made unusable. Clear it here to pass an Intel VMX
1695                 * entry check when cross vendor migrating.
1696                 */
1697                if (var->unusable)
1698                        var->db = 0;
1699                /* This is symmetric with svm_set_segment() */
1700                var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1701                break;
1702        }
1703}
1704
1705static int svm_get_cpl(struct kvm_vcpu *vcpu)
1706{
1707        struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1708
1709        return save->cpl;
1710}
1711
1712static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1713{
1714        struct vcpu_svm *svm = to_svm(vcpu);
1715
1716        dt->size = svm->vmcb->save.idtr.limit;
1717        dt->address = svm->vmcb->save.idtr.base;
1718}
1719
1720static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1721{
1722        struct vcpu_svm *svm = to_svm(vcpu);
1723
1724        svm->vmcb->save.idtr.limit = dt->size;
1725        svm->vmcb->save.idtr.base = dt->address;
1726        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1727}
1728
1729static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1730{
1731        struct vcpu_svm *svm = to_svm(vcpu);
1732
1733        dt->size = svm->vmcb->save.gdtr.limit;
1734        dt->address = svm->vmcb->save.gdtr.base;
1735}
1736
1737static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1738{
1739        struct vcpu_svm *svm = to_svm(vcpu);
1740
1741        svm->vmcb->save.gdtr.limit = dt->size;
1742        svm->vmcb->save.gdtr.base = dt->address;
1743        vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1744}
1745
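/*
 * The CR0 value seen by hardware (hcr0) may differ from the guest's view:
 * without NPT, PG and WP are forced on for shadow paging, and the
 * CD_NW_CLEARED quirk drops CD/NW.  CR0 read/write intercepts are needed
 * only while the two views differ; the selective CR0 write intercept
 * remains set either way.
 */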
1746void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1747{
1748        struct vcpu_svm *svm = to_svm(vcpu);
1749        u64 hcr0 = cr0;
1750
1751#ifdef CONFIG_X86_64
1752        if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1753                if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1754                        vcpu->arch.efer |= EFER_LMA;
1755                        svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1756                }
1757
1758                if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1759                        vcpu->arch.efer &= ~EFER_LMA;
1760                        svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1761                }
1762        }
1763#endif
1764        vcpu->arch.cr0 = cr0;
1765
1766        if (!npt_enabled)
1767                hcr0 |= X86_CR0_PG | X86_CR0_WP;
1768
1769        /*
1770         * Re-enable caching here because the QEMU BIOS does not do
1771         * it; leaving CD/NW set results in a noticeable delay at
1772         * reboot.
1773         */
1774        if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1775                hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1776
1777        svm->vmcb->save.cr0 = hcr0;
1778        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1779
1780        /*
1781         * SEV-ES guests must always keep the CR intercepts cleared. CR
1782         * tracking is done using the CR write traps.
1783         */
1784        if (sev_es_guest(vcpu->kvm))
1785                return;
1786
1787        if (hcr0 == cr0) {
1788                /* Selective CR0 write remains on.  */
1789                svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1790                svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1791        } else {
1792                svm_set_intercept(svm, INTERCEPT_CR0_READ);
1793                svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1794        }
1795}
1796
1797static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1798{
1799        return true;
1800}
1801
1802void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1803{
1804        unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1805        unsigned long old_cr4 = vcpu->arch.cr4;
1806
1807        if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1808                svm_flush_tlb(vcpu);
1809
1810        vcpu->arch.cr4 = cr4;
1811        if (!npt_enabled)
1812                cr4 |= X86_CR4_PAE;
1813        cr4 |= host_cr4_mce;
1814        to_svm(vcpu)->vmcb->save.cr4 = cr4;
1815        vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1816
1817        if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1818                kvm_update_cpuid_runtime(vcpu);
1819}
1820
1821static void svm_set_segment(struct kvm_vcpu *vcpu,
1822                            struct kvm_segment *var, int seg)
1823{
1824        struct vcpu_svm *svm = to_svm(vcpu);
1825        struct vmcb_seg *s = svm_seg(vcpu, seg);
1826
1827        s->base = var->base;
1828        s->limit = var->limit;
1829        s->selector = var->selector;
1830        s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1831        s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1832        s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1833        s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1834        s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1835        s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1836        s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1837        s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1838
1839        /*
1840         * This is always accurate, except if SYSRET returned to a segment
1841         * with SS.DPL != 3.  Intel does not have this quirk, and always
1842         * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1843         * would entail passing the CPL to userspace and back.
1844         */
1845        if (seg == VCPU_SREG_SS)
1846                /* This is symmetric with svm_get_segment() */
1847                svm->vmcb->save.cpl = (var->dpl & 3);
1848
1849        vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1850}
1851
1852static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1853{
1854        struct vcpu_svm *svm = to_svm(vcpu);
1855
1856        clr_exception_intercept(svm, BP_VECTOR);
1857
1858        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1859                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1860                        set_exception_intercept(svm, BP_VECTOR);
1861        }
1862}
1863
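/*
 * Hand out the next ASID from the per-CPU range.  When the range is
 * exhausted, bump the generation, restart at min_asid and ask the CPU to
 * flush TLB entries for all ASIDs on the next VMRUN.
 */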
1864static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1865{
1866        if (sd->next_asid > sd->max_asid) {
1867                ++sd->asid_generation;
1868                sd->next_asid = sd->min_asid;
1869                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1870                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1871        }
1872
1873        svm->current_vmcb->asid_generation = sd->asid_generation;
1874        svm->asid = sd->next_asid++;
1875}
1876
1877static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1878{
1879        struct vmcb *vmcb = svm->vmcb;
1880
1881        if (svm->vcpu.arch.guest_state_protected)
1882                return;
1883
1884        if (unlikely(value != vmcb->save.dr6)) {
1885                vmcb->save.dr6 = value;
1886                vmcb_mark_dirty(vmcb, VMCB_DR);
1887        }
1888}
1889
1890static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1891{
1892        struct vcpu_svm *svm = to_svm(vcpu);
1893
1894        if (vcpu->arch.guest_state_protected)
1895                return;
1896
1897        get_debugreg(vcpu->arch.db[0], 0);
1898        get_debugreg(vcpu->arch.db[1], 1);
1899        get_debugreg(vcpu->arch.db[2], 2);
1900        get_debugreg(vcpu->arch.db[3], 3);
1901        /*
1902         * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1903         * because db_interception might need it.  We can do it before vmentry.
1904         */
1905        vcpu->arch.dr6 = svm->vmcb->save.dr6;
1906        vcpu->arch.dr7 = svm->vmcb->save.dr7;
1907        vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1908        set_dr_intercepts(svm);
1909}
1910
1911static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1912{
1913        struct vcpu_svm *svm = to_svm(vcpu);
1914
1915        if (vcpu->arch.guest_state_protected)
1916                return;
1917
1918        svm->vmcb->save.dr7 = value;
1919        vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1920}
1921
1922static int pf_interception(struct kvm_vcpu *vcpu)
1923{
1924        struct vcpu_svm *svm = to_svm(vcpu);
1925
1926        u64 fault_address = svm->vmcb->control.exit_info_2;
1927        u64 error_code = svm->vmcb->control.exit_info_1;
1928
1929        return kvm_handle_page_fault(vcpu, error_code, fault_address,
1930                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1931                        svm->vmcb->control.insn_bytes : NULL,
1932                        svm->vmcb->control.insn_len);
1933}
1934
1935static int npf_interception(struct kvm_vcpu *vcpu)
1936{
1937        struct vcpu_svm *svm = to_svm(vcpu);
1938
1939        u64 fault_address = svm->vmcb->control.exit_info_2;
1940        u64 error_code = svm->vmcb->control.exit_info_1;
1941
1942        trace_kvm_page_fault(fault_address, error_code);
1943        return kvm_mmu_page_fault(vcpu, fault_address, error_code,
1944                        static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1945                        svm->vmcb->control.insn_bytes : NULL,
1946                        svm->vmcb->control.insn_len);
1947}
1948
1949static int db_interception(struct kvm_vcpu *vcpu)
1950{
1951        struct kvm_run *kvm_run = vcpu->run;
1952        struct vcpu_svm *svm = to_svm(vcpu);
1953
1954        if (!(vcpu->guest_debug &
1955              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1956                !svm->nmi_singlestep) {
1957                u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
1958                kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
1959                return 1;
1960        }
1961
1962        if (svm->nmi_singlestep) {
1963                disable_nmi_singlestep(svm);
1964                /* Make sure we check for pending NMIs upon entry */
1965                kvm_make_request(KVM_REQ_EVENT, vcpu);
1966        }
1967
1968        if (vcpu->guest_debug &
1969            (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1970                kvm_run->exit_reason = KVM_EXIT_DEBUG;
1971                kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
1972                kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
1973                kvm_run->debug.arch.pc =
1974                        svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1975                kvm_run->debug.arch.exception = DB_VECTOR;
1976                return 0;
1977        }
1978
1979        return 1;
1980}
1981
1982static int bp_interception(struct kvm_vcpu *vcpu)
1983{
1984        struct vcpu_svm *svm = to_svm(vcpu);
1985        struct kvm_run *kvm_run = vcpu->run;
1986
1987        kvm_run->exit_reason = KVM_EXIT_DEBUG;
1988        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1989        kvm_run->debug.arch.exception = BP_VECTOR;
1990        return 0;
1991}
1992
1993static int ud_interception(struct kvm_vcpu *vcpu)
1994{
1995        return handle_ud(vcpu);
1996}
1997
1998static int ac_interception(struct kvm_vcpu *vcpu)
1999{
2000        kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2001        return 1;
2002}
2003
2004static bool is_erratum_383(void)
2005{
2006        int err, i;
2007        u64 value;
2008
2009        if (!erratum_383_found)
2010                return false;
2011
2012        value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2013        if (err)
2014                return false;
2015
2016        /* Bit 62 may or may not be set for this mce */
2017        value &= ~(1ULL << 62);
2018
2019        if (value != 0xb600000000010015ULL)
2020                return false;
2021
2022        /* Clear MCi_STATUS registers */
2023        for (i = 0; i < 6; ++i)
2024                native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2025
2026        value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2027        if (!err) {
2028                u32 low, high;
2029
2030                value &= ~(1ULL << 2);
2031                low    = lower_32_bits(value);
2032                high   = upper_32_bits(value);
2033
2034                native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2035        }
2036
2037        /* Flush tlb to evict multi-match entries */
2038        __flush_tlb_all();
2039
2040        return true;
2041}
2042
2043static void svm_handle_mce(struct kvm_vcpu *vcpu)
2044{
2045        if (is_erratum_383()) {
2046                /*
2047                 * Erratum 383 triggered. Guest state is corrupt so kill the
2048                 * guest.
2049                 */
2050                pr_err("KVM: Guest triggered AMD Erratum 383\n");
2051
2052                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2053
2054                return;
2055        }
2056
2057        /*
2058         * On an #MC intercept the MCE handler is not called automatically in
2059         * the host. So do it by hand here.
2060         */
2061        kvm_machine_check();
2062}
2063
2064static int mc_interception(struct kvm_vcpu *vcpu)
2065{
2066        return 1;
2067}
2068
2069static int shutdown_interception(struct kvm_vcpu *vcpu)
2070{
2071        struct kvm_run *kvm_run = vcpu->run;
2072        struct vcpu_svm *svm = to_svm(vcpu);
2073
2074        /*
2075         * The VM save area has already been encrypted so it
2076         * cannot be reinitialized - just terminate.
2077         */
2078        if (sev_es_guest(vcpu->kvm))
2079                return -EINVAL;
2080
2081        /*
2082         * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2083         * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2084         * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2085         * userspace.  At the platform level, INIT is acceptable behavior as
2086         * there exist bare metal platforms that automatically INIT the CPU
2087         * in response to shutdown.
2088         */
2089        clear_page(svm->vmcb);
2090        kvm_vcpu_reset(vcpu, true);
2091
2092        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2093        return 0;
2094}
2095
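/*
 * EXITINFO1 for an IOIO exit encodes the direction, string flag, operand
 * size and port number.  String I/O is handed to the emulator (or to the
 * SEV-ES string I/O helper); everything else takes the fast PIO path with
 * the next RIP supplied by EXITINFO2.
 */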
2096static int io_interception(struct kvm_vcpu *vcpu)
2097{
2098        struct vcpu_svm *svm = to_svm(vcpu);
2099        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2100        int size, in, string;
2101        unsigned port;
2102
2103        ++vcpu->stat.io_exits;
2104        string = (io_info & SVM_IOIO_STR_MASK) != 0;
2105        in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2106        port = io_info >> 16;
2107        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2108
2109        if (string) {
2110                if (sev_es_guest(vcpu->kvm))
2111                        return sev_es_string_io(svm, size, port, in);
2112                else
2113                        return kvm_emulate_instruction(vcpu, 0);
2114        }
2115
2116        svm->next_rip = svm->vmcb->control.exit_info_2;
2117
2118        return kvm_fast_pio(vcpu, size, port, in);
2119}
2120
2121static int nmi_interception(struct kvm_vcpu *vcpu)
2122{
2123        return 1;
2124}
2125
2126static int smi_interception(struct kvm_vcpu *vcpu)
2127{
2128        return 1;
2129}
2130
2131static int intr_interception(struct kvm_vcpu *vcpu)
2132{
2133        ++vcpu->stat.irq_exits;
2134        return 1;
2135}
2136
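/*
 * Common handler for VMLOAD and VMSAVE: map the guest page addressed by
 * RAX and copy the VMLOAD/VMSAVE-managed state between it and the current
 * VMCB, in the direction selected by @vmload.  VMLOAD also clears the
 * shadowed high halves of the Intel-style SYSENTER MSRs.
 */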
2137static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2138{
2139        struct vcpu_svm *svm = to_svm(vcpu);
2140        struct vmcb *vmcb12;
2141        struct kvm_host_map map;
2142        int ret;
2143
2144        if (nested_svm_check_permissions(vcpu))
2145                return 1;
2146
2147        ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2148        if (ret) {
2149                if (ret == -EINVAL)
2150                        kvm_inject_gp(vcpu, 0);
2151                return 1;
2152        }
2153
2154        vmcb12 = map.hva;
2155
2156        ret = kvm_skip_emulated_instruction(vcpu);
2157
2158        if (vmload) {
2159                svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2160                svm->sysenter_eip_hi = 0;
2161                svm->sysenter_esp_hi = 0;
2162        } else {
2163                svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2164        }
2165
2166        kvm_vcpu_unmap(vcpu, &map, true);
2167
2168        return ret;
2169}
2170
2171static int vmload_interception(struct kvm_vcpu *vcpu)
2172{
2173        return vmload_vmsave_interception(vcpu, true);
2174}
2175
2176static int vmsave_interception(struct kvm_vcpu *vcpu)
2177{
2178        return vmload_vmsave_interception(vcpu, false);
2179}
2180
2181static int vmrun_interception(struct kvm_vcpu *vcpu)
2182{
2183        if (nested_svm_check_permissions(vcpu))
2184                return 1;
2185
2186        return nested_svm_vmrun(vcpu);
2187}
2188
2189enum {
2190        NONE_SVM_INSTR,
2191        SVM_INSTR_VMRUN,
2192        SVM_INSTR_VMLOAD,
2193        SVM_INSTR_VMSAVE,
2194};
2195
2196/* Return NONE_SVM_INSTR if not an SVM instruction, otherwise the decode result */
2197static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2198{
2199        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2200
2201        if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2202                return NONE_SVM_INSTR;
2203
2204        switch (ctxt->modrm) {
2205        case 0xd8: /* VMRUN */
2206                return SVM_INSTR_VMRUN;
2207        case 0xda: /* VMLOAD */
2208                return SVM_INSTR_VMLOAD;
2209        case 0xdb: /* VMSAVE */
2210                return SVM_INSTR_VMSAVE;
2211        default:
2212                break;
2213        }
2214
2215        return NONE_SVM_INSTR;
2216}
2217
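/*
 * Emulate a VMRUN/VMLOAD/VMSAVE that faulted with #GP: if L2 executed the
 * instruction, reflect a synthesized vmexit to L1; otherwise invoke the
 * regular intercept handler for the instruction.
 */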
2218static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2219{
2220        const int guest_mode_exit_codes[] = {
2221                [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2222                [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2223                [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2224        };
2225        int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2226                [SVM_INSTR_VMRUN] = vmrun_interception,
2227                [SVM_INSTR_VMLOAD] = vmload_interception,
2228                [SVM_INSTR_VMSAVE] = vmsave_interception,
2229        };
2230        struct vcpu_svm *svm = to_svm(vcpu);
2231        int ret;
2232
2233        if (is_guest_mode(vcpu)) {
2234                /* Returns '1' or -errno on failure, '0' on success. */
2235                ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2236                if (ret)
2237                        return ret;
2238                return 1;
2239        }
2240        return svm_instr_handlers[opcode](vcpu);
2241}
2242
2243/*
2244 * #GP handling code. Note that #GP can be triggered under the following two
2245 * cases:
2246 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2247 *      some AMD CPUs when the EAX operand of these instructions lies in a
2248 *      reserved memory region (e.g. SMM memory on the host).
2249 *   2) VMware backdoor
2250 */
2251static int gp_interception(struct kvm_vcpu *vcpu)
2252{
2253        struct vcpu_svm *svm = to_svm(vcpu);
2254        u32 error_code = svm->vmcb->control.exit_info_1;
2255        int opcode;
2256
2257        /* Both #GP cases have zero error_code */
2258        if (error_code)
2259                goto reinject;
2260
2261        /* All SVM instructions expect page aligned RAX */
2262        if (svm->vmcb->save.rax & ~PAGE_MASK)
2263                goto reinject;
2264
2265        /* Decode the instruction for usage later */
2266        if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2267                goto reinject;
2268
2269        opcode = svm_instr_opcode(vcpu);
2270
2271        if (opcode == NONE_SVM_INSTR) {
2272                if (!enable_vmware_backdoor)
2273                        goto reinject;
2274
2275                /*
2276                 * VMware backdoor emulation on #GP interception only handles
2277                 * IN{S}, OUT{S}, and RDPMC.
2278                 */
2279                if (!is_guest_mode(vcpu))
2280                        return kvm_emulate_instruction(vcpu,
2281                                EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2282        } else
2283                return emulate_svm_instr(vcpu, opcode);
2284
2285reinject:
2286        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2287        return 1;
2288}
2289
2290void svm_set_gif(struct vcpu_svm *svm, bool value)
2291{
2292        if (value) {
2293                /*
2294                 * If VGIF is enabled, the STGI intercept is only added to
2295                 * detect the opening of the SMI/NMI window; remove it now.
2296                 * Likewise, clear the VINTR intercept, we will set it
2297                 * again while processing KVM_REQ_EVENT if needed.
2298                 */
2299                if (vgif_enabled(svm))
2300                        svm_clr_intercept(svm, INTERCEPT_STGI);
2301                if (svm_is_intercept(svm, INTERCEPT_VINTR))
2302                        svm_clear_vintr(svm);
2303
2304                enable_gif(svm);
2305                if (svm->vcpu.arch.smi_pending ||
2306                    svm->vcpu.arch.nmi_pending ||
2307                    kvm_cpu_has_injectable_intr(&svm->vcpu))
2308                        kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2309        } else {
2310                disable_gif(svm);
2311
2312                /*
2313                 * After a CLGI no interrupts should come.  But if vGIF is
2314                 * in use, we still rely on the VINTR intercept (rather than
2315                 * STGI) to detect an open interrupt window.
2316                 */
2317                if (!vgif_enabled(svm))
2318                        svm_clear_vintr(svm);
2319        }
2320}
2321
2322static int stgi_interception(struct kvm_vcpu *vcpu)
2323{
2324        int ret;
2325
2326        if (nested_svm_check_permissions(vcpu))
2327                return 1;
2328
2329        ret = kvm_skip_emulated_instruction(vcpu);
2330        svm_set_gif(to_svm(vcpu), true);
2331        return ret;
2332}
2333
2334static int clgi_interception(struct kvm_vcpu *vcpu)
2335{
2336        int ret;
2337
2338        if (nested_svm_check_permissions(vcpu))
2339                return 1;
2340
2341        ret = kvm_skip_emulated_instruction(vcpu);
2342        svm_set_gif(to_svm(vcpu), false);
2343        return ret;
2344}
2345
2346static int invlpga_interception(struct kvm_vcpu *vcpu)
2347{
2348        gva_t gva = kvm_rax_read(vcpu);
2349        u32 asid = kvm_rcx_read(vcpu);
2350
2351        /* FIXME: Handle an address size prefix. */
2352        if (!is_long_mode(vcpu))
2353                gva = (u32)gva;
2354
2355        trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2356
2357        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2358        kvm_mmu_invlpg(vcpu, gva);
2359
2360        return kvm_skip_emulated_instruction(vcpu);
2361}
2362
2363static int skinit_interception(struct kvm_vcpu *vcpu)
2364{
2365        trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2366
2367        kvm_queue_exception(vcpu, UD_VECTOR);
2368        return 1;
2369}
2370
2371static int task_switch_interception(struct kvm_vcpu *vcpu)
2372{
2373        struct vcpu_svm *svm = to_svm(vcpu);
2374        u16 tss_selector;
2375        int reason;
2376        int int_type = svm->vmcb->control.exit_int_info &
2377                SVM_EXITINTINFO_TYPE_MASK;
2378        int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2379        uint32_t type =
2380                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2381        uint32_t idt_v =
2382                svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2383        bool has_error_code = false;
2384        u32 error_code = 0;
2385
2386        tss_selector = (u16)svm->vmcb->control.exit_info_1;
2387
2388        if (svm->vmcb->control.exit_info_2 &
2389            (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2390                reason = TASK_SWITCH_IRET;
2391        else if (svm->vmcb->control.exit_info_2 &
2392                 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2393                reason = TASK_SWITCH_JMP;
2394        else if (idt_v)
2395                reason = TASK_SWITCH_GATE;
2396        else
2397                reason = TASK_SWITCH_CALL;
2398
2399        if (reason == TASK_SWITCH_GATE) {
2400                switch (type) {
2401                case SVM_EXITINTINFO_TYPE_NMI:
2402                        vcpu->arch.nmi_injected = false;
2403                        break;
2404                case SVM_EXITINTINFO_TYPE_EXEPT:
2405                        if (svm->vmcb->control.exit_info_2 &
2406                            (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2407                                has_error_code = true;
2408                                error_code =
2409                                        (u32)svm->vmcb->control.exit_info_2;
2410                        }
2411                        kvm_clear_exception_queue(vcpu);
2412                        break;
2413                case SVM_EXITINTINFO_TYPE_INTR:
2414                        kvm_clear_interrupt_queue(vcpu);
2415                        break;
2416                default:
2417                        break;
2418                }
2419        }
2420
2421        if (reason != TASK_SWITCH_GATE ||
2422            int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2423            (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2424             (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2425                if (!skip_emulated_instruction(vcpu))
2426                        return 0;
2427        }
2428
2429        if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2430                int_vec = -1;
2431
2432        return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2433                               has_error_code, error_code);
2434}
2435
2436static int iret_interception(struct kvm_vcpu *vcpu)
2437{
2438        struct vcpu_svm *svm = to_svm(vcpu);
2439
2440        ++vcpu->stat.nmi_window_exits;
2441        vcpu->arch.hflags |= HF_IRET_MASK;
2442        if (!sev_es_guest(vcpu->kvm)) {
2443                svm_clr_intercept(svm, INTERCEPT_IRET);
2444                svm->nmi_iret_rip = kvm_rip_read(vcpu);
2445        }
2446        kvm_make_request(KVM_REQ_EVENT, vcpu);
2447        return 1;
2448}
2449
2450static int invlpg_interception(struct kvm_vcpu *vcpu)
2451{
2452        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2453                return kvm_emulate_instruction(vcpu, 0);
2454
2455        kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2456        return kvm_skip_emulated_instruction(vcpu);
2457}
2458
2459static int emulate_on_interception(struct kvm_vcpu *vcpu)
2460{
2461        return kvm_emulate_instruction(vcpu, 0);
2462}
2463
2464static int rsm_interception(struct kvm_vcpu *vcpu)
2465{
2466        return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2467}
2468
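/*
 * For a nested guest whose L1 enabled the selective CR0 write intercept,
 * compare the old and new CR0 values with the SVM_CR0_SELECTIVE_MASK bits
 * ignored and, if anything else changed, reflect an SVM_EXIT_CR0_SEL_WRITE
 * vmexit to L1 instead of handling the write in L0.
 */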
2469static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2470                                            unsigned long val)
2471{
2472        struct vcpu_svm *svm = to_svm(vcpu);
2473        unsigned long cr0 = vcpu->arch.cr0;
2474        bool ret = false;
2475
2476        if (!is_guest_mode(vcpu) ||
2477            (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2478                return false;
2479
2480        cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2481        val &= ~SVM_CR0_SELECTIVE_MASK;
2482
2483        if (cr0 ^ val) {
2484                svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2485                ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2486        }
2487
2488        return ret;
2489}
2490
2491#define CR_VALID (1ULL << 63)
2492
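/*
 * With decode assists, EXITINFO1 bit 63 (CR_VALID) indicates that the MOV
 * CR operand information is valid: the GPR number is in the low bits and
 * the exit code tells which CR was accessed, with writes offset from reads
 * by 16.  Without that information the instruction is fully emulated.
 */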
2493static int cr_interception(struct kvm_vcpu *vcpu)
2494{
2495        struct vcpu_svm *svm = to_svm(vcpu);
2496        int reg, cr;
2497        unsigned long val;
2498        int err;
2499
2500        if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2501                return emulate_on_interception(vcpu);
2502
2503        if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2504                return emulate_on_interception(vcpu);
2505
2506        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2507        if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2508                cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2509        else
2510                cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2511
2512        err = 0;
2513        if (cr >= 16) { /* mov to cr */
2514                cr -= 16;
2515                val = kvm_register_read(vcpu, reg);
2516                trace_kvm_cr_write(cr, val);
2517                switch (cr) {
2518                case 0:
2519                        if (!check_selective_cr0_intercepted(vcpu, val))
2520                                err = kvm_set_cr0(vcpu, val);
2521                        else
2522                                return 1;
2523
2524                        break;
2525                case 3:
2526                        err = kvm_set_cr3(vcpu, val);
2527                        break;
2528                case 4:
2529                        err = kvm_set_cr4(vcpu, val);
2530                        break;
2531                case 8:
2532                        err = kvm_set_cr8(vcpu, val);
2533                        break;
2534                default:
2535                        WARN(1, "unhandled write to CR%d", cr);
2536                        kvm_queue_exception(vcpu, UD_VECTOR);
2537                        return 1;
2538                }
2539        } else { /* mov from cr */
2540                switch (cr) {
2541                case 0:
2542                        val = kvm_read_cr0(vcpu);
2543                        break;
2544                case 2:
2545                        val = vcpu->arch.cr2;
2546                        break;
2547                case 3:
2548                        val = kvm_read_cr3(vcpu);
2549                        break;
2550                case 4:
2551                        val = kvm_read_cr4(vcpu);
2552                        break;
2553                case 8:
2554                        val = kvm_get_cr8(vcpu);
2555                        break;
2556                default:
2557                        WARN(1, "unhandled read from CR%d", cr);
2558                        kvm_queue_exception(vcpu, UD_VECTOR);
2559                        return 1;
2560                }
2561                kvm_register_write(vcpu, reg, val);
2562                trace_kvm_cr_read(cr, val);
2563        }
2564        return kvm_complete_insn_gp(vcpu, err);
2565}
2566
2567static int cr_trap(struct kvm_vcpu *vcpu)
2568{
2569        struct vcpu_svm *svm = to_svm(vcpu);
2570        unsigned long old_value, new_value;
2571        unsigned int cr;
2572        int ret = 0;
2573
2574        new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2575
2576        cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2577        switch (cr) {
2578        case 0:
2579                old_value = kvm_read_cr0(vcpu);
2580                svm_set_cr0(vcpu, new_value);
2581
2582                kvm_post_set_cr0(vcpu, old_value, new_value);
2583                break;
2584        case 4:
2585                old_value = kvm_read_cr4(vcpu);
2586                svm_set_cr4(vcpu, new_value);
2587
2588                kvm_post_set_cr4(vcpu, old_value, new_value);
2589                break;
2590        case 8:
2591                ret = kvm_set_cr8(vcpu, new_value);
2592                break;
2593        default:
2594                WARN(1, "unhandled CR%d write trap", cr);
2595                kvm_queue_exception(vcpu, UD_VECTOR);
2596                return 1;
2597        }
2598
2599        return kvm_complete_insn_gp(vcpu, ret);
2600}
2601
2602static int dr_interception(struct kvm_vcpu *vcpu)
2603{
2604        struct vcpu_svm *svm = to_svm(vcpu);
2605        int reg, dr;
2606        unsigned long val;
2607        int err = 0;
2608
2609        if (vcpu->guest_debug == 0) {
2610                /*
2611                 * No more DR vmexits; force a reload of the debug registers
2612                 * and reenter on this instruction.  The next vmexit will
2613                 * retrieve the full state of the debug registers.
2614                 */
2615                clr_dr_intercepts(svm);
2616                vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2617                return 1;
2618        }
2619
2620        if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2621                return emulate_on_interception(vcpu);
2622
2623        reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2624        dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2625        if (dr >= 16) { /* mov to DRn  */
2626                dr -= 16;
2627                val = kvm_register_read(vcpu, reg);
2628                err = kvm_set_dr(vcpu, dr, val);
2629        } else {
2630                kvm_get_dr(vcpu, dr, &val);
2631                kvm_register_write(vcpu, reg, val);
2632        }
2633
2634        return kvm_complete_insn_gp(vcpu, err);
2635}
2636
2637static int cr8_write_interception(struct kvm_vcpu *vcpu)
2638{
2639        int r;
2640
2641        u8 cr8_prev = kvm_get_cr8(vcpu);
2642        /* instruction emulation calls kvm_set_cr8() */
2643        r = cr_interception(vcpu);
2644        if (lapic_in_kernel(vcpu))
2645                return r;
2646        if (cr8_prev <= kvm_get_cr8(vcpu))
2647                return r;
2648        vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2649        return 0;
2650}
2651
2652static int efer_trap(struct kvm_vcpu *vcpu)
2653{
2654        struct msr_data msr_info;
2655        int ret;
2656
2657        /*
2658         * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2659         * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2660         * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2661         * the guest doesn't have X86_FEATURE_SVM.
2662         */
2663        msr_info.host_initiated = false;
2664        msr_info.index = MSR_EFER;
2665        msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2666        ret = kvm_set_msr_common(vcpu, &msr_info);
2667
2668        return kvm_complete_insn_gp(vcpu, ret);
2669}
2670
2671static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2672{
2673        msr->data = 0;
2674
2675        switch (msr->index) {
2676        case MSR_F10H_DECFG:
2677                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2678                        msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2679                break;
2680        case MSR_IA32_PERF_CAPABILITIES:
2681                return 0;
2682        default:
2683                return KVM_MSR_RET_INVALID;
2684        }
2685
2686        return 0;
2687}
2688
2689static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2690{
2691        struct vcpu_svm *svm = to_svm(vcpu);
2692
2693        switch (msr_info->index) {
2694        case MSR_STAR:
2695                msr_info->data = svm->vmcb01.ptr->save.star;
2696                break;
2697#ifdef CONFIG_X86_64
2698        case MSR_LSTAR:
2699                msr_info->data = svm->vmcb01.ptr->save.lstar;
2700                break;
2701        case MSR_CSTAR:
2702                msr_info->data = svm->vmcb01.ptr->save.cstar;
2703                break;
2704        case MSR_KERNEL_GS_BASE:
2705                msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2706                break;
2707        case MSR_SYSCALL_MASK:
2708                msr_info->data = svm->vmcb01.ptr->save.sfmask;
2709                break;
2710#endif
2711        case MSR_IA32_SYSENTER_CS:
2712                msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2713                break;
2714        case MSR_IA32_SYSENTER_EIP:
2715                msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2716                if (guest_cpuid_is_intel(vcpu))
2717                        msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2718                break;
2719        case MSR_IA32_SYSENTER_ESP:
2720                msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2721                if (guest_cpuid_is_intel(vcpu))
2722                        msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2723                break;
2724        case MSR_TSC_AUX:
2725                msr_info->data = svm->tsc_aux;
2726                break;
2727        /*
2728         * Nobody will change the following 5 values in the VMCB so we can
2729         * safely return them on rdmsr. They will always be 0 until LBRV is
2730         * implemented.
2731         */
2732        case MSR_IA32_DEBUGCTLMSR:
2733                msr_info->data = svm->vmcb->save.dbgctl;
2734                break;
2735        case MSR_IA32_LASTBRANCHFROMIP:
2736                msr_info->data = svm->vmcb->save.br_from;
2737                break;
2738        case MSR_IA32_LASTBRANCHTOIP:
2739                msr_info->data = svm->vmcb->save.br_to;
2740                break;
2741        case MSR_IA32_LASTINTFROMIP:
2742                msr_info->data = svm->vmcb->save.last_excp_from;
2743                break;
2744        case MSR_IA32_LASTINTTOIP:
2745                msr_info->data = svm->vmcb->save.last_excp_to;
2746                break;
2747        case MSR_VM_HSAVE_PA:
2748                msr_info->data = svm->nested.hsave_msr;
2749                break;
2750        case MSR_VM_CR:
2751                msr_info->data = svm->nested.vm_cr_msr;
2752                break;
2753        case MSR_IA32_SPEC_CTRL:
2754                if (!msr_info->host_initiated &&
2755                    !guest_has_spec_ctrl_msr(vcpu))
2756                        return 1;
2757
2758                if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2759                        msr_info->data = svm->vmcb->save.spec_ctrl;
2760                else
2761                        msr_info->data = svm->spec_ctrl;
2762                break;
2763        case MSR_AMD64_VIRT_SPEC_CTRL:
2764                if (!msr_info->host_initiated &&
2765                    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2766                        return 1;
2767
2768                msr_info->data = svm->virt_spec_ctrl;
2769                break;
2770        case MSR_F15H_IC_CFG: {
2771
2772                int family, model;
2773
2774                family = guest_cpuid_family(vcpu);
2775                model  = guest_cpuid_model(vcpu);
2776
2777                if (family < 0 || model < 0)
2778                        return kvm_get_msr_common(vcpu, msr_info);
2779
2780                msr_info->data = 0;
2781
2782                if (family == 0x15 &&
2783                    (model >= 0x2 && model < 0x20))
2784                        msr_info->data = 0x1E;
2785                }
2786                break;
2787        case MSR_F10H_DECFG:
2788                msr_info->data = svm->msr_decfg;
2789                break;
2790        default:
2791                return kvm_get_msr_common(vcpu, msr_info);
2792        }
2793        return 0;
2794}
2795
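/*
 * For SEV-ES guests a failed emulated RDMSR/WRMSR cannot simply inject
 * #GP; the error is instead reported back to the guest through the GHCB
 * exit-info fields as a #GP event.
 */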
2796static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2797{
2798        struct vcpu_svm *svm = to_svm(vcpu);
2799        if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2800                return kvm_complete_insn_gp(vcpu, err);
2801
2802        ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2803        ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2804                                X86_TRAP_GP |
2805                                SVM_EVTINJ_TYPE_EXEPT |
2806                                SVM_EVTINJ_VALID);
2807        return 1;
2808}
2809
2810static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2811{
2812        struct vcpu_svm *svm = to_svm(vcpu);
2813        int svm_dis, chg_mask;
2814
2815        if (data & ~SVM_VM_CR_VALID_MASK)
2816                return 1;
2817
2818        chg_mask = SVM_VM_CR_VALID_MASK;
2819
2820        if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2821                chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2822
2823        svm->nested.vm_cr_msr &= ~chg_mask;
2824        svm->nested.vm_cr_msr |= (data & chg_mask);
2825
2826        svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2827
2828        /* check for svm_disable while efer.svme is set */
2829        if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2830                return 1;
2831
2832        return 0;
2833}
2834
2835static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2836{
2837        struct vcpu_svm *svm = to_svm(vcpu);
2838        int r;
2839
2840        u32 ecx = msr->index;
2841        u64 data = msr->data;
2842        switch (ecx) {
2843        case MSR_IA32_CR_PAT:
2844                if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2845                        return 1;
2846                vcpu->arch.pat = data;
2847                svm->vmcb01.ptr->save.g_pat = data;
2848                if (is_guest_mode(vcpu))
2849                        nested_vmcb02_compute_g_pat(svm);
2850                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2851                break;
2852        case MSR_IA32_SPEC_CTRL:
2853                if (!msr->host_initiated &&
2854                    !guest_has_spec_ctrl_msr(vcpu))
2855                        return 1;
2856
2857                if (kvm_spec_ctrl_test_value(data))
2858                        return 1;
2859
2860                if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2861                        svm->vmcb->save.spec_ctrl = data;
2862                else
2863                        svm->spec_ctrl = data;
2864                if (!data)
2865                        break;
2866
2867                /*
2868                 * For non-nested:
2869                 * When it's written (to non-zero) for the first time, pass
2870                 * it through.
2871                 *
2872                 * For nested:
2873                 * The handling of the MSR bitmap for L2 guests is done in
2874                 * nested_svm_vmrun_msrpm.
2875                 * We update the L1 MSR bit as well since it will end up
2876                 * touching the MSR anyway now.
2877                 */
2878                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2879                break;
2880        case MSR_IA32_PRED_CMD:
2881                if (!msr->host_initiated &&
2882                    !guest_has_pred_cmd_msr(vcpu))
2883                        return 1;
2884
2885                if (data & ~PRED_CMD_IBPB)
2886                        return 1;
2887                if (!boot_cpu_has(X86_FEATURE_IBPB))
2888                        return 1;
2889                if (!data)
2890                        break;
2891
2892                wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2893                set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
2894                break;
2895        case MSR_AMD64_VIRT_SPEC_CTRL:
2896                if (!msr->host_initiated &&
2897                    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2898                        return 1;
2899
2900                if (data & ~SPEC_CTRL_SSBD)
2901                        return 1;
2902
2903                svm->virt_spec_ctrl = data;
2904                break;
2905        case MSR_STAR:
2906                svm->vmcb01.ptr->save.star = data;
2907                break;
2908#ifdef CONFIG_X86_64
2909        case MSR_LSTAR:
2910                svm->vmcb01.ptr->save.lstar = data;
2911                break;
2912        case MSR_CSTAR:
2913                svm->vmcb01.ptr->save.cstar = data;
2914                break;
2915        case MSR_KERNEL_GS_BASE:
2916                svm->vmcb01.ptr->save.kernel_gs_base = data;
2917                break;
2918        case MSR_SYSCALL_MASK:
2919                svm->vmcb01.ptr->save.sfmask = data;
2920                break;
2921#endif
2922        case MSR_IA32_SYSENTER_CS:
2923                svm->vmcb01.ptr->save.sysenter_cs = data;
2924                break;
2925        case MSR_IA32_SYSENTER_EIP:
2926                svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2927                /*
2928                 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
2929                 * when we spoof an Intel vendor ID (for cross-vendor migration).
2930                 * In this case we use this intercept to track the high
2931                 * 32-bit halves of these MSRs to support Intel's
2932                 * implementation of SYSENTER/SYSEXIT.
2933                 */
2934                svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2935                break;
2936        case MSR_IA32_SYSENTER_ESP:
2937                svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2938                svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2939                break;
2940        case MSR_TSC_AUX:
2941                /*
2942                 * TSC_AUX is usually changed only during boot and never read
2943                 * directly.  Intercept TSC_AUX instead of exposing it to the
2944                 * guest via direct_access_msrs, and switch it via user return.
2945                 */
2946                preempt_disable();
2947                r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
2948                preempt_enable();
2949                if (r)
2950                        return 1;
2951
2952                svm->tsc_aux = data;
2953                break;
2954        case MSR_IA32_DEBUGCTLMSR:
2955                if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2956                        vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2957                                    __func__, data);
2958                        break;
2959                }
2960                if (data & DEBUGCTL_RESERVED_BITS)
2961                        return 1;
2962
2963                svm->vmcb->save.dbgctl = data;
2964                vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
2965                if (data & (1ULL<<0))
2966                        svm_enable_lbrv(vcpu);
2967                else
2968                        svm_disable_lbrv(vcpu);
2969                break;
2970        case MSR_VM_HSAVE_PA:
2971                /*
2972                 * Old kernels did not validate the value written to
2973                 * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
2974                 * value to allow live migrating buggy or malicious guests
2975                 * originating from those kernels.
2976                 */
2977                if (!msr->host_initiated && !page_address_valid(vcpu, data))
2978                        return 1;
2979
2980                svm->nested.hsave_msr = data & PAGE_MASK;
2981                break;
2982        case MSR_VM_CR:
2983                return svm_set_vm_cr(vcpu, data);
2984        case MSR_VM_IGNNE:
2985                vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2986                break;
2987        case MSR_F10H_DECFG: {
2988                struct kvm_msr_entry msr_entry;
2989
2990                msr_entry.index = msr->index;
2991                if (svm_get_msr_feature(&msr_entry))
2992                        return 1;
2993
2994                /* Check the supported bits */
2995                if (data & ~msr_entry.data)
2996                        return 1;
2997
2998                /* Don't allow the guest to change a bit, #GP */
2999                if (!msr->host_initiated && (data ^ msr_entry.data))
3000                        return 1;
3001
3002                svm->msr_decfg = data;
3003                break;
3004        }
3005        default:
3006                return kvm_set_msr_common(vcpu, msr);
3007        }
3008        return 0;
3009}
3010
3011static int msr_interception(struct kvm_vcpu *vcpu)
3012{
3013        if (to_svm(vcpu)->vmcb->control.exit_info_1)
3014                return kvm_emulate_wrmsr(vcpu);
3015        else
3016                return kvm_emulate_rdmsr(vcpu);
3017}
3018
3019static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3020{
3021        kvm_make_request(KVM_REQ_EVENT, vcpu);
3022        svm_clear_vintr(to_svm(vcpu));
3023
3024        /*
3025         * For AVIC, the only reason to end up here is ExtINTs.
3026         * In this case AVIC was temporarily disabled for
3027         * requesting the IRQ window and we have to re-enable it.
3028         */
3029        kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
3030
3031        ++vcpu->stat.irq_window_exits;
3032        return 1;
3033}
3034
3035static int pause_interception(struct kvm_vcpu *vcpu)
3036{
3037        bool in_kernel;
3038
3039        /*
3040         * CPL is not made available for an SEV-ES guest, therefore
3041         * vcpu->arch.preempted_in_kernel can never be true.  Just
3042         * set in_kernel to false as well.
3043         */
3044        in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3045
3046        if (!kvm_pause_in_guest(vcpu->kvm))
3047                grow_ple_window(vcpu);
3048
3049        kvm_vcpu_on_spin(vcpu, in_kernel);
3050        return kvm_skip_emulated_instruction(vcpu);
3051}
3052
3053static int invpcid_interception(struct kvm_vcpu *vcpu)
3054{
3055        struct vcpu_svm *svm = to_svm(vcpu);
3056        unsigned long type;
3057        gva_t gva;
3058
3059        if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3060                kvm_queue_exception(vcpu, UD_VECTOR);
3061                return 1;
3062        }
3063
3064        /*
3065         * For an INVPCID intercept:
3066         * EXITINFO1 provides the linear address of the memory operand.
3067         * EXITINFO2 provides the contents of the register operand.
3068         */
3069        type = svm->vmcb->control.exit_info_2;
3070        gva = svm->vmcb->control.exit_info_1;
3071
3072        if (type > 3) {
3073                kvm_inject_gp(vcpu, 0);
3074                return 1;
3075        }
3076
3077        return kvm_handle_invpcid(vcpu, type, gva);
3078}
3079
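/* Dispatch table mapping SVM exit codes to their intercept handlers. */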
3080static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3081        [SVM_EXIT_READ_CR0]                     = cr_interception,
3082        [SVM_EXIT_READ_CR3]                     = cr_interception,
3083        [SVM_EXIT_READ_CR4]                     = cr_interception,
3084        [SVM_EXIT_READ_CR8]                     = cr_interception,
3085        [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3086        [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3087        [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3088        [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3089        [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3090        [SVM_EXIT_READ_DR0]                     = dr_interception,
3091        [SVM_EXIT_READ_DR1]                     = dr_interception,
3092        [SVM_EXIT_READ_DR2]                     = dr_interception,
3093        [SVM_EXIT_READ_DR3]                     = dr_interception,
3094        [SVM_EXIT_READ_DR4]                     = dr_interception,
3095        [SVM_EXIT_READ_DR5]                     = dr_interception,
3096        [SVM_EXIT_READ_DR6]                     = dr_interception,
3097        [SVM_EXIT_READ_DR7]                     = dr_interception,
3098        [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3099        [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3100        [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3101        [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3102        [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3103        [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3104        [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3105        [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3106        [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3107        [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3108        [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3109        [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3110        [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3111        [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3112        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
3113        [SVM_EXIT_INTR]                         = intr_interception,
3114        [SVM_EXIT_NMI]                          = nmi_interception,
3115        [SVM_EXIT_SMI]                          = smi_interception,
3116        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3117        [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
3118        [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
3119        [SVM_EXIT_IRET]                         = iret_interception,
3120        [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3121        [SVM_EXIT_PAUSE]                        = pause_interception,
3122        [SVM_EXIT_HLT]                          = kvm_emulate_halt,
3123        [SVM_EXIT_INVLPG]                       = invlpg_interception,
3124        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3125        [SVM_EXIT_IOIO]                         = io_interception,
3126        [SVM_EXIT_MSR]                          = msr_interception,
3127        [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3128        [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3129        [SVM_EXIT_VMRUN]                        = vmrun_interception,
3130        [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
3131        [SVM_EXIT_VMLOAD]                       = vmload_interception,
3132        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3133        [SVM_EXIT_STGI]                         = stgi_interception,
3134        [SVM_EXIT_CLGI]                         = clgi_interception,
3135        [SVM_EXIT_SKINIT]                       = skinit_interception,
3136        [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
3137        [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3138        [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
3139        [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
3140        [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
3141        [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
3142        [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
3143        [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
3144        [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
3145        [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
3146        [SVM_EXIT_INVPCID]                      = invpcid_interception,
3147        [SVM_EXIT_NPF]                          = npf_interception,
3148        [SVM_EXIT_RSM]                          = rsm_interception,
3149        [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3150        [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3151        [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
3152};
3153
3154static void dump_vmcb(struct kvm_vcpu *vcpu)
3155{
3156        struct vcpu_svm *svm = to_svm(vcpu);
3157        struct vmcb_control_area *control = &svm->vmcb->control;
3158        struct vmcb_save_area *save = &svm->vmcb->save;
3159        struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3160
3161        if (!dump_invalid_vmcb) {
3162                pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3163                return;
3164        }
3165
3166        pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3167               svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3168        pr_err("VMCB Control Area:\n");
3169        pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3170        pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3171        pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3172        pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3173        pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3174        pr_err("%-20s%08x %08x\n", "intercepts:",
3175               control->intercepts[INTERCEPT_WORD3],
3176               control->intercepts[INTERCEPT_WORD4]);
3177        pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3178        pr_err("%-20s%d\n", "pause filter threshold:",
3179               control->pause_filter_thresh);
3180        pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3181        pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3182        pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3183        pr_err("%-20s%d\n", "asid:", control->asid);
3184        pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3185        pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3186        pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3187        pr_err("%-20s%08x\n", "int_state:", control->int_state);
3188        pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3189        pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3190        pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3191        pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3192        pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3193        pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3194        pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3195        pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3196        pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3197        pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3198        pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3199        pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3200        pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3201        pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3202        pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3203        pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3204        pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3205        pr_err("VMCB State Save Area:\n");
3206        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3207               "es:",
3208               save->es.selector, save->es.attrib,
3209               save->es.limit, save->es.base);
3210        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3211               "cs:",
3212               save->cs.selector, save->cs.attrib,
3213               save->cs.limit, save->cs.base);
3214        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3215               "ss:",
3216               save->ss.selector, save->ss.attrib,
3217               save->ss.limit, save->ss.base);
3218        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3219               "ds:",
3220               save->ds.selector, save->ds.attrib,
3221               save->ds.limit, save->ds.base);
3222        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3223               "fs:",
3224               save01->fs.selector, save01->fs.attrib,
3225               save01->fs.limit, save01->fs.base);
3226        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3227               "gs:",
3228               save01->gs.selector, save01->gs.attrib,
3229               save01->gs.limit, save01->gs.base);
3230        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3231               "gdtr:",
3232               save->gdtr.selector, save->gdtr.attrib,
3233               save->gdtr.limit, save->gdtr.base);
3234        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3235               "ldtr:",
3236               save01->ldtr.selector, save01->ldtr.attrib,
3237               save01->ldtr.limit, save01->ldtr.base);
3238        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3239               "idtr:",
3240               save->idtr.selector, save->idtr.attrib,
3241               save->idtr.limit, save->idtr.base);
3242        pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3243               "tr:",
3244               save01->tr.selector, save01->tr.attrib,
3245               save01->tr.limit, save01->tr.base);
3246        pr_err("cpl:            %d                efer:         %016llx\n",
3247                save->cpl, save->efer);
3248        pr_err("%-15s %016llx %-13s %016llx\n",
3249               "cr0:", save->cr0, "cr2:", save->cr2);
3250        pr_err("%-15s %016llx %-13s %016llx\n",
3251               "cr3:", save->cr3, "cr4:", save->cr4);
3252        pr_err("%-15s %016llx %-13s %016llx\n",
3253               "dr6:", save->dr6, "dr7:", save->dr7);
3254        pr_err("%-15s %016llx %-13s %016llx\n",
3255               "rip:", save->rip, "rflags:", save->rflags);
3256        pr_err("%-15s %016llx %-13s %016llx\n",
3257               "rsp:", save->rsp, "rax:", save->rax);
3258        pr_err("%-15s %016llx %-13s %016llx\n",
3259               "star:", save01->star, "lstar:", save01->lstar);
3260        pr_err("%-15s %016llx %-13s %016llx\n",
3261               "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3262        pr_err("%-15s %016llx %-13s %016llx\n",
3263               "kernel_gs_base:", save01->kernel_gs_base,
3264               "sysenter_cs:", save01->sysenter_cs);
3265        pr_err("%-15s %016llx %-13s %016llx\n",
3266               "sysenter_esp:", save01->sysenter_esp,
3267               "sysenter_eip:", save01->sysenter_eip);
3268        pr_err("%-15s %016llx %-13s %016llx\n",
3269               "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3270        pr_err("%-15s %016llx %-13s %016llx\n",
3271               "br_from:", save->br_from, "br_to:", save->br_to);
3272        pr_err("%-15s %016llx %-13s %016llx\n",
3273               "excp_from:", save->last_excp_from,
3274               "excp_to:", save->last_excp_to);
3275}
3276
3277static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
3278{
3279        return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3280                svm_exit_handlers[exit_code]);
3281}
3282
3283static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3284{
3285        vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3286        dump_vmcb(vcpu);
3287        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3288        vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3289        vcpu->run->internal.ndata = 2;
3290        vcpu->run->internal.data[0] = exit_code;
3291        vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3292        return 0;
3293}
3294
3295int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3296{
3297        if (!svm_check_exit_valid(vcpu, exit_code))
3298                return svm_handle_invalid_exit(vcpu, exit_code);
3299
3300#ifdef CONFIG_RETPOLINE
3301        if (exit_code == SVM_EXIT_MSR)
3302                return msr_interception(vcpu);
3303        else if (exit_code == SVM_EXIT_VINTR)
3304                return interrupt_window_interception(vcpu);
3305        else if (exit_code == SVM_EXIT_INTR)
3306                return intr_interception(vcpu);
3307        else if (exit_code == SVM_EXIT_HLT)
3308                return kvm_emulate_halt(vcpu);
3309        else if (exit_code == SVM_EXIT_NPF)
3310                return npf_interception(vcpu);
3311#endif
3312        return svm_exit_handlers[exit_code](vcpu);
3313}
3314
3315static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
3316                              u32 *intr_info, u32 *error_code)
3317{
3318        struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3319
3320        *info1 = control->exit_info_1;
3321        *info2 = control->exit_info_2;
3322        *intr_info = control->exit_int_info;
3323        if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3324            (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3325                *error_code = control->exit_int_info_err;
3326        else
3327                *error_code = 0;
3328}
3329
3330static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3331{
3332        struct vcpu_svm *svm = to_svm(vcpu);
3333        struct kvm_run *kvm_run = vcpu->run;
3334        u32 exit_code = svm->vmcb->control.exit_code;
3335
3336        trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
3337
3338        /* SEV-ES guests must use the CR write traps to track CR registers. */
3339        if (!sev_es_guest(vcpu->kvm)) {
3340                if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3341                        vcpu->arch.cr0 = svm->vmcb->save.cr0;
3342                if (npt_enabled)
3343                        vcpu->arch.cr3 = svm->vmcb->save.cr3;
3344        }
3345
3346        if (is_guest_mode(vcpu)) {
3347                int vmexit;
3348
3349                trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
3350
3351                vmexit = nested_svm_exit_special(svm);
3352
3353                if (vmexit == NESTED_EXIT_CONTINUE)
3354                        vmexit = nested_svm_exit_handled(svm);
3355
3356                if (vmexit == NESTED_EXIT_DONE)
3357                        return 1;
3358        }
3359
3360        if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3361                kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3362                kvm_run->fail_entry.hardware_entry_failure_reason
3363                        = svm->vmcb->control.exit_code;
3364                kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3365                dump_vmcb(vcpu);
3366                return 0;
3367        }
3368
3369        if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3370            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3371            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3372            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3373                printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3374                       "exit_code 0x%x\n",
3375                       __func__, svm->vmcb->control.exit_int_info,
3376                       exit_code);
3377
3378        if (exit_fastpath != EXIT_FASTPATH_NONE)
3379                return 1;
3380
3381        return svm_invoke_exit_handler(vcpu, exit_code);
3382}
3383
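/*
 * After a VMEXIT the host TR must be reloaded, but LTR requires an
 * "available" TSS descriptor and the one in the GDT is still marked
 * busy; flip the type back to available before calling load_TR_desc().
 */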
3384static void reload_tss(struct kvm_vcpu *vcpu)
3385{
3386        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3387
3388        sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3389        load_TR_desc();
3390}
3391
3392static void pre_svm_run(struct kvm_vcpu *vcpu)
3393{
3394        struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3395        struct vcpu_svm *svm = to_svm(vcpu);
3396
3397        /*
3398         * If the previous vmrun of the vmcb occurred on a different physical
3399         * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3400         * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3401         */
3402        if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3403                svm->current_vmcb->asid_generation = 0;
3404                vmcb_mark_all_dirty(svm->vmcb);
3405                svm->current_vmcb->cpu = vcpu->cpu;
3406        }
3407
3408        if (sev_guest(vcpu->kvm))
3409                return pre_sev_run(svm, vcpu->cpu);
3410
3411        /* FIXME: handle wraparound of asid_generation */
3412        if (svm->current_vmcb->asid_generation != sd->asid_generation)
3413                new_asid(svm, sd);
3414}
3415
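/*
 * Inject an NMI via EVENTINJ and set HF_NMI_MASK so that no further NMI
 * is injected until the guest executes IRET; the IRET intercept is what
 * lets KVM notice the unmask (not used for SEV-ES guests).
 */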
3416static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3417{
3418        struct vcpu_svm *svm = to_svm(vcpu);
3419
3420        svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3421        vcpu->arch.hflags |= HF_NMI_MASK;
3422        if (!sev_es_guest(vcpu->kvm))
3423                svm_set_intercept(svm, INTERCEPT_IRET);
3424        ++vcpu->stat.nmi_injections;
3425}
3426
3427static void svm_set_irq(struct kvm_vcpu *vcpu)
3428{
3429        struct vcpu_svm *svm = to_svm(vcpu);
3430
3431        BUG_ON(!(gif_set(svm)));
3432
3433        trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3434        ++vcpu->stat.irq_injections;
3435
3436        svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3437                SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3438}
3439
3440static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3441{
3442        struct vcpu_svm *svm = to_svm(vcpu);
3443
3444        /*
3445         * SEV-ES guests must always keep the CR intercepts cleared. CR
3446         * tracking is done using the CR write traps.
3447         */
3448        if (sev_es_guest(vcpu->kvm))
3449                return;
3450
3451        if (nested_svm_virtualize_tpr(vcpu))
3452                return;
3453
3454        svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3455
3456        if (irr == -1)
3457                return;
3458
3459        if (tpr >= irr)
3460                svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3461}
3462
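/*
 * NMIs are blocked if GIF is clear, if the vCPU is inside an interrupt
 * shadow, or while a previously injected NMI is still being handled
 * (HF_NMI_MASK).  If L2 is running and L1 intercepts NMIs, the NMI is
 * not blocked here because it will simply cause a nested VM exit.
 */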
3463bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3464{
3465        struct vcpu_svm *svm = to_svm(vcpu);
3466        struct vmcb *vmcb = svm->vmcb;
3467        bool ret;
3468
3469        if (!gif_set(svm))
3470                return true;
3471
3472        if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3473                return false;
3474
3475        ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3476              (vcpu->arch.hflags & HF_NMI_MASK);
3477
3478        return ret;
3479}
3480
3481static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3482{
3483        struct vcpu_svm *svm = to_svm(vcpu);
3484        if (svm->nested.nested_run_pending)
3485                return -EBUSY;
3486
3487        /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3488        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3489                return -EBUSY;
3490
3491        return !svm_nmi_blocked(vcpu);
3492}
3493
3494static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3495{
3496        return !!(vcpu->arch.hflags & HF_NMI_MASK);
3497}
3498
3499static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3500{
3501        struct vcpu_svm *svm = to_svm(vcpu);
3502
3503        if (masked) {
3504                vcpu->arch.hflags |= HF_NMI_MASK;
3505                if (!sev_es_guest(vcpu->kvm))
3506                        svm_set_intercept(svm, INTERCEPT_IRET);
3507        } else {
3508                vcpu->arch.hflags &= ~HF_NMI_MASK;
3509                if (!sev_es_guest(vcpu->kvm))
3510                        svm_clr_intercept(svm, INTERCEPT_IRET);
3511        }
3512}
3513
3514bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3515{
3516        struct vcpu_svm *svm = to_svm(vcpu);
3517        struct vmcb *vmcb = svm->vmcb;
3518
3519        if (!gif_set(svm))
3520                return true;
3521
3522        if (sev_es_guest(vcpu->kvm)) {
3523                /*
3524                 * SEV-ES guests do not expose RFLAGS. Use the VMCB interrupt mask
3525                 * bit to determine the state of the IF flag.
3526                 */
3527                if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
3528                        return true;
3529        } else if (is_guest_mode(vcpu)) {
3530                /* As long as interrupts are being delivered...  */
3531                if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3532                    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3533                    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3534                        return true;
3535
3536                /* ... vmexits aren't blocked by the interrupt shadow  */
3537                if (nested_exit_on_intr(svm))
3538                        return false;
3539        } else {
3540                if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3541                        return true;
3542        }
3543
3544        return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3545}
3546
3547static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3548{
3549        struct vcpu_svm *svm = to_svm(vcpu);
3550        if (svm->nested.nested_run_pending)
3551                return -EBUSY;
3552
3553        /*
3554         * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3555         * e.g. if the IRQ arrived asynchronously after checking nested events.
3556         */
3557        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3558                return -EBUSY;
3559
3560        return !svm_interrupt_blocked(vcpu);
3561}
3562
3563static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3564{
3565        struct vcpu_svm *svm = to_svm(vcpu);
3566
3567        /*
3568         * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3569         * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3570         * get that intercept, this function will be called again though and
3571         * we'll get the vintr intercept. However, if the vGIF feature is
3572         * enabled, the STGI interception will not occur. Enable the irq
3573         * window under the assumption that the hardware will set the GIF.
3574         */
3575        if (vgif_enabled(svm) || gif_set(svm)) {
3576                /*
3577                 * The IRQ window is not needed when AVIC is enabled,
3578                 * unless there is a pending ExtINT, which cannot be injected
3579                 * via AVIC. In that case, temporarily disable AVIC and
3580                 * fall back to injecting the IRQ via V_IRQ.
3581                 */
3582                kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
3583                svm_set_vintr(svm);
3584        }
3585}
3586
3587static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3588{
3589        struct vcpu_svm *svm = to_svm(vcpu);
3590
3591        if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
3592                return; /* IRET will cause a vm exit */
3593
3594        if (!gif_set(svm)) {
3595                if (vgif_enabled(svm))
3596                        svm_set_intercept(svm, INTERCEPT_STGI);
3597                return; /* STGI will cause a vm exit */
3598        }
3599
3600        /*
3601         * Something prevents the NMI from being injected. Single step over the
3602         * possible problem (IRET or exception injection or interrupt shadow).
3603         */
3604        svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3605        svm->nmi_singlestep = true;
3606        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3607}
3608
3609static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3610{
3611        return 0;
3612}
3613
3614static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
3615{
3616        return 0;
3617}
3618
3619void svm_flush_tlb(struct kvm_vcpu *vcpu)
3620{
3621        struct vcpu_svm *svm = to_svm(vcpu);
3622
3623        /*
3624         * Flush only the current ASID even if the TLB flush was invoked via
3625         * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3626         * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3627         * unconditionally does a TLB flush on both nested VM-Enter and nested
3628         * VM-Exit (via kvm_mmu_reset_context()).
3629         */
3630        if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3631                svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3632        else
3633                svm->current_vmcb->asid_generation--;
3634}
3635
3636static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3637{
3638        struct vcpu_svm *svm = to_svm(vcpu);
3639
3640        invlpga(gva, svm->vmcb->control.asid);
3641}
3642
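/*
 * With CR8 writes not intercepted, the guest may have updated its TPR
 * directly; propagate V_TPR from the VMCB back into the emulated local
 * APIC after the VM exit.
 */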
3643static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3644{
3645        struct vcpu_svm *svm = to_svm(vcpu);
3646
3647        if (nested_svm_virtualize_tpr(vcpu))
3648                return;
3649
3650        if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3651                int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3652                kvm_set_cr8(vcpu, cr8);
3653        }
3654}
3655
3656static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3657{
3658        struct vcpu_svm *svm = to_svm(vcpu);
3659        u64 cr8;
3660
3661        if (nested_svm_virtualize_tpr(vcpu) ||
3662            kvm_vcpu_apicv_active(vcpu))
3663                return;
3664
3665        cr8 = kvm_get_cr8(vcpu);
3666        svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3667        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3668}
3669
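/*
 * If an event was being delivered when the VM exit occurred, EXITINTINFO
 * describes it; re-queue the event so it is re-injected on the next
 * VMRUN instead of being lost.  Software exceptions (e.g. an emulated
 * INT3) are re-executed rather than re-injected.
 */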
3670static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3671{
3672        struct vcpu_svm *svm = to_svm(vcpu);
3673        u8 vector;
3674        int type;
3675        u32 exitintinfo = svm->vmcb->control.exit_int_info;
3676        unsigned int3_injected = svm->int3_injected;
3677
3678        svm->int3_injected = 0;
3679
3680        /*
3681         * If we've made progress since setting HF_IRET_MASK, we've
3682         * executed an IRET and can allow NMI injection.
3683         */
3684        if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3685            (sev_es_guest(vcpu->kvm) ||
3686             kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3687                vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3688                kvm_make_request(KVM_REQ_EVENT, vcpu);
3689        }
3690
3691        vcpu->arch.nmi_injected = false;
3692        kvm_clear_exception_queue(vcpu);
3693        kvm_clear_interrupt_queue(vcpu);
3694
3695        if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3696                return;
3697
3698        kvm_make_request(KVM_REQ_EVENT, vcpu);
3699
3700        vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3701        type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3702
3703        switch (type) {
3704        case SVM_EXITINTINFO_TYPE_NMI:
3705                vcpu->arch.nmi_injected = true;
3706                break;
3707        case SVM_EXITINTINFO_TYPE_EXEPT:
3708                /*
3709                 * Never re-inject a #VC exception.
3710                 */
3711                if (vector == X86_TRAP_VC)
3712                        break;
3713
3714                /*
3715                 * In case of software exceptions, do not reinject the vector,
3716                 * but re-execute the instruction instead. Rewind RIP first
3717                 * if we emulated INT3 before.
3718                 */
3719                if (kvm_exception_is_soft(vector)) {
3720                        if (vector == BP_VECTOR && int3_injected &&
3721                            kvm_is_linear_rip(vcpu, svm->int3_rip))
3722                                kvm_rip_write(vcpu,
3723                                              kvm_rip_read(vcpu) - int3_injected);
3724                        break;
3725                }
3726                if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3727                        u32 err = svm->vmcb->control.exit_int_info_err;
3728                        kvm_requeue_exception_e(vcpu, vector, err);
3729
3730                } else
3731                        kvm_requeue_exception(vcpu, vector);
3732                break;
3733        case SVM_EXITINTINFO_TYPE_INTR:
3734                kvm_queue_interrupt(vcpu, vector, false);
3735                break;
3736        default:
3737                break;
3738        }
3739}
3740
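/*
 * Undo a pending event injection, e.g. when entry into the guest is
 * aborted after events have already been queued in EVENTINJ: move the
 * event back into EXITINTINFO so svm_complete_interrupts() re-queues it.
 */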
3741static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3742{
3743        struct vcpu_svm *svm = to_svm(vcpu);
3744        struct vmcb_control_area *control = &svm->vmcb->control;
3745
3746        control->exit_int_info = control->event_inj;
3747        control->exit_int_info_err = control->event_inj_err;
3748        control->event_inj = 0;
3749        svm_complete_interrupts(vcpu);
3750}
3751
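/*
 * Only WRMSR exits (exit_info_1 != 0) are candidates for the irqoff
 * fastpath; handle_fastpath_set_msr_irqoff() decides whether the write
 * (e.g. to the TSC deadline timer) can be completed without a full exit.
 */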
3752static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3753{
3754        if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
3755            to_svm(vcpu)->vmcb->control.exit_info_1)
3756                return handle_fastpath_set_msr_irqoff(vcpu);
3757
3758        return EXIT_FASTPATH_NONE;
3759}
3760
3761static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
3762{
3763        struct vcpu_svm *svm = to_svm(vcpu);
3764        unsigned long vmcb_pa = svm->current_vmcb->pa;
3765
3766        kvm_guest_enter_irqoff();
3767
3768        if (sev_es_guest(vcpu->kvm)) {
3769                __svm_sev_es_vcpu_run(vmcb_pa);
3770        } else {
3771                struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3772
3773                /*
3774                 * Use a single vmcb (vmcb01 because it's always valid) for
3775                 * context switching guest state via VMLOAD/VMSAVE, that way
3776                 * the state doesn't need to be copied between vmcb01 and
3777                 * vmcb02 when switching vmcbs for nested virtualization.
3778                 */
3779                vmload(svm->vmcb01.pa);
3780                __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
3781                vmsave(svm->vmcb01.pa);
3782
3783                vmload(__sme_page_pa(sd->save_area));
3784        }
3785
3786        kvm_guest_exit_irqoff();
3787}
3788
3789static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
3790{
3791        struct vcpu_svm *svm = to_svm(vcpu);
3792
3793        trace_kvm_entry(vcpu);
3794
3795        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3796        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3797        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3798
3799        /*
3800         * Disable singlestep if we're injecting an interrupt/exception.
3801         * We don't want our modified rflags to be pushed on the stack where
3802         * we might not be able to easily reset them if we disabled NMI
3803         * singlestep later.
3804         */
3805        if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3806                /*
3807                 * Event injection happens before external interrupts cause a
3808                 * vmexit and interrupts are disabled here, so smp_send_reschedule
3809                 * is enough to force an immediate vmexit.
3810                 */
3811                disable_nmi_singlestep(svm);
3812                smp_send_reschedule(vcpu->cpu);
3813        }
3814
3815        pre_svm_run(vcpu);
3816
3817        WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
3818
3819        sync_lapic_to_cr8(vcpu);
3820
3821        if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3822                svm->vmcb->control.asid = svm->asid;
3823                vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3824        }
3825        svm->vmcb->save.cr2 = vcpu->arch.cr2;
3826
3827        svm_hv_update_vp_id(svm->vmcb, vcpu);
3828
3829        /*
3830         * Run with all-zero DR6 unless needed, so that we can get the exact cause
3831         * of a #DB.
3832         */
3833        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
3834                svm_set_dr6(svm, vcpu->arch.dr6);
3835        else
3836                svm_set_dr6(svm, DR6_ACTIVE_LOW);
3837
3838        clgi();
3839        kvm_load_guest_xsave_state(vcpu);
3840
3841        kvm_wait_lapic_expire(vcpu);
3842
3843        /*
3844         * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3845         * it's non-zero. Since vmentry is serialising on affected CPUs, there
3846         * is no need to worry about the conditional branch over the wrmsr
3847         * being speculatively taken.
3848         */
3849        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3850                x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
3851
3852        svm_vcpu_enter_exit(vcpu);
3853
3854        /*
3855         * We do not use IBRS in the kernel. If this vCPU has used the
3856         * SPEC_CTRL MSR it may have left it on; save the value and
3857         * turn it off. This is much more efficient than blindly adding
3858         * it to the atomic save/restore list, especially as the former
3859         * (saving guest MSRs on vmexit) doesn't even exist in KVM.
3860         *
3861         * For non-nested case:
3862         * If the L01 MSR bitmap does not intercept the MSR, then we need to
3863         * save it.
3864         *
3865         * For nested case:
3866         * If the L02 MSR bitmap does not intercept the MSR, then we need to
3867         * save it.
3868         */
3869        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
3870            unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
3871                svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
3872
3873        if (!sev_es_guest(vcpu->kvm))
3874                reload_tss(vcpu);
3875
3876        if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3877                x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
3878
3879        if (!sev_es_guest(vcpu->kvm)) {
3880                vcpu->arch.cr2 = svm->vmcb->save.cr2;
3881                vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3882                vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3883                vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3884        }
3885
3886        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3887                kvm_before_interrupt(vcpu);
3888
3889        kvm_load_host_xsave_state(vcpu);
3890        stgi();
3891
3892        /* Any pending NMI will happen here */
3893
3894        if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3895                kvm_after_interrupt(vcpu);
3896
3897        sync_cr8_to_lapic(vcpu);
3898
3899        svm->next_rip = 0;
3900        if (is_guest_mode(vcpu)) {
3901                nested_sync_control_from_vmcb02(svm);
3902
3903                /* Track VMRUNs that have made it past consistency checking */
3904                if (svm->nested.nested_run_pending &&
3905                    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
3906                        ++vcpu->stat.nested_run;
3907
3908                svm->nested.nested_run_pending = 0;
3909        }
3910
3911        svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3912        vmcb_mark_all_clean(svm->vmcb);
3913
3914        /* If the exit was due to a #PF, check for an async PF. */
3915        if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3916                vcpu->arch.apf.host_apf_flags =
3917                        kvm_read_and_reset_apf_flags();
3918
3919        if (npt_enabled)
3920                kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);
3921
3922        /*
3923         * We need to handle MC intercepts here before the vcpu has a chance to
3924         * change the physical cpu
3925         */
3926        if (unlikely(svm->vmcb->control.exit_code ==
3927                     SVM_EXIT_EXCP_BASE + MC_VECTOR))
3928                svm_handle_mce(vcpu);
3929
3930        svm_complete_interrupts(vcpu);
3931
3932        if (is_guest_mode(vcpu))
3933                return EXIT_FASTPATH_NONE;
3934
3935        return svm_exit_handlers_fastpath(vcpu);
3936}
3937
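/*
 * Install the new MMU root: with NPT the root goes into nested_cr3 and
 * save.cr3 holds whatever CR3 the guest wrote; without NPT the shadow
 * root itself (plus the active PCID, if any) is written to save.cr3.
 */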
3938static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3939                             int root_level)
3940{
3941        struct vcpu_svm *svm = to_svm(vcpu);
3942        unsigned long cr3;
3943
3944        if (npt_enabled) {
3945                svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
3946                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
3947
3948                hv_track_root_tdp(vcpu, root_hpa);
3949
3950                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
3951                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3952                        return;
3953                cr3 = vcpu->arch.cr3;
3954        } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3955                cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
3956        } else {
3957                /* PCID in the guest should be impossible with a 32-bit MMU. */
3958                WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
3959                cr3 = root_hpa;
3960        }
3961
3962        svm->vmcb->save.cr3 = cr3;
3963        vmcb_mark_dirty(svm->vmcb, VMCB_CR);
3964}
3965
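/*
 * SVM can be disabled by the BIOS via the SVMDIS bit in the VM_CR MSR;
 * report that here so that module initialization fails cleanly.
 */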
3966static int is_disabled(void)
3967{
3968        u64 vm_cr;
3969
3970        rdmsrl(MSR_VM_CR, vm_cr);
3971        if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
3972                return 1;
3973
3974        return 0;
3975}
3976
3977static void
3978svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3979{
3980        /*
3981         * Patch in the VMMCALL instruction:
3982         */
3983        hypercall[0] = 0x0f;
3984        hypercall[1] = 0x01;
3985        hypercall[2] = 0xd9;
3986}
3987
3988static int __init svm_check_processor_compat(void)
3989{
3990        return 0;
3991}
3992
3993static bool svm_cpu_has_accelerated_tpr(void)
3994{
3995        return false;
3996}
3997
3998/*
3999 * The kvm parameter can be NULL (module initialization, or invocation before
4000 * VM creation). Be sure to check the kvm parameter before using it.
4001 */
4002static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4003{
4004        switch (index) {
4005        case MSR_IA32_MCG_EXT_CTL:
4006        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4007                return false;
4008        case MSR_IA32_SMBASE:
4009                /* SEV-ES guests do not support SMM, so report false */
4010                if (kvm && sev_es_guest(kvm))
4011                        return false;
4012                break;
4013        default:
4014                break;
4015        }
4016
4017        return true;
4018}
4019
4020static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4021{
4022        return 0;
4023}
4024
4025static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4026{
4027        struct vcpu_svm *svm = to_svm(vcpu);
4028        struct kvm_cpuid_entry2 *best;
4029
4030        vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4031                                    boot_cpu_has(X86_FEATURE_XSAVE) &&
4032                                    boot_cpu_has(X86_FEATURE_XSAVES);
4033
4034        /* Update nrips enabled cache */
4035        svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4036                             guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4037
4038        svm_recalc_instruction_intercepts(vcpu, svm);
4039
4040        /* For sev guests, the memory encryption bit is not reserved in CR3.  */
4041        if (sev_guest(vcpu->kvm)) {
4042                best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
4043                if (best)
4044                        vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4045        }
4046
4047        if (kvm_vcpu_apicv_active(vcpu)) {
4048                /*
4049                 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
4050                 * is exposed to the guest, disable AVIC.
4051                 */
4052                if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
4053                        kvm_request_apicv_update(vcpu->kvm, false,
4054                                                 APICV_INHIBIT_REASON_X2APIC);
4055
4056                /*
4057                 * Currently, AVIC does not work with nested virtualization.
4058                 * So, disable AVIC when the SVM CPUID bit is set for the L1 guest.
4059                 */
4060                if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4061                        kvm_request_apicv_update(vcpu->kvm, false,
4062                                                 APICV_INHIBIT_REASON_NESTED);
4063        }
4064        init_vmcb_after_set_cpuid(vcpu);
4065}
4066
4067static bool svm_has_wbinvd_exit(void)
4068{
4069        return true;
4070}
4071
4072#define PRE_EX(exit)  { .exit_code = (exit), \
4073                        .stage = X86_ICPT_PRE_EXCEPT, }
4074#define POST_EX(exit) { .exit_code = (exit), \
4075                        .stage = X86_ICPT_POST_EXCEPT, }
4076#define POST_MEM(exit) { .exit_code = (exit), \
4077                        .stage = X86_ICPT_POST_MEMACCESS, }
4078
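/*
 * Mapping from the x86 emulator's intercept checks to SVM exit codes and
 * the stage at which each check applies.  svm_check_intercept() uses it
 * to decide whether an instruction emulated on L2's behalf should be
 * reflected to L1 as a nested VM exit.
 */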
4079static const struct __x86_intercept {
4080        u32 exit_code;
4081        enum x86_intercept_stage stage;
4082} x86_intercept_map[] = {
4083        [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4084        [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4085        [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4086        [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4087        [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4088        [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4089        [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4090        [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4091        [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4092        [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4093        [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4094        [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4095        [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4096        [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4097        [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4098        [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4099        [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4100        [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4101        [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4102        [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4103        [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4104        [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4105        [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4106        [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4107        [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4108        [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4109        [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4110        [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4111        [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4112        [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4113        [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4114        [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4115        [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4116        [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4117        [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4118        [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4119        [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4120        [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4121        [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4122        [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4123        [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4124        [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4125        [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4126        [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4127        [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4128        [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4129        [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
4130};
4131
4132#undef PRE_EX
4133#undef POST_EX
4134#undef POST_MEM
4135
4136static int svm_check_intercept(struct kvm_vcpu *vcpu,
4137                               struct x86_instruction_info *info,
4138                               enum x86_intercept_stage stage,
4139                               struct x86_exception *exception)
4140{
4141        struct vcpu_svm *svm = to_svm(vcpu);
4142        int vmexit, ret = X86EMUL_CONTINUE;
4143        struct __x86_intercept icpt_info;
4144        struct vmcb *vmcb = svm->vmcb;
4145
4146        if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4147                goto out;
4148
4149        icpt_info = x86_intercept_map[info->intercept];
4150
4151        if (stage != icpt_info.stage)
4152                goto out;
4153
4154        switch (icpt_info.exit_code) {
4155        case SVM_EXIT_READ_CR0:
4156                if (info->intercept == x86_intercept_cr_read)
4157                        icpt_info.exit_code += info->modrm_reg;
4158                break;
4159        case SVM_EXIT_WRITE_CR0: {
4160                unsigned long cr0, val;
4161
4162                if (info->intercept == x86_intercept_cr_write)
4163                        icpt_info.exit_code += info->modrm_reg;
4164
4165                if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4166                    info->intercept == x86_intercept_clts)
4167                        break;
4168
4169                if (!(vmcb_is_intercept(&svm->nested.ctl,
4170                                        INTERCEPT_SELECTIVE_CR0)))
4171                        break;
4172
4173                cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4174                val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4175
4176                if (info->intercept == x86_intercept_lmsw) {
4177                        cr0 &= 0xfUL;
4178                        val &= 0xfUL;
4179                        /* lmsw can't clear PE - catch this here */
4180                        if (cr0 & X86_CR0_PE)
4181                                val |= X86_CR0_PE;
4182                }
4183
4184                if (cr0 ^ val)
4185                        icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4186
4187                break;
4188        }
4189        case SVM_EXIT_READ_DR0:
4190        case SVM_EXIT_WRITE_DR0:
4191                icpt_info.exit_code += info->modrm_reg;
4192                break;
4193        case SVM_EXIT_MSR:
4194                if (info->intercept == x86_intercept_wrmsr)
4195                        vmcb->control.exit_info_1 = 1;
4196                else
4197                        vmcb->control.exit_info_1 = 0;
4198                break;
4199        case SVM_EXIT_PAUSE:
4200                /*
4201                 * The emulator flags this intercept for NOP as well, but
4202                 * PAUSE is REP NOP, so check for the REP prefix here.
4203                 */
4204                if (info->rep_prefix != REPE_PREFIX)
4205                        goto out;
4206                break;
4207        case SVM_EXIT_IOIO: {
4208                u64 exit_info;
4209                u32 bytes;
4210
4211                if (info->intercept == x86_intercept_in ||
4212                    info->intercept == x86_intercept_ins) {
4213                        exit_info = ((info->src_val & 0xffff) << 16) |
4214                                SVM_IOIO_TYPE_MASK;
4215                        bytes = info->dst_bytes;
4216                } else {
4217                        exit_info = (info->dst_val & 0xffff) << 16;
4218                        bytes = info->src_bytes;
4219                }
4220
4221                if (info->intercept == x86_intercept_outs ||
4222                    info->intercept == x86_intercept_ins)
4223                        exit_info |= SVM_IOIO_STR_MASK;
4224
4225                if (info->rep_prefix)
4226                        exit_info |= SVM_IOIO_REP_MASK;
4227
4228                bytes = min(bytes, 4u);
4229
4230                exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4231
4232                exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4233
4234                vmcb->control.exit_info_1 = exit_info;
4235                vmcb->control.exit_info_2 = info->next_rip;
4236
4237                break;
4238        }
4239        default:
4240                break;
4241        }
4242
4243        /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4244        if (static_cpu_has(X86_FEATURE_NRIPS))
4245                vmcb->control.next_rip  = info->next_rip;
4246        vmcb->control.exit_code = icpt_info.exit_code;
4247        vmexit = nested_svm_exit_handled(svm);
4248
4249        ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4250                                           : X86EMUL_CONTINUE;
4251
4252out:
4253        return ret;
4254}
4255
4256static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4257{
4258}
4259
4260static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4261{
4262        if (!kvm_pause_in_guest(vcpu->kvm))
4263                shrink_ple_window(vcpu);
4264}
4265
4266static void svm_setup_mce(struct kvm_vcpu *vcpu)
4267{
4268        /* [63:9] are reserved. */
4269        vcpu->arch.mcg_cap &= 0x1ff;
4270}
4271
4272bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4273{
4274        struct vcpu_svm *svm = to_svm(vcpu);
4275
4276        /* Per APM Vol.2 15.22.2 "Response to SMI" */
4277        if (!gif_set(svm))
4278                return true;
4279
4280        return is_smm(vcpu);
4281}
4282
4283static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4284{
4285        struct vcpu_svm *svm = to_svm(vcpu);
4286        if (svm->nested.nested_run_pending)
4287                return -EBUSY;
4288
4289        /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4290        if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4291                return -EBUSY;
4292
4293        return !svm_smi_blocked(vcpu);
4294}
4295
4296static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
4297{
4298        struct vcpu_svm *svm = to_svm(vcpu);
4299        struct kvm_host_map map_save;
4300        int ret;
4301
4302        if (!is_guest_mode(vcpu))
4303                return 0;
4304
4305        /* FED8h - SVM Guest */
4306        put_smstate(u64, smstate, 0x7ed8, 1);
4307        /* FEE0h - SVM Guest VMCB Physical Address */
4308        put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
4309
4310        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4311        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4312        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4313
4314        ret = nested_svm_vmexit(svm);
4315        if (ret)
4316                return ret;
4317
4318        /*
4319         * KVM uses VMCB01 to store L1 host state while L2 runs, but
4320         * VMCB01 is going to be used during SMM and thus the state will
4321         * be lost. Temporarily save the non-VMLOAD/VMSAVE state to the
4322         * host save area pointed to by MSR_VM_HSAVE_PA. The APM guarantees
4323         * that the format of that area is identical to the guest save area
4324         * offset by 0x400 (which matches the offset of 'struct
4325         * vmcb_save_area' within 'struct vmcb'). Note: the HSAVE area may
4326         * also be used by the L1 hypervisor to save additional host context
4327         * (e.g. KVM does that, see svm_prepare_guest_switch()), which must
4328         * be preserved.
4329         */
4330        if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4331                         &map_save) == -EINVAL)
4332                return 1;
4333
4334        BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4335
4336        svm_copy_vmrun_state(map_save.hva + 0x400,
4337                             &svm->vmcb01.ptr->save);
4338
4339        kvm_vcpu_unmap(vcpu, &map_save, true);
4340        return 0;
4341}
4342
4343static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
4344{
4345        struct vcpu_svm *svm = to_svm(vcpu);
4346        struct kvm_host_map map, map_save;
4347        u64 saved_efer, vmcb12_gpa;
4348        struct vmcb *vmcb12;
4349        int ret;
4350
4351        if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4352                return 0;
4353
4354        /* Non-zero if SMI arrived while vCPU was in guest mode. */
4355        if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4356                return 0;
4357
4358        if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4359                return 1;
4360
4361        saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4362        if (!(saved_efer & EFER_SVME))
4363                return 1;
4364
4365        vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4366        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4367                return 1;
4368
4369        ret = 1;
4370        if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4371                goto unmap_map;
4372
4373        if (svm_allocate_nested(svm))
4374                goto unmap_save;
4375
4376        /*
4377         * Restore L1 host state from L1 HSAVE area as VMCB01 was
4378         * used during SMM (see svm_enter_smm())
4379         */
4380
4381        svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4382
4383        /*
4384         * Enter the nested guest now
4385         */
4386
4387        vmcb12 = map.hva;
4388        nested_load_control_from_vmcb12(svm, &vmcb12->control);
4389        ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4390
4391unmap_save:
4392        kvm_vcpu_unmap(vcpu, &map_save, true);
4393unmap_map:
4394        kvm_vcpu_unmap(vcpu, &map, true);
4395        return ret;
4396}
4397
4398static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4399{
4400        struct vcpu_svm *svm = to_svm(vcpu);
4401
4402        if (!gif_set(svm)) {
4403                if (vgif_enabled(svm))
4404                        svm_set_intercept(svm, INTERCEPT_STGI);
4405                /* STGI will cause a vm exit */
4406        } else {
4407                /* We must be in SMM; RSM will cause a vmexit anyway.  */
4408        }
4409}
4410
4411static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
4412{
4413        bool smep, smap, is_user;
4414        unsigned long cr4;
4415
4416        /*
4417         * When the guest is an SEV-ES guest, emulation is not possible.
4418         */
4419        if (sev_es_guest(vcpu->kvm))
4420                return false;
4421
4422        /*
4423         * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4424         *
4425         * Errata:
4426         * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
4427         * possible that CPU microcode implementing DecodeAssist will fail
4428         * to read bytes of instruction which caused #NPF. In this case,
4429         * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
4430         * return 0 instead of the correct guest instruction bytes.
4431         *
4432         * This happens because the CPU microcode that reads the instruction
4433         * bytes uses a special opcode which attempts the read with CPL=0
4434         * privileges. The microcode reads CS:RIP, and if it hits a SMAP
4435         * fault, it gives up and returns no instruction bytes.
4436         *
4437         * Detection:
4438         * We reach here when the CPU supports DecodeAssist, raised a #NPF,
4439         * and returned 0 in the GuestIntrBytes field of the VMCB.
4440         * First, the erratum can only be triggered when vCPU CR4.SMAP=1.
4441         * Second, if vCPU CR4.SMEP=1, the erratum can only be triggered
4442         * when vCPU CPL==3 (otherwise the guest would have taken a SMEP
4443         * fault instead of a #NPF).
4444         * Otherwise, with vCPU CR4.SMEP=0, it can be triggered at any CPL.
4445         * As most guests that enable SMAP also enable SMEP, use the above
4446         * logic to minimize false positives when detecting the erratum
4447         * while still handling all cases correctly.
4448         *
4449         * Workaround:
4450         * To determine what instruction the guest was executing, the hypervisor
4451         * will have to decode the instruction at the instruction pointer.
4452         *
4453         * In a non-SEV guest, the hypervisor can read guest memory to
4454         * decode the instruction when insn_len is zero, so return true to
4455         * indicate that decoding is possible.
4456         *
4457         * In an SEV guest, however, the guest memory is encrypted with a
4458         * guest-specific key and the hypervisor cannot decode the
4459         * instruction, so the erratum cannot be worked around. Print an
4460         * error and request that the guest be killed.
4461         */
4462        if (likely(!insn || insn_len))
4463                return true;
4464
4465        /*
4466         * If RIP is invalid, go ahead with emulation which will cause an
4467         * internal error exit.
4468         */
4469        if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
4470                return true;
4471
4472        cr4 = kvm_read_cr4(vcpu);
4473        smep = cr4 & X86_CR4_SMEP;
4474        smap = cr4 & X86_CR4_SMAP;
4475        is_user = svm_get_cpl(vcpu) == 3;
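            /*
             * Possible erratum hit: CR4.SMAP is set and either CR4.SMEP is
             * clear or the faulting access came from CPL 3 (see the
             * "Detection" logic above).
             */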
4476        if (smap && (!smep || is_user)) {
4477                if (!sev_guest(vcpu->kvm))
4478                        return true;
4479
4480                pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4481                kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4482        }
4483
4484        return false;
4485}
4486
4487static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4488{
4489        struct vcpu_svm *svm = to_svm(vcpu);
4490
4491        /*
4492         * TODO: The last condition latches INIT signals on the vCPU when
4493         * the vCPU is in guest mode and vmcb12 defines an INIT intercept.
4494         * To properly emulate the INIT intercept,
4495         * svm_check_nested_events() should call nested_svm_vmexit()
4496         * if an INIT signal is pending.
4497         */
4498        return !gif_set(svm) ||
4499                   (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
4500}
4501
4502static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4503{
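            /*
             * The register state of an SEV-ES vCPU lives in an encrypted VMSA
             * that KVM cannot modify, so SIPI delivery is deferred to the SEV
             * code; all other guests take the common path.
             */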
4504        if (!sev_es_guest(vcpu->kvm))
4505                return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4506
4507        sev_vcpu_deliver_sipi_vector(vcpu, vector);
4508}
4509
4510static void svm_vm_destroy(struct kvm *kvm)
4511{
4512        avic_vm_destroy(kvm);
4513        sev_vm_destroy(kvm);
4514}
4515
4516static int svm_vm_init(struct kvm *kvm)
4517{
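            /*
             * If PAUSE filtering is disabled (zero count or threshold), don't
             * intercept PAUSE at all and let the guest spin natively.
             */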
4518        if (!pause_filter_count || !pause_filter_thresh)
4519                kvm->arch.pause_in_guest = true;
4520
4521        if (enable_apicv) {
4522                int ret = avic_vm_init(kvm);
4523                if (ret)
4524                        return ret;
4525        }
4526
4527        return 0;
4528}
4529
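    /*
     * SVM implementations of the kvm_x86_ops callbacks.  Common x86 KVM code
     * dispatches into the SVM backend through this table; a NULL entry (e.g.
     * .update_emulated_instruction) means there is no SVM-specific hook.
     */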
4530static struct kvm_x86_ops svm_x86_ops __initdata = {
4531        .hardware_unsetup = svm_hardware_teardown,
4532        .hardware_enable = svm_hardware_enable,
4533        .hardware_disable = svm_hardware_disable,
4534        .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4535        .has_emulated_msr = svm_has_emulated_msr,
4536
4537        .vcpu_create = svm_create_vcpu,
4538        .vcpu_free = svm_free_vcpu,
4539        .vcpu_reset = svm_vcpu_reset,
4540
4541        .vm_size = sizeof(struct kvm_svm),
4542        .vm_init = svm_vm_init,
4543        .vm_destroy = svm_vm_destroy,
4544
4545        .prepare_guest_switch = svm_prepare_guest_switch,
4546        .vcpu_load = svm_vcpu_load,
4547        .vcpu_put = svm_vcpu_put,
4548        .vcpu_blocking = svm_vcpu_blocking,
4549        .vcpu_unblocking = svm_vcpu_unblocking,
4550
4551        .update_exception_bitmap = svm_update_exception_bitmap,
4552        .get_msr_feature = svm_get_msr_feature,
4553        .get_msr = svm_get_msr,
4554        .set_msr = svm_set_msr,
4555        .get_segment_base = svm_get_segment_base,
4556        .get_segment = svm_get_segment,
4557        .set_segment = svm_set_segment,
4558        .get_cpl = svm_get_cpl,
4559        .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4560        .set_cr0 = svm_set_cr0,
4561        .is_valid_cr4 = svm_is_valid_cr4,
4562        .set_cr4 = svm_set_cr4,
4563        .set_efer = svm_set_efer,
4564        .get_idt = svm_get_idt,
4565        .set_idt = svm_set_idt,
4566        .get_gdt = svm_get_gdt,
4567        .set_gdt = svm_set_gdt,
4568        .set_dr7 = svm_set_dr7,
4569        .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4570        .cache_reg = svm_cache_reg,
4571        .get_rflags = svm_get_rflags,
4572        .set_rflags = svm_set_rflags,
4573
4574        .tlb_flush_all = svm_flush_tlb,
4575        .tlb_flush_current = svm_flush_tlb,
4576        .tlb_flush_gva = svm_flush_tlb_gva,
4577        .tlb_flush_guest = svm_flush_tlb,
4578
4579        .run = svm_vcpu_run,
4580        .handle_exit = handle_exit,
4581        .skip_emulated_instruction = skip_emulated_instruction,
4582        .update_emulated_instruction = NULL,
4583        .set_interrupt_shadow = svm_set_interrupt_shadow,
4584        .get_interrupt_shadow = svm_get_interrupt_shadow,
4585        .patch_hypercall = svm_patch_hypercall,
4586        .set_irq = svm_set_irq,
4587        .set_nmi = svm_inject_nmi,
4588        .queue_exception = svm_queue_exception,
4589        .cancel_injection = svm_cancel_injection,
4590        .interrupt_allowed = svm_interrupt_allowed,
4591        .nmi_allowed = svm_nmi_allowed,
4592        .get_nmi_mask = svm_get_nmi_mask,
4593        .set_nmi_mask = svm_set_nmi_mask,
4594        .enable_nmi_window = svm_enable_nmi_window,
4595        .enable_irq_window = svm_enable_irq_window,
4596        .update_cr8_intercept = svm_update_cr8_intercept,
4597        .set_virtual_apic_mode = svm_set_virtual_apic_mode,
4598        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
4599        .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
4600        .load_eoi_exitmap = svm_load_eoi_exitmap,
4601        .hwapic_irr_update = svm_hwapic_irr_update,
4602        .hwapic_isr_update = svm_hwapic_isr_update,
4603        .sync_pir_to_irr = kvm_lapic_find_highest_irr,
4604        .apicv_post_state_restore = avic_post_state_restore,
4605
4606        .set_tss_addr = svm_set_tss_addr,
4607        .set_identity_map_addr = svm_set_identity_map_addr,
4608        .get_mt_mask = svm_get_mt_mask,
4609
4610        .get_exit_info = svm_get_exit_info,
4611
4612        .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4613
4614        .has_wbinvd_exit = svm_has_wbinvd_exit,
4615
4616        .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4617        .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4618        .write_tsc_offset = svm_write_tsc_offset,
4619        .write_tsc_multiplier = svm_write_tsc_multiplier,
4620
4621        .load_mmu_pgd = svm_load_mmu_pgd,
4622
4623        .check_intercept = svm_check_intercept,
4624        .handle_exit_irqoff = svm_handle_exit_irqoff,
4625
4626        .request_immediate_exit = __kvm_request_immediate_exit,
4627
4628        .sched_in = svm_sched_in,
4629
4630        .pmu_ops = &amd_pmu_ops,
4631        .nested_ops = &svm_nested_ops,
4632
4633        .deliver_posted_interrupt = svm_deliver_avic_intr,
4634        .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
4635        .update_pi_irte = svm_update_pi_irte,
4636        .setup_mce = svm_setup_mce,
4637
4638        .smi_allowed = svm_smi_allowed,
4639        .enter_smm = svm_enter_smm,
4640        .leave_smm = svm_leave_smm,
4641        .enable_smi_window = svm_enable_smi_window,
4642
4643        .mem_enc_op = svm_mem_enc_op,
4644        .mem_enc_reg_region = svm_register_enc_region,
4645        .mem_enc_unreg_region = svm_unregister_enc_region,
4646
4647        .vm_copy_enc_context_from = svm_vm_copy_asid_from,
4648
4649        .can_emulate_instruction = svm_can_emulate_instruction,
4650
4651        .apic_init_signal_blocked = svm_apic_init_signal_blocked,
4652
4653        .msr_filter_changed = svm_msr_filter_changed,
4654        .complete_emulated_msr = svm_complete_emulated_msr,
4655
4656        .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4657};
4658
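    /*
     * Init-time hooks handed to kvm_init(): probe for SVM support, check
     * whether it was disabled by the BIOS, and perform hardware setup.  Once
     * setup succeeds, common code operates through svm_x86_ops above.
     */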
4659static struct kvm_x86_init_ops svm_init_ops __initdata = {
4660        .cpu_has_kvm_support = has_svm,
4661        .disabled_by_bios = is_disabled,
4662        .hardware_setup = svm_hardware_setup,
4663        .check_processor_compatibility = svm_check_processor_compat,
4664
4665        .runtime_ops = &svm_x86_ops,
4666};
4667
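    /*
     * Module entry point: register the SVM backend with the KVM core.
     * kvm_init() drives hardware setup through svm_init_ops and uses the
     * size and alignment of struct vcpu_svm for vCPU allocations.
     */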
4668static int __init svm_init(void)
4669{
4670        __unused_size_checks();
4671
4672        return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
4673                        __alignof__(struct vcpu_svm), THIS_MODULE);
4674}
4675
4676static void __exit svm_exit(void)
4677{
4678        kvm_exit();
4679}
4680
4681module_init(svm_init)
4682module_exit(svm_exit)
4683