/* qemu/target/i386/whpx/whpx-all.c */
   1/*
   2 * QEMU Windows Hypervisor Platform accelerator (WHPX)
   3 *
   4 * Copyright Microsoft Corp. 2017
   5 *
   6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
   7 * See the COPYING file in the top-level directory.
   8 *
   9 */
  10
  11#include "qemu/osdep.h"
  12#include "cpu.h"
  13#include "exec/address-spaces.h"
  14#include "exec/ioport.h"
  15#include "exec/gdbstub.h"
  16#include "qemu/accel.h"
  17#include "sysemu/whpx.h"
  18#include "sysemu/cpus.h"
  19#include "sysemu/runstate.h"
  20#include "qemu/main-loop.h"
  21#include "hw/boards.h"
  22#include "hw/i386/ioapic.h"
  23#include "hw/i386/apic_internal.h"
  24#include "qemu/error-report.h"
  25#include "qapi/error.h"
  26#include "qapi/qapi-types-common.h"
  27#include "qapi/qapi-visit-common.h"
  28#include "migration/blocker.h"
  29#include <winerror.h>
  30
  31#include "whpx-internal.h"
  32#include "whpx-accel-ops.h"
  33
  34#include <WinHvPlatform.h>
  35#include <WinHvEmulation.h>
  36
  37#define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
  38
/*
 * Registers transferred between QEMU and the hypervisor in one batched
 * WHvSet/GetVirtualProcessorRegisters() call.  The order of this array is
 * significant: whpx_set_registers()/whpx_get_registers() walk it with a
 * running index and assert the expected name at each step, and the first 16
 * entries line up with QEMU's env->regs[] indexes.  Do not reorder without
 * updating those functions.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers - intentionally not synced (see the "Skipped"
     * sections in whpx_set_registers()/whpx_get_registers()). */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
 146
/* Value buffer sized to match whpx_register_names[] entry-for-entry. */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
 150
 151/*
 152 * The current implementation of instruction stepping sets the TF flag
 153 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 154 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 155 *
 156 * This approach has a few limitations:
 157 *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
 158 *        along with the other flags, possibly restoring it later. It would
 159 *        result in another INT1 when the flags are restored, triggering
 160 *        a stop in gdb that could be cleared by doing another step.
 161 *
 162 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 163 *        TF flags, ending the stepping mode.
 164 *
 165 *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 166 *        or anything that could result in a page fault) will save the flags
 167 *        to the stack, clear the TF flag, and let the guest execute the
 168 *        handler. Normally, the guest will restore the original flags,
 169 *        that will continue single-stepping.
 170 *
 171 *     3. Debuggers running on the guest may wish to set TF to do instruction
 172 *        stepping. INT1 events generated by it would be intercepted by us,
 173 *        as long as the gdb is connected to QEMU.
 174 *
 175 * In practice this means that:
 176 *     1. Stepping through flags-modifying instructions may cause gdb to
 177 *        continue or stop in unexpected places. This will be fully recoverable
 178 *        and will not crash the target.
 179 *
 180 *     2. Stepping over an instruction that triggers an exception will step
 181 *        over the exception handler, not into it.
 182 *
 183 *     3. Debugging the guest via gdb, while running debugger on the guest
 184 *        at the same time may lead to unexpected effects. Removing all
 185 *        breakpoints set via QEMU will prevent any further interference
 186 *        with the guest-level debuggers.
 187 *
 188 * The limitations can be addressed as shown below:
 189 *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 190 *        stepping through them. The exact semantics of the instructions is
 191 *        defined in the "Combined Volume Set of Intel 64 and IA-32
 192 *        Architectures Software Developer's Manuals", however it involves a
 193 *        fair amount of corner cases due to compatibility with real mode,
 194 *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
 195 *
 196 *     2. We could step into the guest's exception handlers using the following
 197 *        sequence:
 198 *          a. Temporarily enable catching of all exception types via
 199 *             whpx_set_exception_exit_bitmap().
 200 *          b. Once an exception is intercepted, read the IDT/GDT and locate
 201 *             the original handler.
 202 *          c. Patch the original handler, injecting an INT3 at the beginning.
 203 *          d. Update the exception exit bitmap to only catch the
 204 *             WHvX64ExceptionTypeBreakpointTrap exception.
 205 *          e. Let the affected CPU run in the exclusive mode.
 206 *          f. Restore the original handler and the exception exit bitmap.
 207 *        Note that handling all corner cases related to IDT/GDT is harder
 208 *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
 209 *        rough idea.
 210 *
 211 *     3. In order to properly support guest-level debugging in parallel with
 212 *        the QEMU-level debugging, we would need to be able to pass some INT1
 213 *        events to the guest. This could be done via the following methods:
 214 *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
 215 *             it seems to only work for interrupts and not software
 216 *             exceptions.
 217 *          b. Locating and patching the original handler by parsing IDT/GDT.
 218 *             This involves relatively complex logic outlined in the previous
 219 *             paragraph.
 220 *          c. Emulating the exception invocation (i.e. manually updating RIP,
 221 *             RFLAGS, and pushing the old values to stack). This is even more
 222 *             complicated than the previous option, since it involves checking
 223 *             CPL, gate attributes, and doing various adjustments depending
 224 *             on the current CPU mode, whether the CPL is changing, etc.
 225 */
/* Single-instruction stepping strategies (see the discussion above). */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
 231
/* Per-vCPU WHPX state, reachable from CPUState via get_whpx_vcpu(). */
struct whpx_vcpu {
    /* Instruction emulator instance used for MMIO/port-I/O exits. */
    WHV_EMULATOR_HANDLE emulator;
    /* NOTE(review): presumably tracks a requested interrupt-window exit —
     * confirm against the run-loop code outside this chunk. */
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* Cached CR8-encoded TPR, kept in sync with the APIC by
     * whpx_set_registers()/whpx_get_registers(). */
    uint64_t tpr;
    /* Cached APIC base MSR value last pushed to / pulled from the HV. */
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
 244
/* NOTE(review): set during accelerator probing/initialization outside this
 * chunk — the exact writers are not visible here. */
static bool whpx_allowed;
static bool whp_dispatch_initialized;
/* Handles for the dynamically loaded WinHvPlatform / WinHvEmulation DLLs. */
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capabilities reported for the partition; queried by whpx_has_xsave(). */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/* Global partition state and resolved WinHv* entry points. */
struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;
 253
 254static bool whpx_has_xsave(void)
 255{
 256    return whpx_xsave_cap.XsaveSupport;
 257}
 258
 259/*
 260 * VP support
 261 */
 262
 263static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
 264{
 265    return (struct whpx_vcpu *)cpu->hax_vcpu;
 266}
 267
/*
 * Translate a QEMU segment descriptor into the WHPX representation.
 *
 * @qs:  QEMU segment cache to translate.
 * @v86: non-zero when the vCPU is in virtual-8086 mode.
 * @r86: non-zero when the vCPU is in real mode (CR0.PE clear).
 *
 * NOTE: hs.Attributes overlays the individual attribute bitfields
 * (SegmentType, Present, ...), so zeroing Attributes must stay before
 * the bitfield assignments in the v86 path.
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        /* Synthesize a present, DPL3, writable data segment for v86 mode. */
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        /* QEMU stores the attribute bits shifted up by DESC_TYPE_SHIFT. */
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}
 295
 296static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
 297{
 298    SegmentCache qs;
 299
 300    qs.base = hs->Base;
 301    qs.limit = hs->Limit;
 302    qs.selector = hs->Selector;
 303
 304    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
 305
 306    return qs;
 307}
 308
 309/* X64 Extended Control Registers */
 310static void whpx_set_xcrs(CPUState *cpu)
 311{
 312    CPUX86State *env = cpu->env_ptr;
 313    HRESULT hr;
 314    struct whpx_state *whpx = &whpx_global;
 315    WHV_REGISTER_VALUE xcr0;
 316    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
 317
 318    if (!whpx_has_xsave()) {
 319        return;
 320    }
 321
 322    /* Only xcr0 is supported by the hypervisor currently */
 323    xcr0.Reg64 = env->xcr0;
 324    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
 325        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
 326    if (FAILED(hr)) {
 327        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
 328    }
 329}
 330
 331static int whpx_set_tsc(CPUState *cpu)
 332{
 333    CPUX86State *env = cpu->env_ptr;
 334    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
 335    WHV_REGISTER_VALUE tsc_val;
 336    HRESULT hr;
 337    struct whpx_state *whpx = &whpx_global;
 338
 339    /*
 340     * Suspend the partition prior to setting the TSC to reduce the variance
 341     * in TSC across vCPUs. When the first vCPU runs post suspend, the
 342     * partition is automatically resumed.
 343     */
 344    if (whp_dispatch.WHvSuspendPartitionTime) {
 345
 346        /*
 347         * Unable to suspend partition while setting TSC is not a fatal
 348         * error. It just increases the likelihood of TSC variance between
 349         * vCPUs and some guest OS are able to handle that just fine.
 350         */
 351        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
 352        if (FAILED(hr)) {
 353            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
 354        }
 355    }
 356
 357    tsc_val.Reg64 = env->tsc;
 358    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
 359        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
 360    if (FAILED(hr)) {
 361        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
 362        return -1;
 363    }
 364
 365    return 0;
 366}
 367
 368/*
 369 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 370 * however, they use a slightly different encoding. Specifically:
 371 *
 372 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 373 *
 374 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 375 * and IA-32 Architectures Software Developer's Manual.
 376 *
 377 * The functions below translate the value of CR8 to TPR and vice versa.
 378 */
 379
 380static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
 381{
 382    return tpr >> 4;
 383}
 384
 385static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
 386{
 387    return cr8 << 4;
 388}
 389
/*
 * Push the QEMU CPU state (GPRs, RIP/RFLAGS, segments, tables, control
 * registers, FP/XMM state, and MSRs) into the hypervisor's vCPU in one
 * batched WHvSetVirtualProcessorRegisters() call.
 *
 * @cpu:   vCPU whose state is written; must be stopped or be the caller's
 *         own vCPU thread (asserted below).
 * @level: update depth; at WHPX_SET_RESET_STATE and above, MSRs with guest
 *         side effects (currently the TSC) are written as well.
 *
 * The walk below must visit registers in exactly the order of
 * whpx_register_names[]; the asserts check the running index against the
 * expected name at every step.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    /* Mode flags control how segment registers are translated below. */
    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    /* Skip past all 16 slots even if CPU_NB_REGS is smaller (32-bit). */
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Recombine QEMU's split FPU status: TOP goes back into bits 13:11. */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        /* QEMU fptags uses 1 = empty; the HV tag bit uses 1 = valid. */
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }

    return;
}
 565
 566static int whpx_get_tsc(CPUState *cpu)
 567{
 568    CPUX86State *env = cpu->env_ptr;
 569    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
 570    WHV_REGISTER_VALUE tsc_val;
 571    HRESULT hr;
 572    struct whpx_state *whpx = &whpx_global;
 573
 574    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
 575        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
 576    if (FAILED(hr)) {
 577        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
 578        return -1;
 579    }
 580
 581    env->tsc = tsc_val.Reg64;
 582    return 0;
 583}
 584
 585/* X64 Extended Control Registers */
 586static void whpx_get_xcrs(CPUState *cpu)
 587{
 588    CPUX86State *env = cpu->env_ptr;
 589    HRESULT hr;
 590    struct whpx_state *whpx = &whpx_global;
 591    WHV_REGISTER_VALUE xcr0;
 592    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
 593
 594    if (!whpx_has_xsave()) {
 595        return;
 596    }
 597
 598    /* Only xcr0 is supported by the hypervisor currently */
 599    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
 600        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
 601    if (FAILED(hr)) {
 602        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
 603        return;
 604    }
 605
 606    env->xcr0 = xcr0.Reg64;
 607}
 608
/*
 * Pull the vCPU state (GPRs, RIP/RFLAGS, segments, tables, control
 * registers, FP/XMM state, and MSRs) from the hypervisor into the QEMU
 * CPU state in one batched WHvGetVirtualProcessorRegisters() call, then
 * propagate TPR/APIC-base changes to the emulated APIC.
 *
 * @cpu: vCPU whose state is read; must be stopped or be the caller's own
 *       vCPU thread (asserted below).
 *
 * The walk below must visit registers in exactly the order of
 * whpx_register_names[]; the asserts check the running index against the
 * expected name at every step.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        /* While the VM runs, the cached TSC goes stale immediately. */
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    /* Skip past all 16 slots even if CPU_NB_REGS is smaller (32-bit). */
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    /* Only forward CR8/TPR to the APIC when it actually changed. */
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Split the FPU status word: TOP (bits 13:11) is tracked separately. */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        /* The HV tag bit uses 1 = valid; QEMU fptags uses 1 = empty. */
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    /* Only forward the APIC base when it actually changed. */
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);

    return;
}
 789
/*
 * Emulator callback: carry out a guest port-I/O access against QEMU's
 * I/O address space.  Port, width, and direction come from the access
 * record; data is read from / written into IoAccess->Data in place.
 */
static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}
 800
/*
 * Emulator callback: carry out a guest MMIO access against QEMU's
 * physical memory.  Data is read from / written into ma->Data in place.
 */
static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}
 809
 810static HRESULT CALLBACK whpx_emu_getreg_callback(
 811    void *ctx,
 812    const WHV_REGISTER_NAME *RegisterNames,
 813    UINT32 RegisterCount,
 814    WHV_REGISTER_VALUE *RegisterValues)
 815{
 816    HRESULT hr;
 817    struct whpx_state *whpx = &whpx_global;
 818    CPUState *cpu = (CPUState *)ctx;
 819
 820    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
 821        whpx->partition, cpu->cpu_index,
 822        RegisterNames, RegisterCount,
 823        RegisterValues);
 824    if (FAILED(hr)) {
 825        error_report("WHPX: Failed to get virtual processor registers,"
 826                     " hr=%08lx", hr);
 827    }
 828
 829    return hr;
 830}
 831
 832static HRESULT CALLBACK whpx_emu_setreg_callback(
 833    void *ctx,
 834    const WHV_REGISTER_NAME *RegisterNames,
 835    UINT32 RegisterCount,
 836    const WHV_REGISTER_VALUE *RegisterValues)
 837{
 838    HRESULT hr;
 839    struct whpx_state *whpx = &whpx_global;
 840    CPUState *cpu = (CPUState *)ctx;
 841
 842    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
 843        whpx->partition, cpu->cpu_index,
 844        RegisterNames, RegisterCount,
 845        RegisterValues);
 846    if (FAILED(hr)) {
 847        error_report("WHPX: Failed to set virtual processor registers,"
 848                     " hr=%08lx", hr);
 849    }
 850
 851    /*
 852     * The emulator just successfully wrote the register state. We clear the
 853     * dirty state so we avoid the double write on resume of the VP.
 854     */
 855    cpu->vcpu_dirty = false;
 856
 857    return hr;
 858}
 859
 860static HRESULT CALLBACK whpx_emu_translate_callback(
 861    void *ctx,
 862    WHV_GUEST_VIRTUAL_ADDRESS Gva,
 863    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
 864    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
 865    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
 866{
 867    HRESULT hr;
 868    struct whpx_state *whpx = &whpx_global;
 869    CPUState *cpu = (CPUState *)ctx;
 870    WHV_TRANSLATE_GVA_RESULT res;
 871
 872    hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
 873                                      Gva, TranslateFlags, &res, Gpa);
 874    if (FAILED(hr)) {
 875        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
 876    } else {
 877        *TranslationResult = res.ResultCode;
 878    }
 879
 880    return hr;
 881}
 882
/*
 * Callback table registered with the WHPX instruction emulator; it routes
 * the emulator's port-I/O, MMIO, register, and GVA-translation requests
 * back into QEMU.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
 891
 892static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
 893{
 894    HRESULT hr;
 895    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
 896    WHV_EMULATOR_STATUS emu_status;
 897
 898    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
 899        vcpu->emulator, cpu,
 900        &vcpu->exit_ctx.VpContext, ctx,
 901        &emu_status);
 902    if (FAILED(hr)) {
 903        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
 904        return -1;
 905    }
 906
 907    if (!emu_status.EmulationSuccessful) {
 908        error_report("WHPX: Failed to emulate MMIO access with"
 909                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
 910        return -1;
 911    }
 912
 913    return 0;
 914}
 915
 916static int whpx_handle_portio(CPUState *cpu,
 917                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
 918{
 919    HRESULT hr;
 920    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
 921    WHV_EMULATOR_STATUS emu_status;
 922
 923    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
 924        vcpu->emulator, cpu,
 925        &vcpu->exit_ctx.VpContext, ctx,
 926        &emu_status);
 927    if (FAILED(hr)) {
 928        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
 929        return -1;
 930    }
 931
 932    if (!emu_status.EmulationSuccessful) {
 933        error_report("WHPX: Failed to emulate PortIO access with"
 934                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
 935        return -1;
 936    }
 937
 938    return 0;
 939}
 940
 941/*
 942 * Controls whether we should intercept various exceptions on the guest,
 943 * namely breakpoint/single-step events.
 944 *
 945 * The 'exceptions' argument accepts a bitmask, e.g:
 946 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 947 */
 948static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
 949{
 950    struct whpx_state *whpx = &whpx_global;
 951    WHV_PARTITION_PROPERTY prop = { 0, };
 952    HRESULT hr;
 953
 954    if (exceptions == whpx->exception_exit_bitmap) {
 955        return S_OK;
 956    }
 957
 958    prop.ExceptionExitBitmap = exceptions;
 959
 960    hr = whp_dispatch.WHvSetPartitionProperty(
 961        whpx->partition,
 962        WHvPartitionPropertyCodeExceptionExitBitmap,
 963        &prop,
 964        sizeof(WHV_PARTITION_PROPERTY));
 965
 966    if (SUCCEEDED(hr)) {
 967        whpx->exception_exit_bitmap = exceptions;
 968    }
 969
 970    return hr;
 971}
 972
 973
 974/*
 975 * This function is called before/after stepping over a single instruction.
 976 * It will update the CPU registers to arm/disarm the instruction stepping
 977 * accordingly.
 978 */
 979static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
 980    bool set,
 981    uint64_t *exit_context_rflags)
 982{
 983    WHV_REGISTER_NAME reg_name;
 984    WHV_REGISTER_VALUE reg_value;
 985    HRESULT hr;
 986    struct whpx_state *whpx = &whpx_global;
 987
 988    /*
 989     * If we are trying to step over a single instruction, we need to set the
 990     * TF bit in rflags. Otherwise, clear it.
 991     */
 992    reg_name = WHvX64RegisterRflags;
 993    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
 994        whpx->partition,
 995        cpu->cpu_index,
 996        &reg_name,
 997        1,
 998        &reg_value);
 999
1000    if (FAILED(hr)) {
1001        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
1002        return hr;
1003    }
1004
1005    if (exit_context_rflags) {
1006        assert(*exit_context_rflags == reg_value.Reg64);
1007    }
1008
1009    if (set) {
1010        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1011        reg_value.Reg64 |= TF_MASK;
1012    } else {
1013        reg_value.Reg64 &= ~TF_MASK;
1014    }
1015
1016    if (exit_context_rflags) {
1017        *exit_context_rflags = reg_value.Reg64;
1018    }
1019
1020    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1021        whpx->partition,
1022        cpu->cpu_index,
1023        &reg_name,
1024        1,
1025        &reg_value);
1026
1027    if (FAILED(hr)) {
1028        error_report("WHPX: Failed to set rflags,"
1029            " hr=%08lx",
1030            hr);
1031        return hr;
1032    }
1033
1034    reg_name = WHvRegisterInterruptState;
1035    reg_value.Reg64 = 0;
1036
1037    /* Suspend delivery of hardware interrupts during single-stepping. */
1038    reg_value.InterruptState.InterruptShadow = set != 0;
1039
1040    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1041    whpx->partition,
1042        cpu->cpu_index,
1043        &reg_name,
1044        1,
1045        &reg_value);
1046
1047    if (FAILED(hr)) {
1048        error_report("WHPX: Failed to set InterruptState,"
1049            " hr=%08lx",
1050            hr);
1051        return hr;
1052    }
1053
1054    if (!set) {
1055        /*
1056         * We have just finished stepping over a single instruction,
1057         * and intercepted the INT1 generated by it.
1058         * We need to now hide the INT1 from the guest,
1059         * as it would not be expecting it.
1060         */
1061
1062        reg_name = WHvX64RegisterPendingDebugException;
1063        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1064        whpx->partition,
1065            cpu->cpu_index,
1066            &reg_name,
1067            1,
1068            &reg_value);
1069
1070        if (FAILED(hr)) {
1071            error_report("WHPX: Failed to get pending debug exceptions,"
1072                         "hr=%08lx", hr);
1073            return hr;
1074        }
1075
1076        if (reg_value.PendingDebugException.SingleStep) {
1077            reg_value.PendingDebugException.SingleStep = 0;
1078
1079            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1080                whpx->partition,
1081                cpu->cpu_index,
1082                &reg_name,
1083                1,
1084                &reg_value);
1085
1086            if (FAILED(hr)) {
1087                error_report("WHPX: Failed to clear pending debug exceptions,"
1088                             "hr=%08lx", hr);
1089             return hr;
1090            }
1091        }
1092
1093    }
1094
1095    return S_OK;
1096}
1097
1098/* Tries to find a breakpoint at the specified address. */
1099static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1100{
1101    struct whpx_state *whpx = &whpx_global;
1102    int i;
1103
1104    if (whpx->breakpoints.breakpoints) {
1105        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1106            if (address == whpx->breakpoints.breakpoints->data[i].address) {
1107                return &whpx->breakpoints.breakpoints->data[i];
1108            }
1109        }
1110    }
1111
1112    return NULL;
1113}
1114
1115/*
1116 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1117 * debugging user-mode applications. Since the WHPX API does not offer
1118 * an easy way to pass the intercepted exception back to the guest, we
1119 * resort to using INT1 instead, and let the guest always handle INT3.
1120 */
1121static const uint8_t whpx_breakpoint_instruction = 0xF1;
1122
1123/*
1124 * The WHPX QEMU backend implements breakpoints by writing the INT1
1125 * instruction into memory (ignoring the DRx registers). This raises a few
1126 * issues that need to be carefully handled:
1127 *
1128 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1129 *    at the same location, and later remove them in arbitrary order.
1130 *    This should not cause memory corruption, and should only remove the
1131 *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1132 *
1133 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1134 *    physical location. Hence, physically adding/removing a breakpoint can
1135 *    theoretically fail at any time. We need to keep track of it.
1136 *
1137 * The function below rebuilds a list of low-level breakpoints (one per
1138 * address, tracking the original instruction and any errors) from the list of
1139 * high-level breakpoints (set via cpu_breakpoint_insert()).
1140 *
1141 * In order to optimize performance, this function stores the list of
1142 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1143 * low-level ones, so that it won't be re-invoked until these breakpoints
1144 * change.
1145 *
1146 * Note that this function decides which breakpoints should be inserted into,
1147 * memory, but doesn't actually do it. The memory accessing is done in
1148 * whpx_apply_breakpoints().
1149 */
1150static void whpx_translate_cpu_breakpoints(
1151    struct whpx_breakpoints *breakpoints,
1152    CPUState *cpu,
1153    int cpu_breakpoint_count)
1154{
1155    CPUBreakpoint *bp;
1156    int cpu_bp_index = 0;
1157
1158    breakpoints->original_addresses =
1159        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
1160
1161    breakpoints->original_address_count = cpu_breakpoint_count;
1162
1163    int max_breakpoints = cpu_breakpoint_count +
1164        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
1165
1166    struct whpx_breakpoint_collection *new_breakpoints =
1167        (struct whpx_breakpoint_collection *)g_malloc0(
1168        sizeof(struct whpx_breakpoint_collection) +
1169            max_breakpoints * sizeof(struct whpx_breakpoint));
1170
1171    new_breakpoints->allocated = max_breakpoints;
1172    new_breakpoints->used = 0;
1173
1174    /*
1175     * 1. Preserve all old breakpoints that could not be automatically
1176     * cleared when the CPU got stopped.
1177     */
1178    if (breakpoints->breakpoints) {
1179        int i;
1180        for (i = 0; i < breakpoints->breakpoints->used; i++) {
1181            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
1182                new_breakpoints->data[new_breakpoints->used++] =
1183                    breakpoints->breakpoints->data[i];
1184            }
1185        }
1186    }
1187
1188    /* 2. Map all CPU breakpoints to WHPX breakpoints */
1189    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1190        int i;
1191        bool found = false;
1192
1193        /* This will be used to detect changed CPU breakpoints later. */
1194        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
1195
1196        for (i = 0; i < new_breakpoints->used; i++) {
1197            /*
1198             * WARNING: This loop has O(N^2) complexity, where N is the
1199             * number of breakpoints. It should not be a bottleneck in
1200             * real-world scenarios, since it only needs to run once after
1201             * the breakpoints have been modified.
1202             * If this ever becomes a concern, it can be optimized by storing
1203             * high-level breakpoint objects in a tree or hash map.
1204             */
1205
1206            if (new_breakpoints->data[i].address == bp->pc) {
1207                /* There was already a breakpoint at this address. */
1208                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
1209                    new_breakpoints->data[i].state = WHPX_BP_SET;
1210                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
1211                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
1212                }
1213
1214                found = true;
1215                break;
1216            }
1217        }
1218
1219        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
1220            /* No WHPX breakpoint at this address. Create one. */
1221            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
1222            new_breakpoints->data[new_breakpoints->used].state =
1223                WHPX_BP_SET_PENDING;
1224            new_breakpoints->used++;
1225        }
1226    }
1227
1228    if (breakpoints->breakpoints) {
1229        /*
1230         * Free the previous breakpoint list. This can be optimized by keeping
1231         * it as shadow buffer for the next computation instead of freeing
1232         * it immediately.
1233         */
1234        g_free(breakpoints->breakpoints);
1235    }
1236
1237    breakpoints->breakpoints = new_breakpoints;
1238}
1239
1240/*
1241 * Physically inserts/removes the breakpoints by reading and writing the
1242 * physical memory, keeping a track of the failed attempts.
1243 *
1244 * Passing resuming=true  will try to set all previously unset breakpoints.
1245 * Passing resuming=false will remove all inserted ones.
1246 */
1247static void whpx_apply_breakpoints(
1248    struct whpx_breakpoint_collection *breakpoints,
1249    CPUState *cpu,
1250    bool resuming)
1251{
1252    int i, rc;
1253    if (!breakpoints) {
1254        return;
1255    }
1256
1257    for (i = 0; i < breakpoints->used; i++) {
1258        /* Decide what to do right now based on the last known state. */
1259        WhpxBreakpointState state = breakpoints->data[i].state;
1260        switch (state) {
1261        case WHPX_BP_CLEARED:
1262            if (resuming) {
1263                state = WHPX_BP_SET_PENDING;
1264            }
1265            break;
1266        case WHPX_BP_SET_PENDING:
1267            if (!resuming) {
1268                state = WHPX_BP_CLEARED;
1269            }
1270            break;
1271        case WHPX_BP_SET:
1272            if (!resuming) {
1273                state = WHPX_BP_CLEAR_PENDING;
1274            }
1275            break;
1276        case WHPX_BP_CLEAR_PENDING:
1277            if (resuming) {
1278                state = WHPX_BP_SET;
1279            }
1280            break;
1281        }
1282
1283        if (state == WHPX_BP_SET_PENDING) {
1284            /* Remember the original instruction. */
1285            rc = cpu_memory_rw_debug(cpu,
1286                breakpoints->data[i].address,
1287                &breakpoints->data[i].original_instruction,
1288                1,
1289                false);
1290
1291            if (!rc) {
1292                /* Write the breakpoint instruction. */
1293                rc = cpu_memory_rw_debug(cpu,
1294                    breakpoints->data[i].address,
1295                    (void *)&whpx_breakpoint_instruction,
1296                    1,
1297                    true);
1298            }
1299
1300            if (!rc) {
1301                state = WHPX_BP_SET;
1302            }
1303
1304        }
1305
1306        if (state == WHPX_BP_CLEAR_PENDING) {
1307            /* Restore the original instruction. */
1308            rc = cpu_memory_rw_debug(cpu,
1309                breakpoints->data[i].address,
1310                &breakpoints->data[i].original_instruction,
1311                1,
1312                true);
1313
1314            if (!rc) {
1315                state = WHPX_BP_CLEARED;
1316            }
1317        }
1318
1319        breakpoints->data[i].state = state;
1320    }
1321}
1322
1323/*
1324 * This function is called when the a VCPU is about to start and no other
1325 * VCPUs have been started so far. Since the VCPU start order could be
1326 * arbitrary, it doesn't have to be VCPU#0.
1327 *
1328 * It is used to commit the breakpoints into memory, and configure WHPX
1329 * to intercept debug exceptions.
1330 *
1331 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1332 * more VCPUs are already running, so this is the best place to do it.
1333 */
1334static int whpx_first_vcpu_starting(CPUState *cpu)
1335{
1336    struct whpx_state *whpx = &whpx_global;
1337    HRESULT hr;
1338
1339    g_assert(qemu_mutex_iothread_locked());
1340
1341    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1342            (whpx->breakpoints.breakpoints &&
1343             whpx->breakpoints.breakpoints->used)) {
1344        CPUBreakpoint *bp;
1345        int i = 0;
1346        bool update_pending = false;
1347
1348        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1349            if (i >= whpx->breakpoints.original_address_count ||
1350                bp->pc != whpx->breakpoints.original_addresses[i]) {
1351                update_pending = true;
1352            }
1353
1354            i++;
1355        }
1356
1357        if (i != whpx->breakpoints.original_address_count) {
1358            update_pending = true;
1359        }
1360
1361        if (update_pending) {
1362            /*
1363             * The CPU breakpoints have changed since the last call to
1364             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1365             * now be recomputed.
1366             */
1367            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1368        }
1369
1370        /* Actually insert the breakpoints into the memory. */
1371        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1372    }
1373
1374    uint64_t exception_mask;
1375    if (whpx->step_pending ||
1376        (whpx->breakpoints.breakpoints &&
1377         whpx->breakpoints.breakpoints->used)) {
1378        /*
1379         * We are either attempting to single-step one or more CPUs, or
1380         * have one or more breakpoints enabled. Both require intercepting
1381         * the WHvX64ExceptionTypeBreakpointTrap exception.
1382         */
1383
1384        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1385    } else {
1386        /* Let the guest handle all exceptions. */
1387        exception_mask = 0;
1388    }
1389
1390    hr = whpx_set_exception_exit_bitmap(exception_mask);
1391    if (!SUCCEEDED(hr)) {
1392        error_report("WHPX: Failed to update exception exit mask,"
1393                     "hr=%08lx.", hr);
1394        return 1;
1395    }
1396
1397    return 0;
1398}
1399
1400/*
1401 * This function is called when the last VCPU has finished running.
1402 * It is used to remove any previously set breakpoints from memory.
1403 */
1404static int whpx_last_vcpu_stopping(CPUState *cpu)
1405{
1406    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1407    return 0;
1408}
1409
1410/* Returns the address of the next instruction that is about to be executed. */
1411static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1412{
1413    if (cpu->vcpu_dirty) {
1414        /* The CPU registers have been modified by other parts of QEMU. */
1415        CPUArchState *env = (CPUArchState *)(cpu->env_ptr);
1416        return env->eip;
1417    } else if (exit_context_valid) {
1418        /*
1419         * The CPU registers have not been modified by neither other parts
1420         * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1421         * This is the most common case.
1422         */
1423        struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1424        return vcpu->exit_ctx.VpContext.Rip;
1425    } else {
1426        /*
1427         * The CPU registers have been modified by a call to
1428         * WHvSetVirtualProcessorRegisters() and must be re-queried from
1429         * the target.
1430         */
1431        WHV_REGISTER_VALUE reg_value;
1432        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1433        HRESULT hr;
1434        struct whpx_state *whpx = &whpx_global;
1435
1436        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1437            whpx->partition,
1438            cpu->cpu_index,
1439            &reg_name,
1440            1,
1441            &reg_value);
1442
1443        if (FAILED(hr)) {
1444            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1445            return 0;
1446        }
1447
1448        return reg_value.Reg64;
1449    }
1450}
1451
1452static int whpx_handle_halt(CPUState *cpu)
1453{
1454    CPUX86State *env = cpu->env_ptr;
1455    int ret = 0;
1456
1457    qemu_mutex_lock_iothread();
1458    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
1459          (env->eflags & IF_MASK)) &&
1460        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
1461        cpu->exception_index = EXCP_HLT;
1462        cpu->halted = true;
1463        ret = 1;
1464    }
1465    qemu_mutex_unlock_iothread();
1466
1467    return ret;
1468}
1469
1470static void whpx_vcpu_pre_run(CPUState *cpu)
1471{
1472    HRESULT hr;
1473    struct whpx_state *whpx = &whpx_global;
1474    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1475    CPUX86State *env = cpu->env_ptr;
1476    X86CPU *x86_cpu = X86_CPU(cpu);
1477    int irq;
1478    uint8_t tpr;
1479    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
1480    UINT32 reg_count = 0;
1481    WHV_REGISTER_VALUE reg_values[3];
1482    WHV_REGISTER_NAME reg_names[3];
1483
1484    memset(&new_int, 0, sizeof(new_int));
1485    memset(reg_values, 0, sizeof(reg_values));
1486
1487    qemu_mutex_lock_iothread();
1488
1489    /* Inject NMI */
1490    if (!vcpu->interruption_pending &&
1491        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
1492        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
1493            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
1494            vcpu->interruptable = false;
1495            new_int.InterruptionType = WHvX64PendingNmi;
1496            new_int.InterruptionPending = 1;
1497            new_int.InterruptionVector = 2;
1498        }
1499        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
1500            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
1501        }
1502    }
1503
1504    /*
1505     * Force the VCPU out of its inner loop to process any INIT requests or
1506     * commit pending TPR access.
1507     */
1508    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
1509        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
1510            !(env->hflags & HF_SMM_MASK)) {
1511            cpu->exit_request = 1;
1512        }
1513        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
1514            cpu->exit_request = 1;
1515        }
1516    }
1517
1518    /* Get pending hard interruption or replay one that was overwritten */
1519    if (!whpx_apic_in_platform()) {
1520        if (!vcpu->interruption_pending &&
1521            vcpu->interruptable && (env->eflags & IF_MASK)) {
1522            assert(!new_int.InterruptionPending);
1523            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1524                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1525                irq = cpu_get_pic_interrupt(env);
1526                if (irq >= 0) {
1527                    new_int.InterruptionType = WHvX64PendingInterrupt;
1528                    new_int.InterruptionPending = 1;
1529                    new_int.InterruptionVector = irq;
1530                }
1531            }
1532        }
1533
1534        /* Setup interrupt state if new one was prepared */
1535        if (new_int.InterruptionPending) {
1536            reg_values[reg_count].PendingInterruption = new_int;
1537            reg_names[reg_count] = WHvRegisterPendingInterruption;
1538            reg_count += 1;
1539        }
1540    } else if (vcpu->ready_for_pic_interrupt &&
1541               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
1542        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
1543        irq = cpu_get_pic_interrupt(env);
1544        if (irq >= 0) {
1545            reg_names[reg_count] = WHvRegisterPendingEvent;
1546            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
1547            {
1548                .EventPending = 1,
1549                .EventType = WHvX64PendingEventExtInt,
1550                .Vector = irq,
1551            };
1552            reg_count += 1;
1553        }
1554     }
1555
1556    /* Sync the TPR to the CR8 if was modified during the intercept */
1557    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
1558    if (tpr != vcpu->tpr) {
1559        vcpu->tpr = tpr;
1560        reg_values[reg_count].Reg64 = tpr;
1561        cpu->exit_request = 1;
1562        reg_names[reg_count] = WHvX64RegisterCr8;
1563        reg_count += 1;
1564    }
1565
1566    /* Update the state of the interrupt delivery notification */
1567    if (!vcpu->window_registered &&
1568        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
1569        reg_values[reg_count].DeliverabilityNotifications =
1570            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
1571                .InterruptNotification = 1
1572            };
1573        vcpu->window_registered = 1;
1574        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
1575        reg_count += 1;
1576    }
1577
1578    qemu_mutex_unlock_iothread();
1579    vcpu->ready_for_pic_interrupt = false;
1580
1581    if (reg_count) {
1582        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1583            whpx->partition, cpu->cpu_index,
1584            reg_names, reg_count, reg_values);
1585        if (FAILED(hr)) {
1586            error_report("WHPX: Failed to set interrupt state registers,"
1587                         " hr=%08lx", hr);
1588        }
1589    }
1590
1591    return;
1592}
1593
1594static void whpx_vcpu_post_run(CPUState *cpu)
1595{
1596    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1597    CPUX86State *env = cpu->env_ptr;
1598    X86CPU *x86_cpu = X86_CPU(cpu);
1599
1600    env->eflags = vcpu->exit_ctx.VpContext.Rflags;
1601
1602    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
1603    if (vcpu->tpr != tpr) {
1604        vcpu->tpr = tpr;
1605        qemu_mutex_lock_iothread();
1606        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
1607        qemu_mutex_unlock_iothread();
1608    }
1609
1610    vcpu->interruption_pending =
1611        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
1612
1613    vcpu->interruptable =
1614        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
1615
1616    return;
1617}
1618
/*
 * Services interrupt_request flags that must be handled in QEMU rather than
 * by the hypervisor: INIT, APIC poll, HLT wake-up, SIPI and TPR access
 * reporting. The order of the checks below is deliberate — keep it.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);

    /*
     * Handle an INIT request (ignored while in SMM). Registers are
     * synchronized from the hypervisor before do_cpu_init() runs.
     */
    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    /* Let the emulated APIC re-evaluate its pending interrupts. */
    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* Wake a halted CPU when a deliverable interrupt or an NMI is pending. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    /* Handle a pending SIPI; state is synchronized first, as for INIT. */
    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    /* Report a pending TPR access at env->eip back to the APIC. */
    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return;
}
1657
1658static int whpx_vcpu_run(CPUState *cpu)
1659{
1660    HRESULT hr;
1661    struct whpx_state *whpx = &whpx_global;
1662    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
1663    struct whpx_breakpoint *stepped_over_bp = NULL;
1664    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1665    int ret;
1666
1667    g_assert(qemu_mutex_iothread_locked());
1668
1669    if (whpx->running_cpus++ == 0) {
1670        /* Insert breakpoints into memory, update exception exit bitmap. */
1671        ret = whpx_first_vcpu_starting(cpu);
1672        if (ret != 0) {
1673            return ret;
1674        }
1675    }
1676
1677    if (whpx->breakpoints.breakpoints &&
1678        whpx->breakpoints.breakpoints->used > 0)
1679    {
1680        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1681        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1682        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1683            stepped_over_bp = NULL;
1684        }
1685
1686        if (stepped_over_bp) {
1687            /*
1688             * We are trying to run the instruction overwritten by an active
1689             * breakpoint. We will temporarily disable the breakpoint, suspend
1690             * other CPUs, and step over the instruction.
1691             */
1692            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1693        }
1694    }
1695
1696    if (exclusive_step_mode == WHPX_STEP_NONE) {
1697        whpx_vcpu_process_async_events(cpu);
1698        if (cpu->halted && !whpx_apic_in_platform()) {
1699            cpu->exception_index = EXCP_HLT;
1700            qatomic_set(&cpu->exit_request, false);
1701            return 0;
1702        }
1703    }
1704
1705    qemu_mutex_unlock_iothread();
1706
1707    if (exclusive_step_mode != WHPX_STEP_NONE) {
1708        start_exclusive();
1709        g_assert(cpu == current_cpu);
1710        g_assert(!cpu->running);
1711        cpu->running = true;
1712
1713        hr = whpx_set_exception_exit_bitmap(
1714            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1715        if (!SUCCEEDED(hr)) {
1716            error_report("WHPX: Failed to update exception exit mask, "
1717                         "hr=%08lx.", hr);
1718            return 1;
1719        }
1720
1721        if (stepped_over_bp) {
1722            /* Temporarily disable the triggered breakpoint. */
1723            cpu_memory_rw_debug(cpu,
1724                stepped_over_bp->address,
1725                &stepped_over_bp->original_instruction,
1726                1,
1727                true);
1728        }
1729    } else {
1730        cpu_exec_start(cpu);
1731    }
1732
1733    do {
1734        if (cpu->vcpu_dirty) {
1735            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1736            cpu->vcpu_dirty = false;
1737        }
1738
1739        if (exclusive_step_mode == WHPX_STEP_NONE) {
1740            whpx_vcpu_pre_run(cpu);
1741
1742            if (qatomic_read(&cpu->exit_request)) {
1743                whpx_vcpu_kick(cpu);
1744            }
1745        }
1746
1747        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1748            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1749        }
1750
1751        hr = whp_dispatch.WHvRunVirtualProcessor(
1752            whpx->partition, cpu->cpu_index,
1753            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1754
1755        if (FAILED(hr)) {
1756            error_report("WHPX: Failed to exec a virtual processor,"
1757                         " hr=%08lx", hr);
1758            ret = -1;
1759            break;
1760        }
1761
1762        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1763            whpx_vcpu_configure_single_stepping(cpu,
1764                false,
1765                &vcpu->exit_ctx.VpContext.Rflags);
1766        }
1767
1768        whpx_vcpu_post_run(cpu);
1769
1770        switch (vcpu->exit_ctx.ExitReason) {
1771        case WHvRunVpExitReasonMemoryAccess:
1772            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1773            break;
1774
1775        case WHvRunVpExitReasonX64IoPortAccess:
1776            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1777            break;
1778
1779        case WHvRunVpExitReasonX64InterruptWindow:
1780            vcpu->ready_for_pic_interrupt = 1;
1781            vcpu->window_registered = 0;
1782            ret = 0;
1783            break;
1784
1785        case WHvRunVpExitReasonX64ApicEoi:
1786            assert(whpx_apic_in_platform());
1787            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1788            break;
1789
1790        case WHvRunVpExitReasonX64Halt:
1791            /*
1792             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1793             * longer used.
1794             */
1795            ret = whpx_handle_halt(cpu);
1796            break;
1797
1798        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1799            WHV_INTERRUPT_CONTROL ipi = {0};
1800            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1801            uint32_t delivery_mode =
1802                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1803            int dest_shorthand =
1804                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1805            bool broadcast = false;
1806            bool include_self = false;
1807            uint32_t i;
1808
1809            /* We only registered for INIT and SIPI exits. */
1810            if ((delivery_mode != APIC_DM_INIT) &&
1811                (delivery_mode != APIC_DM_SIPI)) {
1812                error_report(
1813                    "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1814                break;
1815            }
1816
1817            if (delivery_mode == APIC_DM_INIT) {
1818                ipi.Type = WHvX64InterruptTypeInit;
1819            } else {
1820                ipi.Type = WHvX64InterruptTypeSipi;
1821            }
1822
1823            ipi.DestinationMode =
1824                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1825                    WHvX64InterruptDestinationModeLogical :
1826                    WHvX64InterruptDestinationModePhysical;
1827
1828            ipi.TriggerMode =
1829                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1830                    WHvX64InterruptTriggerModeLevel :
1831                    WHvX64InterruptTriggerModeEdge;
1832
1833            ipi.Vector = icr & APIC_VECTOR_MASK;
1834            switch (dest_shorthand) {
1835            /* no shorthand. Bits 56-63 contain the destination. */
1836            case 0:
1837                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1838                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1839                        &ipi, sizeof(ipi));
1840                if (FAILED(hr)) {
1841                    error_report("WHPX: Failed to request interrupt  hr=%08lx",
1842                        hr);
1843                }
1844
1845                break;
1846
1847            /* self */
1848            case 1:
1849                include_self = true;
1850                break;
1851
1852            /* broadcast, including self */
1853            case 2:
1854                broadcast = true;
1855                include_self = true;
1856                break;
1857
1858            /* broadcast, excluding self */
1859            case 3:
1860                broadcast = true;
1861                break;
1862            }
1863
1864            if (!broadcast && !include_self) {
1865                break;
1866            }
1867
1868            for (i = 0; i <= max_vcpu_index; i++) {
1869                if (i == cpu->cpu_index && !include_self) {
1870                    continue;
1871                }
1872
1873                /*
1874                 * Assuming that APIC Ids are identity mapped since
1875                 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1876                 * are not handled yet and the hypervisor doesn't allow the
1877                 * guest to modify the APIC ID.
1878                 */
1879                ipi.Destination = i;
1880                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1881                        &ipi, sizeof(ipi));
1882                if (FAILED(hr)) {
1883                    error_report(
1884                        "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1885                        i, hr);
1886                }
1887            }
1888
1889            break;
1890        }
1891
1892        case WHvRunVpExitReasonCanceled:
1893            if (exclusive_step_mode != WHPX_STEP_NONE) {
1894                /*
1895                 * We are trying to step over a single instruction, and
1896                 * likely got a request to stop from another thread.
1897                 * Delay it until we are done stepping
1898                 * over.
1899                 */
1900                ret = 0;
1901            } else {
1902                cpu->exception_index = EXCP_INTERRUPT;
1903                ret = 1;
1904            }
1905            break;
1906        case WHvRunVpExitReasonX64MsrAccess: {
1907            WHV_REGISTER_VALUE reg_values[3] = {0};
1908            WHV_REGISTER_NAME reg_names[3];
1909            UINT32 reg_count;
1910
1911            reg_names[0] = WHvX64RegisterRip;
1912            reg_names[1] = WHvX64RegisterRax;
1913            reg_names[2] = WHvX64RegisterRdx;
1914
1915            reg_values[0].Reg64 =
1916                vcpu->exit_ctx.VpContext.Rip +
1917                vcpu->exit_ctx.VpContext.InstructionLength;
1918
1919            /*
1920             * For all unsupported MSR access we:
1921             *     ignore writes
1922             *     return 0 on read.
1923             */
1924            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1925                        1 : 3;
1926
1927            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1928                whpx->partition,
1929                cpu->cpu_index,
1930                reg_names, reg_count,
1931                reg_values);
1932
1933            if (FAILED(hr)) {
1934                error_report("WHPX: Failed to set MsrAccess state "
1935                             " registers, hr=%08lx", hr);
1936            }
1937            ret = 0;
1938            break;
1939        }
1940        case WHvRunVpExitReasonX64Cpuid: {
1941            WHV_REGISTER_VALUE reg_values[5];
1942            WHV_REGISTER_NAME reg_names[5];
1943            UINT32 reg_count = 5;
1944            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1945            X86CPU *x86_cpu = X86_CPU(cpu);
1946            CPUX86State *env = &x86_cpu->env;
1947
1948            memset(reg_values, 0, sizeof(reg_values));
1949
1950            rip = vcpu->exit_ctx.VpContext.Rip +
1951                  vcpu->exit_ctx.VpContext.InstructionLength;
1952            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1953
1954            /*
1955             * Ideally, these should be supplied to the hypervisor during VCPU
1956             * initialization and it should be able to satisfy this request.
1957             * But, currently, WHPX doesn't support setting CPUID values in the
1958             * hypervisor once the partition has been setup, which is too late
1959             * since VCPUs are realized later. For now, use the values from
1960             * QEMU to satisfy these requests, until WHPX adds support for
1961             * being able to set these values in the hypervisor at runtime.
1962             */
1963            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1964                (UINT32 *)&rcx, (UINT32 *)&rdx);
1965            switch (cpuid_fn) {
1966            case 0x40000000:
1967                /* Expose the vmware cpu frequency cpuid leaf */
1968                rax = 0x40000010;
1969                rbx = rcx = rdx = 0;
1970                break;
1971
1972            case 0x40000010:
1973                rax = env->tsc_khz;
1974                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1975                rcx = rdx = 0;
1976                break;
1977
1978            case 0x80000001:
1979                /* Remove any support of OSVW */
1980                rcx &= ~CPUID_EXT3_OSVW;
1981                break;
1982            }
1983
1984            reg_names[0] = WHvX64RegisterRip;
1985            reg_names[1] = WHvX64RegisterRax;
1986            reg_names[2] = WHvX64RegisterRcx;
1987            reg_names[3] = WHvX64RegisterRdx;
1988            reg_names[4] = WHvX64RegisterRbx;
1989
1990            reg_values[0].Reg64 = rip;
1991            reg_values[1].Reg64 = rax;
1992            reg_values[2].Reg64 = rcx;
1993            reg_values[3].Reg64 = rdx;
1994            reg_values[4].Reg64 = rbx;
1995
1996            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1997                whpx->partition, cpu->cpu_index,
1998                reg_names,
1999                reg_count,
2000                reg_values);
2001
2002            if (FAILED(hr)) {
2003                error_report("WHPX: Failed to set CpuidAccess state registers,"
2004                             " hr=%08lx", hr);
2005            }
2006            ret = 0;
2007            break;
2008        }
2009        case WHvRunVpExitReasonException:
2010            whpx_get_registers(cpu);
2011
2012            if ((vcpu->exit_ctx.VpException.ExceptionType ==
2013                 WHvX64ExceptionTypeDebugTrapOrFault) &&
2014                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
2015                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
2016                 whpx_breakpoint_instruction)) {
2017                /* Stopped at a software breakpoint. */
2018                cpu->exception_index = EXCP_DEBUG;
2019            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
2020                        WHvX64ExceptionTypeDebugTrapOrFault) &&
2021                       !cpu->singlestep_enabled) {
2022                /*
2023                 * Just finished stepping over a breakpoint, but the
2024                 * gdb does not expect us to do single-stepping.
2025                 * Don't do anything special.
2026                 */
2027                cpu->exception_index = EXCP_INTERRUPT;
2028            } else {
2029                /* Another exception or debug event. Report it to GDB. */
2030                cpu->exception_index = EXCP_DEBUG;
2031            }
2032
2033            ret = 1;
2034            break;
2035        case WHvRunVpExitReasonNone:
2036        case WHvRunVpExitReasonUnrecoverableException:
2037        case WHvRunVpExitReasonInvalidVpRegisterValue:
2038        case WHvRunVpExitReasonUnsupportedFeature:
2039        default:
2040            error_report("WHPX: Unexpected VP exit code %d",
2041                         vcpu->exit_ctx.ExitReason);
2042            whpx_get_registers(cpu);
2043            qemu_mutex_lock_iothread();
2044            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2045            qemu_mutex_unlock_iothread();
2046            break;
2047        }
2048
2049    } while (!ret);
2050
2051    if (stepped_over_bp) {
2052        /* Restore the breakpoint we stepped over */
2053        cpu_memory_rw_debug(cpu,
2054            stepped_over_bp->address,
2055            (void *)&whpx_breakpoint_instruction,
2056            1,
2057            true);
2058    }
2059
2060    if (exclusive_step_mode != WHPX_STEP_NONE) {
2061        g_assert(cpu_in_exclusive_context(cpu));
2062        cpu->running = false;
2063        end_exclusive();
2064
2065        exclusive_step_mode = WHPX_STEP_NONE;
2066    } else {
2067        cpu_exec_end(cpu);
2068    }
2069
2070    qemu_mutex_lock_iothread();
2071    current_cpu = cpu;
2072
2073    if (--whpx->running_cpus == 0) {
2074        whpx_last_vcpu_stopping(cpu);
2075    }
2076
2077    qatomic_set(&cpu->exit_request, false);
2078
2079    return ret < 0;
2080}
2081
2082static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2083{
2084    if (!cpu->vcpu_dirty) {
2085        whpx_get_registers(cpu);
2086        cpu->vcpu_dirty = true;
2087    }
2088}
2089
2090static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
2091                                               run_on_cpu_data arg)
2092{
2093    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
2094    cpu->vcpu_dirty = false;
2095}
2096
2097static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
2098                                              run_on_cpu_data arg)
2099{
2100    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
2101    cpu->vcpu_dirty = false;
2102}
2103
2104static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2105                                               run_on_cpu_data arg)
2106{
2107    cpu->vcpu_dirty = true;
2108}
2109
2110/*
2111 * CPU support.
2112 */
2113
2114void whpx_cpu_synchronize_state(CPUState *cpu)
2115{
2116    if (!cpu->vcpu_dirty) {
2117        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2118    }
2119}
2120
2121void whpx_cpu_synchronize_post_reset(CPUState *cpu)
2122{
2123    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2124}
2125
2126void whpx_cpu_synchronize_post_init(CPUState *cpu)
2127{
2128    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2129}
2130
2131void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
2132{
2133    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2134}
2135
/* Record a pending single-step request in the global WHPX state. */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2140
2141/*
2142 * Vcpu support.
2143 */
2144
2145static Error *whpx_migration_blocker;
2146
2147static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2148{
2149    CPUX86State *env = opaque;
2150
2151    if (running) {
2152        env->tsc_valid = false;
2153    }
2154}
2155
2156int whpx_init_vcpu(CPUState *cpu)
2157{
2158    HRESULT hr;
2159    struct whpx_state *whpx = &whpx_global;
2160    struct whpx_vcpu *vcpu = NULL;
2161    Error *local_error = NULL;
2162    CPUX86State *env = cpu->env_ptr;
2163    X86CPU *x86_cpu = X86_CPU(cpu);
2164    UINT64 freq = 0;
2165    int ret;
2166
2167    /* Add migration blockers for all unsupported features of the
2168     * Windows Hypervisor Platform
2169     */
2170    if (whpx_migration_blocker == NULL) {
2171        error_setg(&whpx_migration_blocker,
2172               "State blocked due to non-migratable CPUID feature support,"
2173               "dirty memory tracking support, and XSAVE/XRSTOR support");
2174
2175        if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
2176            error_report_err(local_error);
2177            error_free(whpx_migration_blocker);
2178            ret = -EINVAL;
2179            goto error;
2180        }
2181    }
2182
2183    vcpu = g_new0(struct whpx_vcpu, 1);
2184
2185    if (!vcpu) {
2186        error_report("WHPX: Failed to allocte VCPU context.");
2187        ret = -ENOMEM;
2188        goto error;
2189    }
2190
2191    hr = whp_dispatch.WHvEmulatorCreateEmulator(
2192        &whpx_emu_callbacks,
2193        &vcpu->emulator);
2194    if (FAILED(hr)) {
2195        error_report("WHPX: Failed to setup instruction completion support,"
2196                     " hr=%08lx", hr);
2197        ret = -EINVAL;
2198        goto error;
2199    }
2200
2201    hr = whp_dispatch.WHvCreateVirtualProcessor(
2202        whpx->partition, cpu->cpu_index, 0);
2203    if (FAILED(hr)) {
2204        error_report("WHPX: Failed to create a virtual processor,"
2205                     " hr=%08lx", hr);
2206        whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2207        ret = -EINVAL;
2208        goto error;
2209    }
2210
2211    /*
2212     * vcpu's TSC frequency is either specified by user, or use the value
2213     * provided by Hyper-V if the former is not present. In the latter case, we
2214     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2215     * frequency can be migrated later via this field.
2216     */
2217    if (!env->tsc_khz) {
2218        hr = whp_dispatch.WHvGetCapability(
2219            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2220                NULL);
2221        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2222            if (FAILED(hr)) {
2223                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2224            } else {
2225                env->tsc_khz = freq / 1000; /* Hz to KHz */
2226            }
2227        }
2228    }
2229
2230    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2231    hr = whp_dispatch.WHvGetCapability(
2232        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2233    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2234        if (FAILED(hr)) {
2235            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2236        } else {
2237            env->apic_bus_freq = freq;
2238        }
2239    }
2240
2241    /*
2242     * If the vmware cpuid frequency leaf option is set, and we have a valid
2243     * tsc value, trap the corresponding cpuid's.
2244     */
2245    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2246        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2247
2248        hr = whp_dispatch.WHvSetPartitionProperty(
2249                whpx->partition,
2250                WHvPartitionPropertyCodeCpuidExitList,
2251                cpuidExitList,
2252                RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2253
2254        if (FAILED(hr)) {
2255            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2256                        hr);
2257            ret = -EINVAL;
2258            goto error;
2259        }
2260    }
2261
2262    vcpu->interruptable = true;
2263    cpu->vcpu_dirty = true;
2264    cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
2265    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2266    qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);
2267
2268    return 0;
2269
2270error:
2271    g_free(vcpu);
2272
2273    return ret;
2274}
2275
2276int whpx_vcpu_exec(CPUState *cpu)
2277{
2278    int ret;
2279    int fatal;
2280
2281    for (;;) {
2282        if (cpu->exception_index >= EXCP_INTERRUPT) {
2283            ret = cpu->exception_index;
2284            cpu->exception_index = -1;
2285            break;
2286        }
2287
2288        fatal = whpx_vcpu_run(cpu);
2289
2290        if (fatal) {
2291            error_report("WHPX: Failed to exec a virtual processor");
2292            abort();
2293        }
2294    }
2295
2296    return ret;
2297}
2298
2299void whpx_destroy_vcpu(CPUState *cpu)
2300{
2301    struct whpx_state *whpx = &whpx_global;
2302    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
2303
2304    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2305    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2306    g_free(cpu->hax_vcpu);
2307    return;
2308}
2309
2310void whpx_vcpu_kick(CPUState *cpu)
2311{
2312    struct whpx_state *whpx = &whpx_global;
2313    whp_dispatch.WHvCancelRunVirtualProcessor(
2314        whpx->partition, cpu->cpu_index, 0);
2315}
2316
2317/*
2318 * Memory support.
2319 */
2320
2321static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2322                                void *host_va, int add, int rom,
2323                                const char *name)
2324{
2325    struct whpx_state *whpx = &whpx_global;
2326    HRESULT hr;
2327
2328    /*
2329    if (add) {
2330        printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2331               (void*)start_pa, (void*)size, host_va,
2332               (rom ? "ROM" : "RAM"), name);
2333    } else {
2334        printf("WHPX: DEL PA:%p Size:%p, Host:%p,      '%s'\n",
2335               (void*)start_pa, (void*)size, host_va, name);
2336    }
2337    */
2338
2339    if (add) {
2340        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2341                                         host_va,
2342                                         start_pa,
2343                                         size,
2344                                         (WHvMapGpaRangeFlagRead |
2345                                          WHvMapGpaRangeFlagExecute |
2346                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2347    } else {
2348        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2349                                           start_pa,
2350                                           size);
2351    }
2352
2353    if (FAILED(hr)) {
2354        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2355                     " Host:%p, hr=%08lx",
2356                     (add ? "MAP" : "UNMAP"), name,
2357                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2358    }
2359}
2360
/*
 * Map (add=1) or unmap (add=0) the RAM covered by a memory region section,
 * after trimming it to host-page-aligned boundaries. Non-RAM regions and
 * sections smaller than one page after alignment are ignored.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    /* Only real RAM can be mapped into the partition. */
    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Round the start address up to the next host page boundary. */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Round the size down to a whole number of host pages. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    /* Host address of the (trimmed) start of the section. */
    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}
2391
/* New RAM section: take a reference, then map it into the partition. */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2398
/* RAM section removed: unmap it first, then drop the reference. */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2405
/* No transaction batching: mappings are applied directly in region_add/del. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2409
/* Nothing to flush at commit time; see whpx_transaction_begin(). */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2413
2414static void whpx_log_sync(MemoryListener *listener,
2415                         MemoryRegionSection *section)
2416{
2417    MemoryRegion *mr = section->mr;
2418
2419    if (!memory_region_is_ram(mr)) {
2420        return;
2421    }
2422
2423    memory_region_set_dirty(mr, 0, int128_get64(section->size));
2424}
2425
/*
 * Memory listener that mirrors QEMU's RAM layout into the WHPX partition's
 * guest-physical address space.
 */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = 10,
};
2435
/* Start tracking memory-map changes for the system address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
2440
2441/*
2442 * Load the functions from the given library, using the given handle. If a
2443 * handle is provided, it is used, otherwise the library is opened. The
2444 * handle will be updated on return with the opened one.
2445 */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    /*
     * Resolve the requested function list into whp_dispatch, loading the
     * backing DLL on first use. SUPPLEMENTAL symbols are optional and may
     * legitimately remain NULL on older Windows builds.
     */
    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    /*
     * NOTE(review): if *handle was already non-NULL on entry, this frees a
     * library the caller still holds a handle to without clearing *handle —
     * verify callers never reuse the handle after a failure here.
     */
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
2501
2502static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2503                                   const char *name, void *opaque,
2504                                   Error **errp)
2505{
2506    struct whpx_state *whpx = &whpx_global;
2507    OnOffSplit mode;
2508
2509    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2510        return;
2511    }
2512
2513    switch (mode) {
2514    case ON_OFF_SPLIT_ON:
2515        whpx->kernel_irqchip_allowed = true;
2516        whpx->kernel_irqchip_required = true;
2517        break;
2518
2519    case ON_OFF_SPLIT_OFF:
2520        whpx->kernel_irqchip_allowed = false;
2521        whpx->kernel_irqchip_required = false;
2522        break;
2523
2524    case ON_OFF_SPLIT_SPLIT:
2525        error_setg(errp, "WHPX: split irqchip currently not supported");
2526        error_append_hint(errp,
2527            "Try without kernel-irqchip or with kernel-irqchip=on|off");
2528        break;
2529
2530    default:
2531        /*
2532         * The value was checked in visit_type_OnOffSplit() above. If
2533         * we get here, then something is wrong in QEMU.
2534         */
2535        abort();
2536    }
2537}
2538
2539/*
2540 * Partition support
2541 */
2542
2543static int whpx_accel_init(MachineState *ms)
2544{
2545    struct whpx_state *whpx;
2546    int ret;
2547    HRESULT hr;
2548    WHV_CAPABILITY whpx_cap;
2549    UINT32 whpx_cap_size;
2550    WHV_PARTITION_PROPERTY prop;
2551    UINT32 cpuidExitList[] = {1, 0x80000001};
2552    WHV_CAPABILITY_FEATURES features = {0};
2553
2554    whpx = &whpx_global;
2555
2556    if (!init_whp_dispatch()) {
2557        ret = -ENOSYS;
2558        goto error;
2559    }
2560
2561    whpx->mem_quota = ms->ram_size;
2562
2563    hr = whp_dispatch.WHvGetCapability(
2564        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2565        sizeof(whpx_cap), &whpx_cap_size);
2566    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2567        error_report("WHPX: No accelerator found, hr=%08lx", hr);
2568        ret = -ENOSPC;
2569        goto error;
2570    }
2571
2572    hr = whp_dispatch.WHvGetCapability(
2573        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2574    if (FAILED(hr)) {
2575        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2576        ret = -EINVAL;
2577        goto error;
2578    }
2579
2580    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2581    if (FAILED(hr)) {
2582        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2583        ret = -EINVAL;
2584        goto error;
2585    }
2586
2587    /*
2588     * Query the XSAVE capability of the partition. Any error here is not
2589     * considered fatal.
2590     */
2591    hr = whp_dispatch.WHvGetPartitionProperty(
2592        whpx->partition,
2593        WHvPartitionPropertyCodeProcessorXsaveFeatures,
2594        &whpx_xsave_cap,
2595        sizeof(whpx_xsave_cap),
2596        &whpx_cap_size);
2597
2598    /*
2599     * Windows version which don't support this property will return with the
2600     * specific error code.
2601     */
2602    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2603        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2604    }
2605
2606    if (!whpx_has_xsave()) {
2607        printf("WHPX: Partition is not XSAVE capable\n");
2608    }
2609
2610    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2611    prop.ProcessorCount = ms->smp.cpus;
2612    hr = whp_dispatch.WHvSetPartitionProperty(
2613        whpx->partition,
2614        WHvPartitionPropertyCodeProcessorCount,
2615        &prop,
2616        sizeof(WHV_PARTITION_PROPERTY));
2617
2618    if (FAILED(hr)) {
2619        error_report("WHPX: Failed to set partition core count to %d,"
2620                     " hr=%08lx", ms->smp.cores, hr);
2621        ret = -EINVAL;
2622        goto error;
2623    }
2624
2625    /*
2626     * Error out if WHP doesn't support apic emulation and user is requiring
2627     * it.
2628     */
2629    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2630            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2631        error_report("WHPX: kernel irqchip requested, but unavailable. "
2632            "Try without kernel-irqchip or with kernel-irqchip=off");
2633        ret = -EINVAL;
2634        goto error;
2635    }
2636
2637    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2638        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2639        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2640            WHvX64LocalApicEmulationModeXApic;
2641        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2642        hr = whp_dispatch.WHvSetPartitionProperty(
2643            whpx->partition,
2644            WHvPartitionPropertyCodeLocalApicEmulationMode,
2645            &mode,
2646            sizeof(mode));
2647        if (FAILED(hr)) {
2648            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2649            if (whpx->kernel_irqchip_required) {
2650                error_report("WHPX: kernel irqchip requested, but unavailable");
2651                ret = -EINVAL;
2652                goto error;
2653            }
2654        } else {
2655            whpx->apic_in_platform = true;
2656        }
2657    }
2658
2659    /* Register for MSR and CPUID exits */
2660    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2661    prop.ExtendedVmExits.X64MsrExit = 1;
2662    prop.ExtendedVmExits.X64CpuidExit = 1;
2663    prop.ExtendedVmExits.ExceptionExit = 1;
2664    if (whpx_apic_in_platform()) {
2665        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2666    }
2667
2668    hr = whp_dispatch.WHvSetPartitionProperty(
2669            whpx->partition,
2670            WHvPartitionPropertyCodeExtendedVmExits,
2671            &prop,
2672            sizeof(WHV_PARTITION_PROPERTY));
2673    if (FAILED(hr)) {
2674        error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2675        ret = -EINVAL;
2676        goto error;
2677    }
2678
2679    hr = whp_dispatch.WHvSetPartitionProperty(
2680        whpx->partition,
2681        WHvPartitionPropertyCodeCpuidExitList,
2682        cpuidExitList,
2683        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2684
2685    if (FAILED(hr)) {
2686        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2687                     hr);
2688        ret = -EINVAL;
2689        goto error;
2690    }
2691
2692    /*
2693     * We do not want to intercept any exceptions from the guest,
2694     * until we actually start debugging with gdb.
2695     */
2696    whpx->exception_exit_bitmap = -1;
2697    hr = whpx_set_exception_exit_bitmap(0);
2698
2699    if (FAILED(hr)) {
2700        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2701        ret = -EINVAL;
2702        goto error;
2703    }
2704
2705    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2706    if (FAILED(hr)) {
2707        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2708        ret = -EINVAL;
2709        goto error;
2710    }
2711
2712    whpx_memory_init();
2713
2714    printf("Windows Hypervisor Platform accelerator is operational\n");
2715    return 0;
2716
2717error:
2718
2719    if (NULL != whpx->partition) {
2720        whp_dispatch.WHvDeletePartition(whpx->partition);
2721        whpx->partition = NULL;
2722    }
2723
2724    return ret;
2725}
2726
2727int whpx_enabled(void)
2728{
2729    return whpx_allowed;
2730}
2731
2732bool whpx_apic_in_platform(void) {
2733    return whpx_global.apic_in_platform;
2734}
2735
2736static void whpx_accel_class_init(ObjectClass *oc, void *data)
2737{
2738    AccelClass *ac = ACCEL_CLASS(oc);
2739    ac->name = "WHPX";
2740    ac->init_machine = whpx_accel_init;
2741    ac->allowed = &whpx_allowed;
2742
2743    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2744        NULL, whpx_set_kernel_irqchip,
2745        NULL, NULL);
2746    object_class_property_set_description(oc, "kernel-irqchip",
2747        "Configure WHPX in-kernel irqchip");
2748}
2749
2750static void whpx_accel_instance_init(Object *obj)
2751{
2752    struct whpx_state *whpx = &whpx_global;
2753
2754    memset(whpx, 0, sizeof(struct whpx_state));
2755    /* Turn on kernel-irqchip, by default */
2756    whpx->kernel_irqchip_allowed = true;
2757}
2758
/* QOM type descriptor for the "whpx" accelerator. */
static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};
2765
/* Register the WHPX accelerator type with the QOM type system. */
static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}
2770
2771bool init_whp_dispatch(void)
2772{
2773    if (whp_dispatch_initialized) {
2774        return true;
2775    }
2776
2777    if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2778        goto error;
2779    }
2780
2781    if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2782        goto error;
2783    }
2784
2785    assert(load_whp_dispatch_fns(&hWinHvPlatform,
2786        WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2787    whp_dispatch_initialized = true;
2788
2789    return true;
2790error:
2791    if (hWinHvPlatform) {
2792        FreeLibrary(hWinHvPlatform);
2793    }
2794
2795    if (hWinHvEmulation) {
2796        FreeLibrary(hWinHvEmulation);
2797    }
2798
2799    return false;
2800}
2801
/* Register the WHPX accelerator QOM type at QEMU startup. */
type_init(whpx_type_init);