qemu/hw/i386/x86.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2003-2004 Fabrice Bellard
   3 * Copyright (c) 2019 Red Hat, Inc.
   4 *
   5 * Permission is hereby granted, free of charge, to any person obtaining a copy
   6 * of this software and associated documentation files (the "Software"), to deal
   7 * in the Software without restriction, including without limitation the rights
   8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   9 * copies of the Software, and to permit persons to whom the Software is
  10 * furnished to do so, subject to the following conditions:
  11 *
  12 * The above copyright notice and this permission notice shall be included in
  13 * all copies or substantial portions of the Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21 * THE SOFTWARE.
  22 */
  23#include "qemu/osdep.h"
  24#include "qemu/error-report.h"
  25#include "qemu/option.h"
  26#include "qemu/cutils.h"
  27#include "qemu/units.h"
  28#include "qemu/datadir.h"
  29#include "qemu/guest-random.h"
  30#include "qapi/error.h"
  31#include "qapi/qmp/qerror.h"
  32#include "qapi/qapi-visit-common.h"
  33#include "qapi/clone-visitor.h"
  34#include "qapi/qapi-visit-machine.h"
  35#include "qapi/visitor.h"
  36#include "sysemu/qtest.h"
  37#include "sysemu/whpx.h"
  38#include "sysemu/numa.h"
  39#include "sysemu/replay.h"
  40#include "sysemu/sysemu.h"
  41#include "sysemu/cpu-timers.h"
  42#include "sysemu/xen.h"
  43#include "trace.h"
  44
  45#include "hw/i386/x86.h"
  46#include "target/i386/cpu.h"
  47#include "hw/i386/topology.h"
  48#include "hw/i386/fw_cfg.h"
  49#include "hw/intc/i8259.h"
  50#include "hw/rtc/mc146818rtc.h"
  51#include "target/i386/sev.h"
  52
  53#include "hw/acpi/cpu_hotplug.h"
  54#include "hw/irq.h"
  55#include "hw/nmi.h"
  56#include "hw/loader.h"
  57#include "multiboot.h"
  58#include "elf.h"
  59#include "standard-headers/asm-x86/bootparam.h"
  60#include CONFIG_DEVICES
  61#include "kvm/kvm_i386.h"
  62
  63/* Physical Address of PVH entry point read from kernel ELF NOTE */
  64static size_t pvh_start_addr;
  65
  66inline void init_topo_info(X86CPUTopoInfo *topo_info,
  67                           const X86MachineState *x86ms)
  68{
  69    MachineState *ms = MACHINE(x86ms);
  70
  71    topo_info->dies_per_pkg = ms->smp.dies;
  72    topo_info->cores_per_die = ms->smp.cores;
  73    topo_info->threads_per_core = ms->smp.threads;
  74}
  75
  76/*
  77 * Calculates initial APIC ID for a specific CPU index
  78 *
  79 * Currently we need to be able to calculate the APIC ID from the CPU index
  80 * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have
  81 * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of
  82 * all CPUs up to max_cpus.
  83 */
  84uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms,
  85                                    unsigned int cpu_index)
  86{
  87    X86CPUTopoInfo topo_info;
  88
  89    init_topo_info(&topo_info, x86ms);
  90
  91    return x86_apicid_from_cpu_idx(&topo_info, cpu_index);
  92}
  93
  94
  95void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp)
  96{
  97    Object *cpu = object_new(MACHINE(x86ms)->cpu_type);
  98
  99    if (!object_property_set_uint(cpu, "apic-id", apic_id, errp)) {
 100        goto out;
 101    }
 102    qdev_realize(DEVICE(cpu), NULL, errp);
 103
 104out:
 105    object_unref(cpu);
 106}
 107
 108void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
 109{
 110    int i;
 111    const CPUArchIdList *possible_cpus;
 112    MachineState *ms = MACHINE(x86ms);
 113    MachineClass *mc = MACHINE_GET_CLASS(x86ms);
 114
 115    x86_cpu_set_default_version(default_cpu_version);
 116
 117    /*
 118     * Calculates the limit to CPU APIC ID values
 119     *
 120     * Limit for the APIC ID value, so that all
 121     * CPU APIC IDs are < x86ms->apic_id_limit.
 122     *
 123     * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create().
 124     */
 125    x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
 126                                                      ms->smp.max_cpus - 1) + 1;
 127
 128    /*
 129     * Can we support APIC ID 255 or higher?
 130     *
 131     * Under Xen: yes.
 132     * With userspace emulated lapic: no
 133     * With KVM's in-kernel lapic: only if X2APIC API is enabled.
 134     */
 135    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
 136        (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
 137        error_report("current -smp configuration requires kernel "
 138                     "irqchip and X2APIC API support.");
 139        exit(EXIT_FAILURE);
 140    }
 141
 142    possible_cpus = mc->possible_cpu_arch_ids(ms);
 143    for (i = 0; i < ms->smp.cpus; i++) {
 144        x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
 145    }
 146}
 147
 148void x86_rtc_set_cpus_count(ISADevice *rtc, uint16_t cpus_count)
 149{
 150    if (cpus_count > 0xff) {
 151        /*
 152         * If the number of CPUs can't be represented in 8 bits, the
 153         * BIOS must use "FW_CFG_NB_CPUS". Set RTC field to 0 just
 154         * to make old BIOSes fail more predictably.
 155         */
 156        rtc_set_memory(rtc, 0x5f, 0);
 157    } else {
 158        rtc_set_memory(rtc, 0x5f, cpus_count - 1);
 159    }
 160}
 161
 162static int x86_apic_cmp(const void *a, const void *b)
 163{
 164   CPUArchId *apic_a = (CPUArchId *)a;
 165   CPUArchId *apic_b = (CPUArchId *)b;
 166
 167   return apic_a->arch_id - apic_b->arch_id;
 168}
 169
 170/*
 171 * returns pointer to CPUArchId descriptor that matches CPU's apic_id
 172 * in ms->possible_cpus->cpus, if ms->possible_cpus->cpus has no
 173 * entry corresponding to CPU's apic_id returns NULL.
 174 */
 175CPUArchId *x86_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
 176{
 177    CPUArchId apic_id, *found_cpu;
 178
 179    apic_id.arch_id = id;
 180    found_cpu = bsearch(&apic_id, ms->possible_cpus->cpus,
 181        ms->possible_cpus->len, sizeof(*ms->possible_cpus->cpus),
 182        x86_apic_cmp);
 183    if (found_cpu && idx) {
 184        *idx = found_cpu - ms->possible_cpus->cpus;
 185    }
 186    return found_cpu;
 187}
 188
 189void x86_cpu_plug(HotplugHandler *hotplug_dev,
 190                  DeviceState *dev, Error **errp)
 191{
 192    CPUArchId *found_cpu;
 193    Error *local_err = NULL;
 194    X86CPU *cpu = X86_CPU(dev);
 195    X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
 196
 197    if (x86ms->acpi_dev) {
 198        hotplug_handler_plug(x86ms->acpi_dev, dev, &local_err);
 199        if (local_err) {
 200            goto out;
 201        }
 202    }
 203
 204    /* increment the number of CPUs */
 205    x86ms->boot_cpus++;
 206    if (x86ms->rtc) {
 207        x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
 208    }
 209    if (x86ms->fw_cfg) {
 210        fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
 211    }
 212
 213    found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
 214    found_cpu->cpu = OBJECT(dev);
 215out:
 216    error_propagate(errp, local_err);
 217}
 218
 219void x86_cpu_unplug_request_cb(HotplugHandler *hotplug_dev,
 220                               DeviceState *dev, Error **errp)
 221{
 222    int idx = -1;
 223    X86CPU *cpu = X86_CPU(dev);
 224    X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
 225
 226    if (!x86ms->acpi_dev) {
 227        error_setg(errp, "CPU hot unplug not supported without ACPI");
 228        return;
 229    }
 230
 231    x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
 232    assert(idx != -1);
 233    if (idx == 0) {
 234        error_setg(errp, "Boot CPU is unpluggable");
 235        return;
 236    }
 237
 238    hotplug_handler_unplug_request(x86ms->acpi_dev, dev,
 239                                   errp);
 240}
 241
 242void x86_cpu_unplug_cb(HotplugHandler *hotplug_dev,
 243                       DeviceState *dev, Error **errp)
 244{
 245    CPUArchId *found_cpu;
 246    Error *local_err = NULL;
 247    X86CPU *cpu = X86_CPU(dev);
 248    X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
 249
 250    hotplug_handler_unplug(x86ms->acpi_dev, dev, &local_err);
 251    if (local_err) {
 252        goto out;
 253    }
 254
 255    found_cpu = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, NULL);
 256    found_cpu->cpu = NULL;
 257    qdev_unrealize(dev);
 258
 259    /* decrement the number of CPUs */
 260    x86ms->boot_cpus--;
 261    /* Update the number of CPUs in CMOS */
 262    x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
 263    fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
 264 out:
 265    error_propagate(errp, local_err);
 266}
 267
 268void x86_cpu_pre_plug(HotplugHandler *hotplug_dev,
 269                      DeviceState *dev, Error **errp)
 270{
 271    int idx;
 272    CPUState *cs;
 273    CPUArchId *cpu_slot;
 274    X86CPUTopoIDs topo_ids;
 275    X86CPU *cpu = X86_CPU(dev);
 276    CPUX86State *env = &cpu->env;
 277    MachineState *ms = MACHINE(hotplug_dev);
 278    X86MachineState *x86ms = X86_MACHINE(hotplug_dev);
 279    unsigned int smp_cores = ms->smp.cores;
 280    unsigned int smp_threads = ms->smp.threads;
 281    X86CPUTopoInfo topo_info;
 282
 283    if (!object_dynamic_cast(OBJECT(cpu), ms->cpu_type)) {
 284        error_setg(errp, "Invalid CPU type, expected cpu type: '%s'",
 285                   ms->cpu_type);
 286        return;
 287    }
 288
 289    if (x86ms->acpi_dev) {
 290        Error *local_err = NULL;
 291
 292        hotplug_handler_pre_plug(HOTPLUG_HANDLER(x86ms->acpi_dev), dev,
 293                                 &local_err);
 294        if (local_err) {
 295            error_propagate(errp, local_err);
 296            return;
 297        }
 298    }
 299
 300    init_topo_info(&topo_info, x86ms);
 301
 302    env->nr_dies = ms->smp.dies;
 303
 304    /*
 305     * If APIC ID is not set,
 306     * set it based on socket/die/core/thread properties.
 307     */
 308    if (cpu->apic_id == UNASSIGNED_APIC_ID) {
 309        int max_socket = (ms->smp.max_cpus - 1) /
 310                                smp_threads / smp_cores / ms->smp.dies;
 311
 312        /*
 313         * die-id was optional in QEMU 4.0 and older, so keep it optional
 314         * if there's only one die per socket.
 315         */
 316        if (cpu->die_id < 0 && ms->smp.dies == 1) {
 317            cpu->die_id = 0;
 318        }
 319
 320        if (cpu->socket_id < 0) {
 321            error_setg(errp, "CPU socket-id is not set");
 322            return;
 323        } else if (cpu->socket_id > max_socket) {
 324            error_setg(errp, "Invalid CPU socket-id: %u must be in range 0:%u",
 325                       cpu->socket_id, max_socket);
 326            return;
 327        }
 328        if (cpu->die_id < 0) {
 329            error_setg(errp, "CPU die-id is not set");
 330            return;
 331        } else if (cpu->die_id > ms->smp.dies - 1) {
 332            error_setg(errp, "Invalid CPU die-id: %u must be in range 0:%u",
 333                       cpu->die_id, ms->smp.dies - 1);
 334            return;
 335        }
 336        if (cpu->core_id < 0) {
 337            error_setg(errp, "CPU core-id is not set");
 338            return;
 339        } else if (cpu->core_id > (smp_cores - 1)) {
 340            error_setg(errp, "Invalid CPU core-id: %u must be in range 0:%u",
 341                       cpu->core_id, smp_cores - 1);
 342            return;
 343        }
 344        if (cpu->thread_id < 0) {
 345            error_setg(errp, "CPU thread-id is not set");
 346            return;
 347        } else if (cpu->thread_id > (smp_threads - 1)) {
 348            error_setg(errp, "Invalid CPU thread-id: %u must be in range 0:%u",
 349                       cpu->thread_id, smp_threads - 1);
 350            return;
 351        }
 352
 353        topo_ids.pkg_id = cpu->socket_id;
 354        topo_ids.die_id = cpu->die_id;
 355        topo_ids.core_id = cpu->core_id;
 356        topo_ids.smt_id = cpu->thread_id;
 357        cpu->apic_id = x86_apicid_from_topo_ids(&topo_info, &topo_ids);
 358    }
 359
 360    cpu_slot = x86_find_cpu_slot(MACHINE(x86ms), cpu->apic_id, &idx);
 361    if (!cpu_slot) {
 362        MachineState *ms = MACHINE(x86ms);
 363
 364        x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);
 365        error_setg(errp,
 366            "Invalid CPU [socket: %u, die: %u, core: %u, thread: %u] with"
 367            " APIC ID %" PRIu32 ", valid index range 0:%d",
 368            topo_ids.pkg_id, topo_ids.die_id, topo_ids.core_id, topo_ids.smt_id,
 369            cpu->apic_id, ms->possible_cpus->len - 1);
 370        return;
 371    }
 372
 373    if (cpu_slot->cpu) {
 374        error_setg(errp, "CPU[%d] with APIC ID %" PRIu32 " exists",
 375                   idx, cpu->apic_id);
 376        return;
 377    }
 378
 379    /* if 'address' properties socket-id/core-id/thread-id are not set, set them
 380     * so that machine_query_hotpluggable_cpus would show correct values
 381     */
 382    /* TODO: move socket_id/core_id/thread_id checks into x86_cpu_realizefn()
 383     * once -smp refactoring is complete and there will be CPU private
 384     * CPUState::nr_cores and CPUState::nr_threads fields instead of globals */
 385    x86_topo_ids_from_apicid(cpu->apic_id, &topo_info, &topo_ids);
 386    if (cpu->socket_id != -1 && cpu->socket_id != topo_ids.pkg_id) {
 387        error_setg(errp, "property socket-id: %u doesn't match set apic-id:"
 388            " 0x%x (socket-id: %u)", cpu->socket_id, cpu->apic_id,
 389            topo_ids.pkg_id);
 390        return;
 391    }
 392    cpu->socket_id = topo_ids.pkg_id;
 393
 394    if (cpu->die_id != -1 && cpu->die_id != topo_ids.die_id) {
 395        error_setg(errp, "property die-id: %u doesn't match set apic-id:"
 396            " 0x%x (die-id: %u)", cpu->die_id, cpu->apic_id, topo_ids.die_id);
 397        return;
 398    }
 399    cpu->die_id = topo_ids.die_id;
 400
 401    if (cpu->core_id != -1 && cpu->core_id != topo_ids.core_id) {
 402        error_setg(errp, "property core-id: %u doesn't match set apic-id:"
 403            " 0x%x (core-id: %u)", cpu->core_id, cpu->apic_id,
 404            topo_ids.core_id);
 405        return;
 406    }
 407    cpu->core_id = topo_ids.core_id;
 408
 409    if (cpu->thread_id != -1 && cpu->thread_id != topo_ids.smt_id) {
 410        error_setg(errp, "property thread-id: %u doesn't match set apic-id:"
 411            " 0x%x (thread-id: %u)", cpu->thread_id, cpu->apic_id,
 412            topo_ids.smt_id);
 413        return;
 414    }
 415    cpu->thread_id = topo_ids.smt_id;
 416
 417    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) &&
 418        !kvm_hv_vpindex_settable()) {
 419        error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX");
 420        return;
 421    }
 422
 423    cs = CPU(cpu);
 424    cs->cpu_index = idx;
 425
 426    numa_cpu_pre_plug(cpu_slot, dev, errp);
 427}
 428
 429CpuInstanceProperties
 430x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
 431{
 432    MachineClass *mc = MACHINE_GET_CLASS(ms);
 433    const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms);
 434
 435    assert(cpu_index < possible_cpus->len);
 436    return possible_cpus->cpus[cpu_index].props;
 437}
 438
 439int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
 440{
 441   X86CPUTopoIDs topo_ids;
 442   X86MachineState *x86ms = X86_MACHINE(ms);
 443   X86CPUTopoInfo topo_info;
 444
 445   init_topo_info(&topo_info, x86ms);
 446
 447   assert(idx < ms->possible_cpus->len);
 448   x86_topo_ids_from_apicid(ms->possible_cpus->cpus[idx].arch_id,
 449                            &topo_info, &topo_ids);
 450   return topo_ids.pkg_id % ms->numa_state->num_nodes;
 451}
 452
 453const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms)
 454{
 455    X86MachineState *x86ms = X86_MACHINE(ms);
 456    unsigned int max_cpus = ms->smp.max_cpus;
 457    X86CPUTopoInfo topo_info;
 458    int i;
 459
 460    if (ms->possible_cpus) {
 461        /*
 462         * make sure that max_cpus hasn't changed since the first use, i.e.
 463         * -smp hasn't been parsed after it
 464         */
 465        assert(ms->possible_cpus->len == max_cpus);
 466        return ms->possible_cpus;
 467    }
 468
 469    ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
 470                                  sizeof(CPUArchId) * max_cpus);
 471    ms->possible_cpus->len = max_cpus;
 472
 473    init_topo_info(&topo_info, x86ms);
 474
 475    for (i = 0; i < ms->possible_cpus->len; i++) {
 476        X86CPUTopoIDs topo_ids;
 477
 478        ms->possible_cpus->cpus[i].type = ms->cpu_type;
 479        ms->possible_cpus->cpus[i].vcpus_count = 1;
 480        ms->possible_cpus->cpus[i].arch_id =
 481            x86_cpu_apic_id_from_index(x86ms, i);
 482        x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id,
 483                                 &topo_info, &topo_ids);
 484        ms->possible_cpus->cpus[i].props.has_socket_id = true;
 485        ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id;
 486        if (ms->smp.dies > 1) {
 487            ms->possible_cpus->cpus[i].props.has_die_id = true;
 488            ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id;
 489        }
 490        ms->possible_cpus->cpus[i].props.has_core_id = true;
 491        ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id;
 492        ms->possible_cpus->cpus[i].props.has_thread_id = true;
 493        ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id;
 494    }
 495    return ms->possible_cpus;
 496}
 497
 498static void x86_nmi(NMIState *n, int cpu_index, Error **errp)
 499{
 500    /* cpu index isn't used */
 501    CPUState *cs;
 502
 503    CPU_FOREACH(cs) {
 504        X86CPU *cpu = X86_CPU(cs);
 505
 506        if (!cpu->apic_state) {
 507            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
 508        } else {
 509            apic_deliver_nmi(cpu->apic_state);
 510        }
 511    }
 512}
 513
 514static long get_file_size(FILE *f)
 515{
 516    long where, size;
 517
 518    /* XXX: on Unix systems, using fstat() probably makes more sense */
 519
 520    where = ftell(f);
 521    fseek(f, 0, SEEK_END);
 522    size = ftell(f);
 523    fseek(f, where, SEEK_SET);
 524
 525    return size;
 526}
 527
 528/* TSC handling */
 529uint64_t cpu_get_tsc(CPUX86State *env)
 530{
 531    return cpus_get_elapsed_ticks();
 532}
 533
 534/* IRQ handling */
 535static void pic_irq_request(void *opaque, int irq, int level)
 536{
 537    CPUState *cs = first_cpu;
 538    X86CPU *cpu = X86_CPU(cs);
 539
 540    trace_x86_pic_interrupt(irq, level);
 541    if (cpu->apic_state && !kvm_irqchip_in_kernel() &&
 542        !whpx_apic_in_platform()) {
 543        CPU_FOREACH(cs) {
 544            cpu = X86_CPU(cs);
 545            if (apic_accept_pic_intr(cpu->apic_state)) {
 546                apic_deliver_pic_intr(cpu->apic_state, level);
 547            }
 548        }
 549    } else {
 550        if (level) {
 551            cpu_interrupt(cs, CPU_INTERRUPT_HARD);
 552        } else {
 553            cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
 554        }
 555    }
 556}
 557
 558qemu_irq x86_allocate_cpu_irq(void)
 559{
 560    return qemu_allocate_irq(pic_irq_request, NULL, 0);
 561}
 562
 563int cpu_get_pic_interrupt(CPUX86State *env)
 564{
 565    X86CPU *cpu = env_archcpu(env);
 566    int intno;
 567
 568    if (!kvm_irqchip_in_kernel() && !whpx_apic_in_platform()) {
 569        intno = apic_get_interrupt(cpu->apic_state);
 570        if (intno >= 0) {
 571            return intno;
 572        }
 573        /* read the irq from the PIC */
 574        if (!apic_accept_pic_intr(cpu->apic_state)) {
 575            return -1;
 576        }
 577    }
 578
 579    intno = pic_read_irq(isa_pic);
 580    return intno;
 581}
 582
 583DeviceState *cpu_get_current_apic(void)
 584{
 585    if (current_cpu) {
 586        X86CPU *cpu = X86_CPU(current_cpu);
 587        return cpu->apic_state;
 588    } else {
 589        return NULL;
 590    }
 591}
 592
 593void gsi_handler(void *opaque, int n, int level)
 594{
 595    GSIState *s = opaque;
 596
 597    trace_x86_gsi_interrupt(n, level);
 598    switch (n) {
 599    case 0 ... ISA_NUM_IRQS - 1:
 600        if (s->i8259_irq[n]) {
 601            /* Under KVM, Kernel will forward to both PIC and IOAPIC */
 602            qemu_set_irq(s->i8259_irq[n], level);
 603        }
 604        /* fall through */
 605    case ISA_NUM_IRQS ... IOAPIC_NUM_PINS - 1:
 606        qemu_set_irq(s->ioapic_irq[n], level);
 607        break;
 608    case IO_APIC_SECONDARY_IRQBASE
 609        ... IO_APIC_SECONDARY_IRQBASE + IOAPIC_NUM_PINS - 1:
 610        qemu_set_irq(s->ioapic2_irq[n - IO_APIC_SECONDARY_IRQBASE], level);
 611        break;
 612    }
 613}
 614
 615void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
 616{
 617    DeviceState *dev;
 618    SysBusDevice *d;
 619    unsigned int i;
 620
 621    assert(parent_name);
 622    if (kvm_ioapic_in_kernel()) {
 623        dev = qdev_new(TYPE_KVM_IOAPIC);
 624    } else {
 625        dev = qdev_new(TYPE_IOAPIC);
 626    }
 627    object_property_add_child(object_resolve_path(parent_name, NULL),
 628                              "ioapic", OBJECT(dev));
 629    d = SYS_BUS_DEVICE(dev);
 630    sysbus_realize_and_unref(d, &error_fatal);
 631    sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
 632
 633    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
 634        gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i);
 635    }
 636}
 637
 638DeviceState *ioapic_init_secondary(GSIState *gsi_state)
 639{
 640    DeviceState *dev;
 641    SysBusDevice *d;
 642    unsigned int i;
 643
 644    dev = qdev_new(TYPE_IOAPIC);
 645    d = SYS_BUS_DEVICE(dev);
 646    sysbus_realize_and_unref(d, &error_fatal);
 647    sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS);
 648
 649    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
 650        gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i);
 651    }
 652    return dev;
 653}
 654
  /*
   * Header of a variable-length record followed by its payload.  The structure
   * is packed, so the fields sit back to back with no padding.
   *
   * NOTE(review): presumably mirrors the Linux boot protocol's
   * 'struct setup_data' (a linked list of blobs handed to the kernel) --
   * confirm against the code that fills it in.
   */
  struct setup_data {
      uint64_t next;   /* presumably the address of the next record -- confirm */
      uint32_t type;   /* record type tag */
      uint32_t len;    /* presumably the payload length in bytes -- confirm */
      uint8_t data[];  /* payload (flexible array member) */
  } __attribute__((packed));
 661
 662
 663/*
 664 * The entry point into the kernel for PVH boot is different from
 665 * the native entry point.  The PVH entry is defined by the x86/HVM
 666 * direct boot ABI and is available in an ELFNOTE in the kernel binary.
 667 *
 668 * This function is passed to load_elf() when it is called from
 669 * load_elfboot() which then additionally checks for an ELF Note of
 670 * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to
 671 * parse the PVH entry address from the ELF Note.
 672 *
 673 * Due to trickery in elf_opts.h, load_elf() is actually available as
 674 * load_elf32() or load_elf64() and this routine needs to be able
 675 * to deal with being called as 32 or 64 bit.
 676 *
 677 * The address of the PVH entry point is saved to the 'pvh_start_addr'
 678 * global variable.  (although the entry point is 32-bit, the kernel
 679 * binary can be either 32-bit or 64-bit).
 680 */
 681static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64)
 682{
 683    size_t *elf_note_data_addr;
 684
 685    /* Check if ELF Note header passed in is valid */
 686    if (arg1 == NULL) {
 687        return 0;
 688    }
 689
 690    if (is64) {
 691        struct elf64_note *nhdr64 = (struct elf64_note *)arg1;
 692        uint64_t nhdr_size64 = sizeof(struct elf64_note);
 693        uint64_t phdr_align = *(uint64_t *)arg2;
 694        uint64_t nhdr_namesz = nhdr64->n_namesz;
 695
 696        elf_note_data_addr =
 697            ((void *)nhdr64) + nhdr_size64 +
 698            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);
 699
 700        pvh_start_addr = *elf_note_data_addr;
 701    } else {
 702        struct elf32_note *nhdr32 = (struct elf32_note *)arg1;
 703        uint32_t nhdr_size32 = sizeof(struct elf32_note);
 704        uint32_t phdr_align = *(uint32_t *)arg2;
 705        uint32_t nhdr_namesz = nhdr32->n_namesz;
 706
 707        elf_note_data_addr =
 708            ((void *)nhdr32) + nhdr_size32 +
 709            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);
 710
 711        pvh_start_addr = *(uint32_t *)elf_note_data_addr;
 712    }
 713
 714    return pvh_start_addr;
 715}
 716
 717static bool load_elfboot(const char *kernel_filename,
 718                         int kernel_file_size,
 719                         uint8_t *header,
 720                         size_t pvh_xen_start_addr,
 721                         FWCfgState *fw_cfg)
 722{
 723    uint32_t flags = 0;
 724    uint32_t mh_load_addr = 0;
 725    uint32_t elf_kernel_size = 0;
 726    uint64_t elf_entry;
 727    uint64_t elf_low, elf_high;
 728    int kernel_size;
 729
 730    if (ldl_p(header) != 0x464c457f) {
 731        return false; /* no elfboot */
 732    }
 733
 734    bool elf_is64 = header[EI_CLASS] == ELFCLASS64;
 735    flags = elf_is64 ?
 736        ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags;
 737
 738    if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */
 739        error_report("elfboot unsupported flags = %x", flags);
 740        exit(1);
 741    }
 742
 743    uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY;
 744    kernel_size = load_elf(kernel_filename, read_pvh_start_addr,
 745                           NULL, &elf_note_type, &elf_entry,
 746                           &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE,
 747                           0, 0);
 748
 749    if (kernel_size < 0) {
 750        error_report("Error while loading elf kernel");
 751        exit(1);
 752    }
 753    mh_load_addr = elf_low;
 754    elf_kernel_size = elf_high - elf_low;
 755
 756    if (pvh_start_addr == 0) {
 757        error_report("Error loading uncompressed kernel without PVH ELF Note");
 758        exit(1);
 759    }
 760    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr);
 761    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr);
 762    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size);
 763
 764    return true;
 765}
 766
 767void x86_load_linux(X86MachineState *x86ms,
 768                    FWCfgState *fw_cfg,
 769                    int acpi_data_size,
 770                    bool pvh_enabled,
 771                    bool legacy_no_rng_seed)
 772{
 773    bool linuxboot_dma_enabled = X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled;
 774    uint16_t protocol;
 775    int setup_size, kernel_size, cmdline_size;
 776    int dtb_size, setup_data_offset;
 777    uint32_t initrd_max;
 778    uint8_t header[8192], *setup, *kernel;
 779    hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0, first_setup_data = 0;
 780    FILE *f;
 781    char *vmode;
 782    MachineState *machine = MACHINE(x86ms);
 783    struct setup_data *setup_data;
 784    const char *kernel_filename = machine->kernel_filename;
 785    const char *initrd_filename = machine->initrd_filename;
 786    const char *dtb_filename = machine->dtb;
 787    const char *kernel_cmdline = machine->kernel_cmdline;
 788    SevKernelLoaderContext sev_load_ctx = {};
 789    enum { RNG_SEED_LENGTH = 32 };
 790
 791    /* Align to 16 bytes as a paranoia measure */
 792    cmdline_size = (strlen(kernel_cmdline) + 16) & ~15;
 793
 794    /* load the kernel header */
 795    f = fopen(kernel_filename, "rb");
 796    if (!f) {
 797        fprintf(stderr, "qemu: could not open kernel file '%s': %s\n",
 798                kernel_filename, strerror(errno));
 799        exit(1);
 800    }
 801
 802    kernel_size = get_file_size(f);
 803    if (!kernel_size ||
 804        fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) !=
 805        MIN(ARRAY_SIZE(header), kernel_size)) {
 806        fprintf(stderr, "qemu: could not load kernel '%s': %s\n",
 807                kernel_filename, strerror(errno));
 808        exit(1);
 809    }
 810
 811    /* kernel protocol version */
 812    if (ldl_p(header + 0x202) == 0x53726448) {
 813        protocol = lduw_p(header + 0x206);
 814    } else {
 815        /*
 816         * This could be a multiboot kernel. If it is, let's stop treating it
 817         * like a Linux kernel.
 818         * Note: some multiboot images could be in the ELF format (the same of
 819         * PVH), so we try multiboot first since we check the multiboot magic
 820         * header before to load it.
 821         */
 822        if (load_multiboot(x86ms, fw_cfg, f, kernel_filename, initrd_filename,
 823                           kernel_cmdline, kernel_size, header)) {
 824            return;
 825        }
 826        /*
 827         * Check if the file is an uncompressed kernel file (ELF) and load it,
 828         * saving the PVH entry point used by the x86/HVM direct boot ABI.
 829         * If load_elfboot() is successful, populate the fw_cfg info.
 830         */
 831        if (pvh_enabled &&
 832            load_elfboot(kernel_filename, kernel_size,
 833                         header, pvh_start_addr, fw_cfg)) {
 834            fclose(f);
 835
 836            fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
 837                strlen(kernel_cmdline) + 1);
 838            fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
 839
 840            fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header));
 841            fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA,
 842                             header, sizeof(header));
 843
 844            /* load initrd */
 845            if (initrd_filename) {
 846                GMappedFile *mapped_file;
 847                gsize initrd_size;
 848                gchar *initrd_data;
 849                GError *gerr = NULL;
 850
 851                mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
 852                if (!mapped_file) {
 853                    fprintf(stderr, "qemu: error reading initrd %s: %s\n",
 854                            initrd_filename, gerr->message);
 855                    exit(1);
 856                }
 857                x86ms->initrd_mapped_file = mapped_file;
 858
 859                initrd_data = g_mapped_file_get_contents(mapped_file);
 860                initrd_size = g_mapped_file_get_length(mapped_file);
 861                initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
 862                if (initrd_size >= initrd_max) {
 863                    fprintf(stderr, "qemu: initrd is too large, cannot support."
 864                            "(max: %"PRIu32", need %"PRId64")\n",
 865                            initrd_max, (uint64_t)initrd_size);
 866                    exit(1);
 867                }
 868
 869                initrd_addr = (initrd_max - initrd_size) & ~4095;
 870
 871                fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
 872                fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
 873                fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data,
 874                                 initrd_size);
 875            }
 876
 877            option_rom[nb_option_roms].bootindex = 0;
 878            option_rom[nb_option_roms].name = "pvh.bin";
 879            nb_option_roms++;
 880
 881            return;
 882        }
 883        protocol = 0;
 884    }
 885
 886    if (protocol < 0x200 || !(header[0x211] & 0x01)) {
 887        /* Low kernel */
 888        real_addr    = 0x90000;
 889        cmdline_addr = 0x9a000 - cmdline_size;
 890        prot_addr    = 0x10000;
 891    } else if (protocol < 0x202) {
 892        /* High but ancient kernel */
 893        real_addr    = 0x90000;
 894        cmdline_addr = 0x9a000 - cmdline_size;
 895        prot_addr    = 0x100000;
 896    } else {
 897        /* High and recent kernel */
 898        real_addr    = 0x10000;
 899        cmdline_addr = 0x20000;
 900        prot_addr    = 0x100000;
 901    }
 902
 903    /* highest address for loading the initrd */
 904    if (protocol >= 0x20c &&
 905        lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
 906        /*
 907         * Linux has supported initrd up to 4 GB for a very long time (2007,
 908         * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013),
 909         * though it only sets initrd_max to 2 GB to "work around bootloader
 910         * bugs". Luckily, QEMU firmware(which does something like bootloader)
 911         * has supported this.
 912         *
 913         * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
 914         * be loaded into any address.
 915         *
 916         * In addition, initrd_max is uint32_t simply because QEMU doesn't
 917         * support the 64-bit boot protocol (specifically the ext_ramdisk_image
 918         * field).
 919         *
 920         * Therefore here just limit initrd_max to UINT32_MAX simply as well.
 921         */
 922        initrd_max = UINT32_MAX;
 923    } else if (protocol >= 0x203) {
 924        initrd_max = ldl_p(header + 0x22c);
 925    } else {
 926        initrd_max = 0x37ffffff;
 927    }
 928
 929    if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) {
 930        initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
 931    }
 932
 933    fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr);
 934    fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1);
 935    fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
 936    sev_load_ctx.cmdline_data = (char *)kernel_cmdline;
 937    sev_load_ctx.cmdline_size = strlen(kernel_cmdline) + 1;
 938
 939    if (protocol >= 0x202) {
 940        stl_p(header + 0x228, cmdline_addr);
 941    } else {
 942        stw_p(header + 0x20, 0xA33F);
 943        stw_p(header + 0x22, cmdline_addr - real_addr);
 944    }
 945
 946    /* handle vga= parameter */
 947    vmode = strstr(kernel_cmdline, "vga=");
 948    if (vmode) {
 949        unsigned int video_mode;
 950        const char *end;
 951        int ret;
 952        /* skip "vga=" */
 953        vmode += 4;
 954        if (!strncmp(vmode, "normal", 6)) {
 955            video_mode = 0xffff;
 956        } else if (!strncmp(vmode, "ext", 3)) {
 957            video_mode = 0xfffe;
 958        } else if (!strncmp(vmode, "ask", 3)) {
 959            video_mode = 0xfffd;
 960        } else {
 961            ret = qemu_strtoui(vmode, &end, 0, &video_mode);
 962            if (ret != 0 || (*end && *end != ' ')) {
 963                fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n");
 964                exit(1);
 965            }
 966        }
 967        stw_p(header + 0x1fa, video_mode);
 968    }
 969
 970    /* loader type */
 971    /*
 972     * High nybble = B reserved for QEMU; low nybble is revision number.
 973     * If this code is substantially changed, you may want to consider
 974     * incrementing the revision.
 975     */
 976    if (protocol >= 0x200) {
 977        header[0x210] = 0xB0;
 978    }
 979    /* heap */
 980    if (protocol >= 0x201) {
 981        header[0x211] |= 0x80; /* CAN_USE_HEAP */
 982        stw_p(header + 0x224, cmdline_addr - real_addr - 0x200);
 983    }
 984
 985    /* load initrd */
 986    if (initrd_filename) {
 987        GMappedFile *mapped_file;
 988        gsize initrd_size;
 989        gchar *initrd_data;
 990        GError *gerr = NULL;
 991
 992        if (protocol < 0x200) {
 993            fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n");
 994            exit(1);
 995        }
 996
 997        mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
 998        if (!mapped_file) {
 999            fprintf(stderr, "qemu: error reading initrd %s: %s\n",
1000                    initrd_filename, gerr->message);
1001            exit(1);
1002        }
1003        x86ms->initrd_mapped_file = mapped_file;
1004
1005        initrd_data = g_mapped_file_get_contents(mapped_file);
1006        initrd_size = g_mapped_file_get_length(mapped_file);
1007        if (initrd_size >= initrd_max) {
1008            fprintf(stderr, "qemu: initrd is too large, cannot support."
1009                    "(max: %"PRIu32", need %"PRId64")\n",
1010                    initrd_max, (uint64_t)initrd_size);
1011            exit(1);
1012        }
1013
1014        initrd_addr = (initrd_max - initrd_size) & ~4095;
1015
1016        fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
1017        fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
1018        fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size);
1019        sev_load_ctx.initrd_data = initrd_data;
1020        sev_load_ctx.initrd_size = initrd_size;
1021
1022        stl_p(header + 0x218, initrd_addr);
1023        stl_p(header + 0x21c, initrd_size);
1024    }
1025
1026    /* load kernel and setup */
1027    setup_size = header[0x1f1];
1028    if (setup_size == 0) {
1029        setup_size = 4;
1030    }
1031    setup_size = (setup_size + 1) * 512;
1032    if (setup_size > kernel_size) {
1033        fprintf(stderr, "qemu: invalid kernel header\n");
1034        exit(1);
1035    }
1036    kernel_size -= setup_size;
1037
1038    setup  = g_malloc(setup_size);
1039    kernel = g_malloc(kernel_size);
1040    fseek(f, 0, SEEK_SET);
1041    if (fread(setup, 1, setup_size, f) != setup_size) {
1042        fprintf(stderr, "fread() failed\n");
1043        exit(1);
1044    }
1045    if (fread(kernel, 1, kernel_size, f) != kernel_size) {
1046        fprintf(stderr, "fread() failed\n");
1047        exit(1);
1048    }
1049    fclose(f);
1050
1051    /* append dtb to kernel */
1052    if (dtb_filename) {
1053        if (protocol < 0x209) {
1054            fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n");
1055            exit(1);
1056        }
1057
1058        dtb_size = get_image_size(dtb_filename);
1059        if (dtb_size <= 0) {
1060            fprintf(stderr, "qemu: error reading dtb %s: %s\n",
1061                    dtb_filename, strerror(errno));
1062            exit(1);
1063        }
1064
1065        setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16);
1066        kernel_size = setup_data_offset + sizeof(struct setup_data) + dtb_size;
1067        kernel = g_realloc(kernel, kernel_size);
1068
1069
1070        setup_data = (struct setup_data *)(kernel + setup_data_offset);
1071        setup_data->next = cpu_to_le64(first_setup_data);
1072        first_setup_data = prot_addr + setup_data_offset;
1073        setup_data->type = cpu_to_le32(SETUP_DTB);
1074        setup_data->len = cpu_to_le32(dtb_size);
1075
1076        load_image_size(dtb_filename, setup_data->data, dtb_size);
1077    }
1078
1079    if (!legacy_no_rng_seed) {
1080        setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16);
1081        kernel_size = setup_data_offset + sizeof(struct setup_data) + RNG_SEED_LENGTH;
1082        kernel = g_realloc(kernel, kernel_size);
1083        setup_data = (struct setup_data *)(kernel + setup_data_offset);
1084        setup_data->next = cpu_to_le64(first_setup_data);
1085        first_setup_data = prot_addr + setup_data_offset;
1086        setup_data->type = cpu_to_le32(SETUP_RNG_SEED);
1087        setup_data->len = cpu_to_le32(RNG_SEED_LENGTH);
1088        qemu_guest_getrandom_nofail(setup_data->data, RNG_SEED_LENGTH);
1089    }
1090
1091    /* Offset 0x250 is a pointer to the first setup_data link. */
1092    stq_p(header + 0x250, first_setup_data);
1093
1094    /*
1095     * If we're starting an encrypted VM, it will be OVMF based, which uses the
1096     * efi stub for booting and doesn't require any values to be placed in the
1097     * kernel header.  We therefore don't update the header so the hash of the
1098     * kernel on the other side of the fw_cfg interface matches the hash of the
1099     * file the user passed in.
1100     */
1101    if (!sev_enabled()) {
1102        memcpy(setup, header, MIN(sizeof(header), setup_size));
1103    }
1104
1105    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr);
1106    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size);
1107    fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size);
1108    sev_load_ctx.kernel_data = (char *)kernel;
1109    sev_load_ctx.kernel_size = kernel_size;
1110
1111    fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr);
1112    fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size);
1113    fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size);
1114    sev_load_ctx.setup_data = (char *)setup;
1115    sev_load_ctx.setup_size = setup_size;
1116
1117    if (sev_enabled()) {
1118        sev_add_kernel_loader_hashes(&sev_load_ctx, &error_fatal);
1119    }
1120
1121    option_rom[nb_option_roms].bootindex = 0;
1122    option_rom[nb_option_roms].name = "linuxboot.bin";
1123    if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) {
1124        option_rom[nb_option_roms].name = "linuxboot_dma.bin";
1125    }
1126    nb_option_roms++;
1127}
1128
1129void x86_bios_rom_init(MachineState *ms, const char *default_firmware,
1130                       MemoryRegion *rom_memory, bool isapc_ram_fw)
1131{
1132    const char *bios_name;
1133    char *filename;
1134    MemoryRegion *bios, *isa_bios;
1135    int bios_size, isa_bios_size;
1136    ssize_t ret;
1137
1138    /* BIOS load */
1139    bios_name = ms->firmware ?: default_firmware;
1140    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
1141    if (filename) {
1142        bios_size = get_image_size(filename);
1143    } else {
1144        bios_size = -1;
1145    }
1146    if (bios_size <= 0 ||
1147        (bios_size % 65536) != 0) {
1148        goto bios_error;
1149    }
1150    bios = g_malloc(sizeof(*bios));
1151    memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal);
1152    if (sev_enabled()) {
1153        /*
1154         * The concept of a "reset" simply doesn't exist for
1155         * confidential computing guests, we have to destroy and
1156         * re-launch them instead.  So there is no need to register
1157         * the firmware as rom to properly re-initialize on reset.
1158         * Just go for a straight file load instead.
1159         */
1160        void *ptr = memory_region_get_ram_ptr(bios);
1161        load_image_size(filename, ptr, bios_size);
1162        x86_firmware_configure(ptr, bios_size);
1163    } else {
1164        if (!isapc_ram_fw) {
1165            memory_region_set_readonly(bios, true);
1166        }
1167        ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
1168        if (ret != 0) {
1169            goto bios_error;
1170        }
1171    }
1172    g_free(filename);
1173
1174    /* map the last 128KB of the BIOS in ISA space */
1175    isa_bios_size = MIN(bios_size, 128 * KiB);
1176    isa_bios = g_malloc(sizeof(*isa_bios));
1177    memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
1178                             bios_size - isa_bios_size, isa_bios_size);
1179    memory_region_add_subregion_overlap(rom_memory,
1180                                        0x100000 - isa_bios_size,
1181                                        isa_bios,
1182                                        1);
1183    if (!isapc_ram_fw) {
1184        memory_region_set_readonly(isa_bios, true);
1185    }
1186
1187    /* map all the bios at the top of memory */
1188    memory_region_add_subregion(rom_memory,
1189                                (uint32_t)(-bios_size),
1190                                bios);
1191    return;
1192
1193bios_error:
1194    fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
1195    exit(1);
1196}
1197
1198bool x86_machine_is_smm_enabled(const X86MachineState *x86ms)
1199{
1200    bool smm_available = false;
1201
1202    if (x86ms->smm == ON_OFF_AUTO_OFF) {
1203        return false;
1204    }
1205
1206    if (tcg_enabled() || qtest_enabled()) {
1207        smm_available = true;
1208    } else if (kvm_enabled()) {
1209        smm_available = kvm_has_smm();
1210    }
1211
1212    if (smm_available) {
1213        return true;
1214    }
1215
1216    if (x86ms->smm == ON_OFF_AUTO_ON) {
1217        error_report("System Management Mode not supported by this hypervisor.");
1218        exit(1);
1219    }
1220    return false;
1221}
1222
1223static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name,
1224                               void *opaque, Error **errp)
1225{
1226    X86MachineState *x86ms = X86_MACHINE(obj);
1227    OnOffAuto smm = x86ms->smm;
1228
1229    visit_type_OnOffAuto(v, name, &smm, errp);
1230}
1231
1232static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name,
1233                               void *opaque, Error **errp)
1234{
1235    X86MachineState *x86ms = X86_MACHINE(obj);
1236
1237    visit_type_OnOffAuto(v, name, &x86ms->smm, errp);
1238}
1239
1240bool x86_machine_is_acpi_enabled(const X86MachineState *x86ms)
1241{
1242    if (x86ms->acpi == ON_OFF_AUTO_OFF) {
1243        return false;
1244    }
1245    return true;
1246}
1247
1248static void x86_machine_get_acpi(Object *obj, Visitor *v, const char *name,
1249                                 void *opaque, Error **errp)
1250{
1251    X86MachineState *x86ms = X86_MACHINE(obj);
1252    OnOffAuto acpi = x86ms->acpi;
1253
1254    visit_type_OnOffAuto(v, name, &acpi, errp);
1255}
1256
1257static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name,
1258                                 void *opaque, Error **errp)
1259{
1260    X86MachineState *x86ms = X86_MACHINE(obj);
1261
1262    visit_type_OnOffAuto(v, name, &x86ms->acpi, errp);
1263}
1264
1265static void x86_machine_get_pit(Object *obj, Visitor *v, const char *name,
1266                                    void *opaque, Error **errp)
1267{
1268    X86MachineState *x86ms = X86_MACHINE(obj);
1269    OnOffAuto pit = x86ms->pit;
1270
1271    visit_type_OnOffAuto(v, name, &pit, errp);
1272}
1273
1274static void x86_machine_set_pit(Object *obj, Visitor *v, const char *name,
1275                                    void *opaque, Error **errp)
1276{
1277    X86MachineState *x86ms = X86_MACHINE(obj);;
1278
1279    visit_type_OnOffAuto(v, name, &x86ms->pit, errp);
1280}
1281
1282static void x86_machine_get_pic(Object *obj, Visitor *v, const char *name,
1283                                void *opaque, Error **errp)
1284{
1285    X86MachineState *x86ms = X86_MACHINE(obj);
1286    OnOffAuto pic = x86ms->pic;
1287
1288    visit_type_OnOffAuto(v, name, &pic, errp);
1289}
1290
1291static void x86_machine_set_pic(Object *obj, Visitor *v, const char *name,
1292                                void *opaque, Error **errp)
1293{
1294    X86MachineState *x86ms = X86_MACHINE(obj);
1295
1296    visit_type_OnOffAuto(v, name, &x86ms->pic, errp);
1297}
1298
1299static char *x86_machine_get_oem_id(Object *obj, Error **errp)
1300{
1301    X86MachineState *x86ms = X86_MACHINE(obj);
1302
1303    return g_strdup(x86ms->oem_id);
1304}
1305
1306static void x86_machine_set_oem_id(Object *obj, const char *value, Error **errp)
1307{
1308    X86MachineState *x86ms = X86_MACHINE(obj);
1309    size_t len = strlen(value);
1310
1311    if (len > 6) {
1312        error_setg(errp,
1313                   "User specified "X86_MACHINE_OEM_ID" value is bigger than "
1314                   "6 bytes in size");
1315        return;
1316    }
1317
1318    strncpy(x86ms->oem_id, value, 6);
1319}
1320
1321static char *x86_machine_get_oem_table_id(Object *obj, Error **errp)
1322{
1323    X86MachineState *x86ms = X86_MACHINE(obj);
1324
1325    return g_strdup(x86ms->oem_table_id);
1326}
1327
1328static void x86_machine_set_oem_table_id(Object *obj, const char *value,
1329                                         Error **errp)
1330{
1331    X86MachineState *x86ms = X86_MACHINE(obj);
1332    size_t len = strlen(value);
1333
1334    if (len > 8) {
1335        error_setg(errp,
1336                   "User specified "X86_MACHINE_OEM_TABLE_ID
1337                   " value is bigger than "
1338                   "8 bytes in size");
1339        return;
1340    }
1341    strncpy(x86ms->oem_table_id, value, 8);
1342}
1343
1344static void x86_machine_get_bus_lock_ratelimit(Object *obj, Visitor *v,
1345                                const char *name, void *opaque, Error **errp)
1346{
1347    X86MachineState *x86ms = X86_MACHINE(obj);
1348    uint64_t bus_lock_ratelimit = x86ms->bus_lock_ratelimit;
1349
1350    visit_type_uint64(v, name, &bus_lock_ratelimit, errp);
1351}
1352
1353static void x86_machine_set_bus_lock_ratelimit(Object *obj, Visitor *v,
1354                               const char *name, void *opaque, Error **errp)
1355{
1356    X86MachineState *x86ms = X86_MACHINE(obj);
1357
1358    visit_type_uint64(v, name, &x86ms->bus_lock_ratelimit, errp);
1359}
1360
1361static void machine_get_sgx_epc(Object *obj, Visitor *v, const char *name,
1362                                void *opaque, Error **errp)
1363{
1364    X86MachineState *x86ms = X86_MACHINE(obj);
1365    SgxEPCList *list = x86ms->sgx_epc_list;
1366
1367    visit_type_SgxEPCList(v, name, &list, errp);
1368}
1369
1370static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name,
1371                                void *opaque, Error **errp)
1372{
1373    X86MachineState *x86ms = X86_MACHINE(obj);
1374    SgxEPCList *list;
1375
1376    list = x86ms->sgx_epc_list;
1377    visit_type_SgxEPCList(v, name, &x86ms->sgx_epc_list, errp);
1378
1379    qapi_free_SgxEPCList(list);
1380}
1381
1382static void x86_machine_initfn(Object *obj)
1383{
1384    X86MachineState *x86ms = X86_MACHINE(obj);
1385
1386    x86ms->smm = ON_OFF_AUTO_AUTO;
1387    x86ms->acpi = ON_OFF_AUTO_AUTO;
1388    x86ms->pit = ON_OFF_AUTO_AUTO;
1389    x86ms->pic = ON_OFF_AUTO_AUTO;
1390    x86ms->pci_irq_mask = ACPI_BUILD_PCI_IRQS;
1391    x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6);
1392    x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8);
1393    x86ms->bus_lock_ratelimit = 0;
1394    x86ms->above_4g_mem_start = 4 * GiB;
1395}
1396
1397static void x86_machine_class_init(ObjectClass *oc, void *data)
1398{
1399    MachineClass *mc = MACHINE_CLASS(oc);
1400    X86MachineClass *x86mc = X86_MACHINE_CLASS(oc);
1401    NMIClass *nc = NMI_CLASS(oc);
1402
1403    mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
1404    mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
1405    mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
1406    x86mc->save_tsc_khz = true;
1407    x86mc->fwcfg_dma_enabled = true;
1408    nc->nmi_monitor_handler = x86_nmi;
1409
1410    object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto",
1411        x86_machine_get_smm, x86_machine_set_smm,
1412        NULL, NULL);
1413    object_class_property_set_description(oc, X86_MACHINE_SMM,
1414        "Enable SMM");
1415
1416    object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto",
1417        x86_machine_get_acpi, x86_machine_set_acpi,
1418        NULL, NULL);
1419    object_class_property_set_description(oc, X86_MACHINE_ACPI,
1420        "Enable ACPI");
1421
1422    object_class_property_add(oc, X86_MACHINE_PIT, "OnOffAuto",
1423                              x86_machine_get_pit,
1424                              x86_machine_set_pit,
1425                              NULL, NULL);
1426    object_class_property_set_description(oc, X86_MACHINE_PIT,
1427        "Enable i8254 PIT");
1428
1429    object_class_property_add(oc, X86_MACHINE_PIC, "OnOffAuto",
1430                              x86_machine_get_pic,
1431                              x86_machine_set_pic,
1432                              NULL, NULL);
1433    object_class_property_set_description(oc, X86_MACHINE_PIC,
1434        "Enable i8259 PIC");
1435
1436    object_class_property_add_str(oc, X86_MACHINE_OEM_ID,
1437                                  x86_machine_get_oem_id,
1438                                  x86_machine_set_oem_id);
1439    object_class_property_set_description(oc, X86_MACHINE_OEM_ID,
1440                                          "Override the default value of field OEMID "
1441                                          "in ACPI table header."
1442                                          "The string may be up to 6 bytes in size");
1443
1444
1445    object_class_property_add_str(oc, X86_MACHINE_OEM_TABLE_ID,
1446                                  x86_machine_get_oem_table_id,
1447                                  x86_machine_set_oem_table_id);
1448    object_class_property_set_description(oc, X86_MACHINE_OEM_TABLE_ID,
1449                                          "Override the default value of field OEM Table ID "
1450                                          "in ACPI table header."
1451                                          "The string may be up to 8 bytes in size");
1452
1453    object_class_property_add(oc, X86_MACHINE_BUS_LOCK_RATELIMIT, "uint64_t",
1454                                x86_machine_get_bus_lock_ratelimit,
1455                                x86_machine_set_bus_lock_ratelimit, NULL, NULL);
1456    object_class_property_set_description(oc, X86_MACHINE_BUS_LOCK_RATELIMIT,
1457            "Set the ratelimit for the bus locks acquired in VMs");
1458
1459    object_class_property_add(oc, "sgx-epc", "SgxEPC",
1460        machine_get_sgx_epc, machine_set_sgx_epc,
1461        NULL, NULL);
1462    object_class_property_set_description(oc, "sgx-epc",
1463        "SGX EPC device");
1464}
1465
1466static const TypeInfo x86_machine_info = {
1467    .name = TYPE_X86_MACHINE,
1468    .parent = TYPE_MACHINE,
1469    .abstract = true,
1470    .instance_size = sizeof(X86MachineState),
1471    .instance_init = x86_machine_initfn,
1472    .class_size = sizeof(X86MachineClass),
1473    .class_init = x86_machine_class_init,
1474    .interfaces = (InterfaceInfo[]) {
1475         { TYPE_NMI },
1476         { }
1477    },
1478};
1479
/* Register the abstract x86 machine type described by x86_machine_info. */
static void x86_machine_register_types(void)
{
    type_register_static(&x86_machine_info);
}

/* Hook the registration function into QEMU's type initialization. */
type_init(x86_machine_register_types)
1486