qemu/hw/i386/x86.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2003-2004 Fabrice Bellard
   3 * Copyright (c) 2019 Red Hat, Inc.
   4 *
   5 * Permission is hereby granted, free of charge, to any person obtaining a copy
   6 * of this software and associated documentation files (the "Software"), to deal
   7 * in the Software without restriction, including without limitation the rights
   8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   9 * copies of the Software, and to permit persons to whom the Software is
  10 * furnished to do so, subject to the following conditions:
  11 *
  12 * The above copyright notice and this permission notice shall be included in
  13 * all copies or substantial portions of the Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21 * THE SOFTWARE.
  22 */
  23#include "qemu/osdep.h"
  24#include "qemu/error-report.h"
  25#include "qemu/option.h"
  26#include "qemu/cutils.h"
  27#include "qemu/units.h"
  28#include "qemu-common.h"
  29#include "qapi/error.h"
  30#include "qapi/qmp/qerror.h"
  31#include "qapi/qapi-visit-common.h"
  32#include "qapi/visitor.h"
  33#include "sysemu/qtest.h"
  34#include "sysemu/numa.h"
  35#include "sysemu/replay.h"
  36#include "sysemu/sysemu.h"
  37#include "trace.h"
  38
  39#include "hw/i386/x86.h"
  40#include "target/i386/cpu.h"
  41#include "hw/i386/topology.h"
  42#include "hw/i386/fw_cfg.h"
  43#include "hw/intc/i8259.h"
  44
  45#include "hw/acpi/cpu_hotplug.h"
  46#include "hw/irq.h"
  47#include "hw/nmi.h"
  48#include "hw/loader.h"
  49#include "multiboot.h"
  50#include "elf.h"
  51#include "standard-headers/asm-x86/bootparam.h"
  52#include "config-devices.h"
  53#include "kvm_i386.h"
  54
  55#define BIOS_FILENAME "bios.bin"
  56
  57/* Physical Address of PVH entry point read from kernel ELF NOTE */
  58static size_t pvh_start_addr;
  59
  60inline void init_topo_info(X86CPUTopoInfo *topo_info,
  61                           const X86MachineState *x86ms)
  62{
  63    MachineState *ms = MACHINE(x86ms);
  64
  65    topo_info->nodes_per_pkg = ms->numa_state->num_nodes / ms->smp.sockets;
  66    topo_info->dies_per_pkg = x86ms->smp_dies;
  67    topo_info->cores_per_die = ms->smp.cores;
  68    topo_info->threads_per_core = ms->smp.threads;
  69}
  70
  71/*
  72 * Set up with the new EPYC topology handlers
  73 *
  74 * AMD uses different apic id encoding for EPYC based cpus. Override
  75 * the default topo handlers with EPYC encoding handlers.
  76 */
  77static void x86_set_epyc_topo_handlers(MachineState *machine)
  78{
  79    X86MachineState *x86ms = X86_MACHINE(machine);
  80
  81    x86ms->apicid_from_cpu_idx = x86_apicid_from_cpu_idx_epyc;
  82    x86ms->topo_ids_from_apicid = x86_topo_ids_from_apicid_epyc;
  83    x86ms->apicid_from_topo_ids = x86_apicid_from_topo_ids_epyc;
  84    x86ms->apicid_pkg_offset = apicid_pkg_offset_epyc;
  85}
  86
  87/*
  88 * Calculates initial APIC ID for a specific CPU index
  89 *
  90 * Currently we need to be able to calculate the APIC ID from the CPU index
  91 * alone (without requiring a CPU object), as the QEMU<->Seabios interfaces have
  92 * no concept of "CPU index", and the NUMA tables on fw_cfg need the APIC ID of
  93 * all CPUs up to max_cpus.
  94 */
  95uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms,
  96                                    unsigned int cpu_index)
  97{
  98    X86MachineClass *x86mc = X86_MACHINE_GET_CLASS(x86ms);
  99    X86CPUTopoInfo topo_info;
 100    uint32_t correct_id;
 101    static bool warned;
 102
 103    init_topo_info(&topo_info, x86ms);
 104
 105    correct_id = x86ms->apicid_from_cpu_idx(&topo_info, cpu_index);
 106    if (x86mc->compat_apic_id_mode) {
 107        if (cpu_index != correct_id && !warned && !qtest_enabled()) {
 108            error_report("APIC IDs set in compatibility mode, "
 109                         "CPU topology won't match the configuration");
 110            warned = true;
 111        }
 112        return cpu_index;
 113    } else {
 114        return correct_id;
 115    }
 116}
 117
 118
 119void x86_cpu_new(X86MachineState *x86ms, int64_t apic_id, Error **errp)
 120{
 121    Object *cpu = NULL;
 122    Error *local_err = NULL;
 123
 124    cpu = object_new(MACHINE(x86ms)->cpu_type);
 125
 126    object_property_set_uint(cpu, apic_id, "apic-id", &local_err);
 127    object_property_set_bool(cpu, true, "realized", &local_err);
 128
 129    object_unref(cpu);
 130    error_propagate(errp, local_err);
 131}
 132
 133void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
 134{
 135    int i;
 136    const CPUArchIdList *possible_cpus;
 137    MachineState *ms = MACHINE(x86ms);
 138    MachineClass *mc = MACHINE_GET_CLASS(x86ms);
 139
 140    /* Check for apicid encoding */
 141    if (cpu_x86_use_epyc_apic_id_encoding(ms->cpu_type)) {
 142        x86_set_epyc_topo_handlers(ms);
 143    }
 144
 145    x86_cpu_set_default_version(default_cpu_version);
 146
 147    /*
 148     * Calculates the limit to CPU APIC ID values
 149     *
 150     * Limit for the APIC ID value, so that all
 151     * CPU APIC IDs are < x86ms->apic_id_limit.
 152     *
 153     * This is used for FW_CFG_MAX_CPUS. See comments on fw_cfg_arch_create().
 154     */
 155    x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
 156                                                      ms->smp.max_cpus - 1) + 1;
 157    possible_cpus = mc->possible_cpu_arch_ids(ms);
 158
 159    for (i = 0; i < ms->possible_cpus->len; i++) {
 160        ms->possible_cpus->cpus[i].arch_id =
 161            x86_cpu_apic_id_from_index(x86ms, i);
 162    }
 163
 164    for (i = 0; i < ms->smp.cpus; i++) {
 165        x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
 166    }
 167}
 168
 169CpuInstanceProperties
 170x86_cpu_index_to_props(MachineState *ms, unsigned cpu_index)
 171{
 172    MachineClass *mc = MACHINE_GET_CLASS(ms);
 173    const CPUArchIdList *possible_cpus = mc->possible_cpu_arch_ids(ms);
 174
 175    assert(cpu_index < possible_cpus->len);
 176    return possible_cpus->cpus[cpu_index].props;
 177}
 178
 179int64_t x86_get_default_cpu_node_id(const MachineState *ms, int idx)
 180{
 181   X86CPUTopoIDs topo_ids;
 182   X86MachineState *x86ms = X86_MACHINE(ms);
 183   X86CPUTopoInfo topo_info;
 184
 185   init_topo_info(&topo_info, x86ms);
 186
 187   assert(idx < ms->possible_cpus->len);
 188   x86_topo_ids_from_idx(&topo_info, idx, &topo_ids);
 189   return topo_ids.pkg_id % ms->numa_state->num_nodes;
 190}
 191
 192const CPUArchIdList *x86_possible_cpu_arch_ids(MachineState *ms)
 193{
 194    X86MachineState *x86ms = X86_MACHINE(ms);
 195    unsigned int max_cpus = ms->smp.max_cpus;
 196    X86CPUTopoInfo topo_info;
 197    int i;
 198
 199    if (ms->possible_cpus) {
 200        /*
 201         * make sure that max_cpus hasn't changed since the first use, i.e.
 202         * -smp hasn't been parsed after it
 203         */
 204        assert(ms->possible_cpus->len == max_cpus);
 205        return ms->possible_cpus;
 206    }
 207
 208    ms->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
 209                                  sizeof(CPUArchId) * max_cpus);
 210    ms->possible_cpus->len = max_cpus;
 211
 212    init_topo_info(&topo_info, x86ms);
 213
 214    for (i = 0; i < ms->possible_cpus->len; i++) {
 215        X86CPUTopoIDs topo_ids;
 216
 217        ms->possible_cpus->cpus[i].type = ms->cpu_type;
 218        ms->possible_cpus->cpus[i].vcpus_count = 1;
 219        x86_topo_ids_from_idx(&topo_info, i, &topo_ids);
 220        ms->possible_cpus->cpus[i].props.has_socket_id = true;
 221        ms->possible_cpus->cpus[i].props.socket_id = topo_ids.pkg_id;
 222        if (x86ms->smp_dies > 1) {
 223            ms->possible_cpus->cpus[i].props.has_die_id = true;
 224            ms->possible_cpus->cpus[i].props.die_id = topo_ids.die_id;
 225        }
 226        ms->possible_cpus->cpus[i].props.has_core_id = true;
 227        ms->possible_cpus->cpus[i].props.core_id = topo_ids.core_id;
 228        ms->possible_cpus->cpus[i].props.has_thread_id = true;
 229        ms->possible_cpus->cpus[i].props.thread_id = topo_ids.smt_id;
 230    }
 231    return ms->possible_cpus;
 232}
 233
 234static void x86_nmi(NMIState *n, int cpu_index, Error **errp)
 235{
 236    /* cpu index isn't used */
 237    CPUState *cs;
 238
 239    CPU_FOREACH(cs) {
 240        X86CPU *cpu = X86_CPU(cs);
 241
 242        if (!cpu->apic_state) {
 243            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
 244        } else {
 245            apic_deliver_nmi(cpu->apic_state);
 246        }
 247    }
 248}
 249
 250static long get_file_size(FILE *f)
 251{
 252    long where, size;
 253
 254    /* XXX: on Unix systems, using fstat() probably makes more sense */
 255
 256    where = ftell(f);
 257    fseek(f, 0, SEEK_END);
 258    size = ftell(f);
 259    fseek(f, where, SEEK_SET);
 260
 261    return size;
 262}
 263
 264/* TSC handling */
 265uint64_t cpu_get_tsc(CPUX86State *env)
 266{
 267    return cpu_get_ticks();
 268}
 269
 270/* IRQ handling */
 271static void pic_irq_request(void *opaque, int irq, int level)
 272{
 273    CPUState *cs = first_cpu;
 274    X86CPU *cpu = X86_CPU(cs);
 275
 276    trace_x86_pic_interrupt(irq, level);
 277    if (cpu->apic_state && !kvm_irqchip_in_kernel()) {
 278        CPU_FOREACH(cs) {
 279            cpu = X86_CPU(cs);
 280            if (apic_accept_pic_intr(cpu->apic_state)) {
 281                apic_deliver_pic_intr(cpu->apic_state, level);
 282            }
 283        }
 284    } else {
 285        if (level) {
 286            cpu_interrupt(cs, CPU_INTERRUPT_HARD);
 287        } else {
 288            cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
 289        }
 290    }
 291}
 292
 293qemu_irq x86_allocate_cpu_irq(void)
 294{
 295    return qemu_allocate_irq(pic_irq_request, NULL, 0);
 296}
 297
 298int cpu_get_pic_interrupt(CPUX86State *env)
 299{
 300    X86CPU *cpu = env_archcpu(env);
 301    int intno;
 302
 303    if (!kvm_irqchip_in_kernel()) {
 304        intno = apic_get_interrupt(cpu->apic_state);
 305        if (intno >= 0) {
 306            return intno;
 307        }
 308        /* read the irq from the PIC */
 309        if (!apic_accept_pic_intr(cpu->apic_state)) {
 310            return -1;
 311        }
 312    }
 313
 314    intno = pic_read_irq(isa_pic);
 315    return intno;
 316}
 317
 318DeviceState *cpu_get_current_apic(void)
 319{
 320    if (current_cpu) {
 321        X86CPU *cpu = X86_CPU(current_cpu);
 322        return cpu->apic_state;
 323    } else {
 324        return NULL;
 325    }
 326}
 327
 328void gsi_handler(void *opaque, int n, int level)
 329{
 330    GSIState *s = opaque;
 331
 332    trace_x86_gsi_interrupt(n, level);
 333    if (n < ISA_NUM_IRQS) {
 334        /* Under KVM, Kernel will forward to both PIC and IOAPIC */
 335        qemu_set_irq(s->i8259_irq[n], level);
 336    }
 337    qemu_set_irq(s->ioapic_irq[n], level);
 338}
 339
 340void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
 341{
 342    DeviceState *dev;
 343    SysBusDevice *d;
 344    unsigned int i;
 345
 346    assert(parent_name);
 347    if (kvm_ioapic_in_kernel()) {
 348        dev = qdev_create(NULL, TYPE_KVM_IOAPIC);
 349    } else {
 350        dev = qdev_create(NULL, TYPE_IOAPIC);
 351    }
 352    object_property_add_child(object_resolve_path(parent_name, NULL),
 353                              "ioapic", OBJECT(dev), NULL);
 354    qdev_init_nofail(dev);
 355    d = SYS_BUS_DEVICE(dev);
 356    sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
 357
 358    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
 359        gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i);
 360    }
 361}
 362
/*
 * Header that x86_load_linux() places in front of the device tree blob it
 * appends to the kernel image.  Declared packed so that no padding is
 * inserted between the fields.
 *
 * NOTE(review): presumably mirrors the Linux boot protocol's
 * struct setup_data -- confirm against the bootparam.h definition.
 */
struct setup_data {
    uint64_t next;      /* x86_load_linux() writes 0 here */
    uint32_t type;      /* stored little-endian; SETUP_DTB for a dtb */
    uint32_t len;       /* stored little-endian; byte length of data[] */
    uint8_t data[];     /* payload, @len bytes */
} __attribute__((packed));
 369
 370
 371/*
 372 * The entry point into the kernel for PVH boot is different from
 373 * the native entry point.  The PVH entry is defined by the x86/HVM
 374 * direct boot ABI and is available in an ELFNOTE in the kernel binary.
 375 *
 376 * This function is passed to load_elf() when it is called from
 377 * load_elfboot() which then additionally checks for an ELF Note of
 378 * type XEN_ELFNOTE_PHYS32_ENTRY and passes it to this function to
 379 * parse the PVH entry address from the ELF Note.
 380 *
 381 * Due to trickery in elf_opts.h, load_elf() is actually available as
 382 * load_elf32() or load_elf64() and this routine needs to be able
 383 * to deal with being called as 32 or 64 bit.
 384 *
 385 * The address of the PVH entry point is saved to the 'pvh_start_addr'
 386 * global variable.  (although the entry point is 32-bit, the kernel
 387 * binary can be either 32-bit or 64-bit).
 388 */
 389static uint64_t read_pvh_start_addr(void *arg1, void *arg2, bool is64)
 390{
 391    size_t *elf_note_data_addr;
 392
 393    /* Check if ELF Note header passed in is valid */
 394    if (arg1 == NULL) {
 395        return 0;
 396    }
 397
 398    if (is64) {
 399        struct elf64_note *nhdr64 = (struct elf64_note *)arg1;
 400        uint64_t nhdr_size64 = sizeof(struct elf64_note);
 401        uint64_t phdr_align = *(uint64_t *)arg2;
 402        uint64_t nhdr_namesz = nhdr64->n_namesz;
 403
 404        elf_note_data_addr =
 405            ((void *)nhdr64) + nhdr_size64 +
 406            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);
 407    } else {
 408        struct elf32_note *nhdr32 = (struct elf32_note *)arg1;
 409        uint32_t nhdr_size32 = sizeof(struct elf32_note);
 410        uint32_t phdr_align = *(uint32_t *)arg2;
 411        uint32_t nhdr_namesz = nhdr32->n_namesz;
 412
 413        elf_note_data_addr =
 414            ((void *)nhdr32) + nhdr_size32 +
 415            QEMU_ALIGN_UP(nhdr_namesz, phdr_align);
 416    }
 417
 418    pvh_start_addr = *elf_note_data_addr;
 419
 420    return pvh_start_addr;
 421}
 422
 423static bool load_elfboot(const char *kernel_filename,
 424                         int kernel_file_size,
 425                         uint8_t *header,
 426                         size_t pvh_xen_start_addr,
 427                         FWCfgState *fw_cfg)
 428{
 429    uint32_t flags = 0;
 430    uint32_t mh_load_addr = 0;
 431    uint32_t elf_kernel_size = 0;
 432    uint64_t elf_entry;
 433    uint64_t elf_low, elf_high;
 434    int kernel_size;
 435
 436    if (ldl_p(header) != 0x464c457f) {
 437        return false; /* no elfboot */
 438    }
 439
 440    bool elf_is64 = header[EI_CLASS] == ELFCLASS64;
 441    flags = elf_is64 ?
 442        ((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags;
 443
 444    if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */
 445        error_report("elfboot unsupported flags = %x", flags);
 446        exit(1);
 447    }
 448
 449    uint64_t elf_note_type = XEN_ELFNOTE_PHYS32_ENTRY;
 450    kernel_size = load_elf(kernel_filename, read_pvh_start_addr,
 451                           NULL, &elf_note_type, &elf_entry,
 452                           &elf_low, &elf_high, NULL, 0, I386_ELF_MACHINE,
 453                           0, 0);
 454
 455    if (kernel_size < 0) {
 456        error_report("Error while loading elf kernel");
 457        exit(1);
 458    }
 459    mh_load_addr = elf_low;
 460    elf_kernel_size = elf_high - elf_low;
 461
 462    if (pvh_start_addr == 0) {
 463        error_report("Error loading uncompressed kernel without PVH ELF Note");
 464        exit(1);
 465    }
 466    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_start_addr);
 467    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr);
 468    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size);
 469
 470    return true;
 471}
 472
 473void x86_load_linux(X86MachineState *x86ms,
 474                    FWCfgState *fw_cfg,
 475                    int acpi_data_size,
 476                    bool pvh_enabled,
 477                    bool linuxboot_dma_enabled)
 478{
 479    uint16_t protocol;
 480    int setup_size, kernel_size, cmdline_size;
 481    int dtb_size, setup_data_offset;
 482    uint32_t initrd_max;
 483    uint8_t header[8192], *setup, *kernel;
 484    hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0;
 485    FILE *f;
 486    char *vmode;
 487    MachineState *machine = MACHINE(x86ms);
 488    struct setup_data *setup_data;
 489    const char *kernel_filename = machine->kernel_filename;
 490    const char *initrd_filename = machine->initrd_filename;
 491    const char *dtb_filename = machine->dtb;
 492    const char *kernel_cmdline = machine->kernel_cmdline;
 493
 494    /* Align to 16 bytes as a paranoia measure */
 495    cmdline_size = (strlen(kernel_cmdline) + 16) & ~15;
 496
 497    /* load the kernel header */
 498    f = fopen(kernel_filename, "rb");
 499    if (!f) {
 500        fprintf(stderr, "qemu: could not open kernel file '%s': %s\n",
 501                kernel_filename, strerror(errno));
 502        exit(1);
 503    }
 504
 505    kernel_size = get_file_size(f);
 506    if (!kernel_size ||
 507        fread(header, 1, MIN(ARRAY_SIZE(header), kernel_size), f) !=
 508        MIN(ARRAY_SIZE(header), kernel_size)) {
 509        fprintf(stderr, "qemu: could not load kernel '%s': %s\n",
 510                kernel_filename, strerror(errno));
 511        exit(1);
 512    }
 513
 514    /* kernel protocol version */
 515    if (ldl_p(header + 0x202) == 0x53726448) {
 516        protocol = lduw_p(header + 0x206);
 517    } else {
 518        /*
 519         * This could be a multiboot kernel. If it is, let's stop treating it
 520         * like a Linux kernel.
 521         * Note: some multiboot images could be in the ELF format (the same of
 522         * PVH), so we try multiboot first since we check the multiboot magic
 523         * header before to load it.
 524         */
 525        if (load_multiboot(fw_cfg, f, kernel_filename, initrd_filename,
 526                           kernel_cmdline, kernel_size, header)) {
 527            return;
 528        }
 529        /*
 530         * Check if the file is an uncompressed kernel file (ELF) and load it,
 531         * saving the PVH entry point used by the x86/HVM direct boot ABI.
 532         * If load_elfboot() is successful, populate the fw_cfg info.
 533         */
 534        if (pvh_enabled &&
 535            load_elfboot(kernel_filename, kernel_size,
 536                         header, pvh_start_addr, fw_cfg)) {
 537            fclose(f);
 538
 539            fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
 540                strlen(kernel_cmdline) + 1);
 541            fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
 542
 543            fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header));
 544            fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA,
 545                             header, sizeof(header));
 546
 547            /* load initrd */
 548            if (initrd_filename) {
 549                GMappedFile *mapped_file;
 550                gsize initrd_size;
 551                gchar *initrd_data;
 552                GError *gerr = NULL;
 553
 554                mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
 555                if (!mapped_file) {
 556                    fprintf(stderr, "qemu: error reading initrd %s: %s\n",
 557                            initrd_filename, gerr->message);
 558                    exit(1);
 559                }
 560                x86ms->initrd_mapped_file = mapped_file;
 561
 562                initrd_data = g_mapped_file_get_contents(mapped_file);
 563                initrd_size = g_mapped_file_get_length(mapped_file);
 564                initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
 565                if (initrd_size >= initrd_max) {
 566                    fprintf(stderr, "qemu: initrd is too large, cannot support."
 567                            "(max: %"PRIu32", need %"PRId64")\n",
 568                            initrd_max, (uint64_t)initrd_size);
 569                    exit(1);
 570                }
 571
 572                initrd_addr = (initrd_max - initrd_size) & ~4095;
 573
 574                fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
 575                fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
 576                fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data,
 577                                 initrd_size);
 578            }
 579
 580            option_rom[nb_option_roms].bootindex = 0;
 581            option_rom[nb_option_roms].name = "pvh.bin";
 582            nb_option_roms++;
 583
 584            return;
 585        }
 586        protocol = 0;
 587    }
 588
 589    if (protocol < 0x200 || !(header[0x211] & 0x01)) {
 590        /* Low kernel */
 591        real_addr    = 0x90000;
 592        cmdline_addr = 0x9a000 - cmdline_size;
 593        prot_addr    = 0x10000;
 594    } else if (protocol < 0x202) {
 595        /* High but ancient kernel */
 596        real_addr    = 0x90000;
 597        cmdline_addr = 0x9a000 - cmdline_size;
 598        prot_addr    = 0x100000;
 599    } else {
 600        /* High and recent kernel */
 601        real_addr    = 0x10000;
 602        cmdline_addr = 0x20000;
 603        prot_addr    = 0x100000;
 604    }
 605
 606    /* highest address for loading the initrd */
 607    if (protocol >= 0x20c &&
 608        lduw_p(header + 0x236) & XLF_CAN_BE_LOADED_ABOVE_4G) {
 609        /*
 610         * Linux has supported initrd up to 4 GB for a very long time (2007,
 611         * long before XLF_CAN_BE_LOADED_ABOVE_4G which was added in 2013),
 612         * though it only sets initrd_max to 2 GB to "work around bootloader
 613         * bugs". Luckily, QEMU firmware(which does something like bootloader)
 614         * has supported this.
 615         *
 616         * It's believed that if XLF_CAN_BE_LOADED_ABOVE_4G is set, initrd can
 617         * be loaded into any address.
 618         *
 619         * In addition, initrd_max is uint32_t simply because QEMU doesn't
 620         * support the 64-bit boot protocol (specifically the ext_ramdisk_image
 621         * field).
 622         *
 623         * Therefore here just limit initrd_max to UINT32_MAX simply as well.
 624         */
 625        initrd_max = UINT32_MAX;
 626    } else if (protocol >= 0x203) {
 627        initrd_max = ldl_p(header + 0x22c);
 628    } else {
 629        initrd_max = 0x37ffffff;
 630    }
 631
 632    if (initrd_max >= x86ms->below_4g_mem_size - acpi_data_size) {
 633        initrd_max = x86ms->below_4g_mem_size - acpi_data_size - 1;
 634    }
 635
 636    fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_ADDR, cmdline_addr);
 637    fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE, strlen(kernel_cmdline) + 1);
 638    fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
 639
 640    if (protocol >= 0x202) {
 641        stl_p(header + 0x228, cmdline_addr);
 642    } else {
 643        stw_p(header + 0x20, 0xA33F);
 644        stw_p(header + 0x22, cmdline_addr - real_addr);
 645    }
 646
 647    /* handle vga= parameter */
 648    vmode = strstr(kernel_cmdline, "vga=");
 649    if (vmode) {
 650        unsigned int video_mode;
 651        const char *end;
 652        int ret;
 653        /* skip "vga=" */
 654        vmode += 4;
 655        if (!strncmp(vmode, "normal", 6)) {
 656            video_mode = 0xffff;
 657        } else if (!strncmp(vmode, "ext", 3)) {
 658            video_mode = 0xfffe;
 659        } else if (!strncmp(vmode, "ask", 3)) {
 660            video_mode = 0xfffd;
 661        } else {
 662            ret = qemu_strtoui(vmode, &end, 0, &video_mode);
 663            if (ret != 0 || (*end && *end != ' ')) {
 664                fprintf(stderr, "qemu: invalid 'vga=' kernel parameter.\n");
 665                exit(1);
 666            }
 667        }
 668        stw_p(header + 0x1fa, video_mode);
 669    }
 670
 671    /* loader type */
 672    /*
 673     * High nybble = B reserved for QEMU; low nybble is revision number.
 674     * If this code is substantially changed, you may want to consider
 675     * incrementing the revision.
 676     */
 677    if (protocol >= 0x200) {
 678        header[0x210] = 0xB0;
 679    }
 680    /* heap */
 681    if (protocol >= 0x201) {
 682        header[0x211] |= 0x80; /* CAN_USE_HEAP */
 683        stw_p(header + 0x224, cmdline_addr - real_addr - 0x200);
 684    }
 685
 686    /* load initrd */
 687    if (initrd_filename) {
 688        GMappedFile *mapped_file;
 689        gsize initrd_size;
 690        gchar *initrd_data;
 691        GError *gerr = NULL;
 692
 693        if (protocol < 0x200) {
 694            fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n");
 695            exit(1);
 696        }
 697
 698        mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
 699        if (!mapped_file) {
 700            fprintf(stderr, "qemu: error reading initrd %s: %s\n",
 701                    initrd_filename, gerr->message);
 702            exit(1);
 703        }
 704        x86ms->initrd_mapped_file = mapped_file;
 705
 706        initrd_data = g_mapped_file_get_contents(mapped_file);
 707        initrd_size = g_mapped_file_get_length(mapped_file);
 708        if (initrd_size >= initrd_max) {
 709            fprintf(stderr, "qemu: initrd is too large, cannot support."
 710                    "(max: %"PRIu32", need %"PRId64")\n",
 711                    initrd_max, (uint64_t)initrd_size);
 712            exit(1);
 713        }
 714
 715        initrd_addr = (initrd_max - initrd_size) & ~4095;
 716
 717        fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
 718        fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
 719        fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size);
 720
 721        stl_p(header + 0x218, initrd_addr);
 722        stl_p(header + 0x21c, initrd_size);
 723    }
 724
 725    /* load kernel and setup */
 726    setup_size = header[0x1f1];
 727    if (setup_size == 0) {
 728        setup_size = 4;
 729    }
 730    setup_size = (setup_size + 1) * 512;
 731    if (setup_size > kernel_size) {
 732        fprintf(stderr, "qemu: invalid kernel header\n");
 733        exit(1);
 734    }
 735    kernel_size -= setup_size;
 736
 737    setup  = g_malloc(setup_size);
 738    kernel = g_malloc(kernel_size);
 739    fseek(f, 0, SEEK_SET);
 740    if (fread(setup, 1, setup_size, f) != setup_size) {
 741        fprintf(stderr, "fread() failed\n");
 742        exit(1);
 743    }
 744    if (fread(kernel, 1, kernel_size, f) != kernel_size) {
 745        fprintf(stderr, "fread() failed\n");
 746        exit(1);
 747    }
 748    fclose(f);
 749
 750    /* append dtb to kernel */
 751    if (dtb_filename) {
 752        if (protocol < 0x209) {
 753            fprintf(stderr, "qemu: Linux kernel too old to load a dtb\n");
 754            exit(1);
 755        }
 756
 757        dtb_size = get_image_size(dtb_filename);
 758        if (dtb_size <= 0) {
 759            fprintf(stderr, "qemu: error reading dtb %s: %s\n",
 760                    dtb_filename, strerror(errno));
 761            exit(1);
 762        }
 763
 764        setup_data_offset = QEMU_ALIGN_UP(kernel_size, 16);
 765        kernel_size = setup_data_offset + sizeof(struct setup_data) + dtb_size;
 766        kernel = g_realloc(kernel, kernel_size);
 767
 768        stq_p(header + 0x250, prot_addr + setup_data_offset);
 769
 770        setup_data = (struct setup_data *)(kernel + setup_data_offset);
 771        setup_data->next = 0;
 772        setup_data->type = cpu_to_le32(SETUP_DTB);
 773        setup_data->len = cpu_to_le32(dtb_size);
 774
 775        load_image_size(dtb_filename, setup_data->data, dtb_size);
 776    }
 777
 778    memcpy(setup, header, MIN(sizeof(header), setup_size));
 779
 780    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, prot_addr);
 781    fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, kernel_size);
 782    fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, kernel, kernel_size);
 783
 784    fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_ADDR, real_addr);
 785    fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, setup_size);
 786    fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA, setup, setup_size);
 787
 788    option_rom[nb_option_roms].bootindex = 0;
 789    option_rom[nb_option_roms].name = "linuxboot.bin";
 790    if (linuxboot_dma_enabled && fw_cfg_dma_enabled(fw_cfg)) {
 791        option_rom[nb_option_roms].name = "linuxboot_dma.bin";
 792    }
 793    nb_option_roms++;
 794}
 795
 796void x86_bios_rom_init(MemoryRegion *rom_memory, bool isapc_ram_fw)
 797{
 798    char *filename;
 799    MemoryRegion *bios, *isa_bios;
 800    int bios_size, isa_bios_size;
 801    int ret;
 802
 803    /* BIOS load */
 804    if (bios_name == NULL) {
 805        bios_name = BIOS_FILENAME;
 806    }
 807    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
 808    if (filename) {
 809        bios_size = get_image_size(filename);
 810    } else {
 811        bios_size = -1;
 812    }
 813    if (bios_size <= 0 ||
 814        (bios_size % 65536) != 0) {
 815        goto bios_error;
 816    }
 817    bios = g_malloc(sizeof(*bios));
 818    memory_region_init_ram(bios, NULL, "pc.bios", bios_size, &error_fatal);
 819    if (!isapc_ram_fw) {
 820        memory_region_set_readonly(bios, true);
 821    }
 822    ret = rom_add_file_fixed(bios_name, (uint32_t)(-bios_size), -1);
 823    if (ret != 0) {
 824    bios_error:
 825        fprintf(stderr, "qemu: could not load PC BIOS '%s'\n", bios_name);
 826        exit(1);
 827    }
 828    g_free(filename);
 829
 830    /* map the last 128KB of the BIOS in ISA space */
 831    isa_bios_size = MIN(bios_size, 128 * KiB);
 832    isa_bios = g_malloc(sizeof(*isa_bios));
 833    memory_region_init_alias(isa_bios, NULL, "isa-bios", bios,
 834                             bios_size - isa_bios_size, isa_bios_size);
 835    memory_region_add_subregion_overlap(rom_memory,
 836                                        0x100000 - isa_bios_size,
 837                                        isa_bios,
 838                                        1);
 839    if (!isapc_ram_fw) {
 840        memory_region_set_readonly(isa_bios, true);
 841    }
 842
 843    /* map all the bios at the top of memory */
 844    memory_region_add_subregion(rom_memory,
 845                                (uint32_t)(-bios_size),
 846                                bios);
 847}
 848
 849static void x86_machine_get_max_ram_below_4g(Object *obj, Visitor *v,
 850                                             const char *name, void *opaque,
 851                                             Error **errp)
 852{
 853    X86MachineState *x86ms = X86_MACHINE(obj);
 854    uint64_t value = x86ms->max_ram_below_4g;
 855
 856    visit_type_size(v, name, &value, errp);
 857}
 858
 859static void x86_machine_set_max_ram_below_4g(Object *obj, Visitor *v,
 860                                             const char *name, void *opaque,
 861                                             Error **errp)
 862{
 863    X86MachineState *x86ms = X86_MACHINE(obj);
 864    Error *error = NULL;
 865    uint64_t value;
 866
 867    visit_type_size(v, name, &value, &error);
 868    if (error) {
 869        error_propagate(errp, error);
 870        return;
 871    }
 872    if (value > 4 * GiB) {
 873        error_setg(&error,
 874                   "Machine option 'max-ram-below-4g=%"PRIu64
 875                   "' expects size less than or equal to 4G", value);
 876        error_propagate(errp, error);
 877        return;
 878    }
 879
 880    if (value < 1 * MiB) {
 881        warn_report("Only %" PRIu64 " bytes of RAM below the 4GiB boundary,"
 882                    "BIOS may not work with less than 1MiB", value);
 883    }
 884
 885    x86ms->max_ram_below_4g = value;
 886}
 887
 888bool x86_machine_is_smm_enabled(X86MachineState *x86ms)
 889{
 890    bool smm_available = false;
 891
 892    if (x86ms->smm == ON_OFF_AUTO_OFF) {
 893        return false;
 894    }
 895
 896    if (tcg_enabled() || qtest_enabled()) {
 897        smm_available = true;
 898    } else if (kvm_enabled()) {
 899        smm_available = kvm_has_smm();
 900    }
 901
 902    if (smm_available) {
 903        return true;
 904    }
 905
 906    if (x86ms->smm == ON_OFF_AUTO_ON) {
 907        error_report("System Management Mode not supported by this hypervisor.");
 908        exit(1);
 909    }
 910    return false;
 911}
 912
 913static void x86_machine_get_smm(Object *obj, Visitor *v, const char *name,
 914                               void *opaque, Error **errp)
 915{
 916    X86MachineState *x86ms = X86_MACHINE(obj);
 917    OnOffAuto smm = x86ms->smm;
 918
 919    visit_type_OnOffAuto(v, name, &smm, errp);
 920}
 921
 922static void x86_machine_set_smm(Object *obj, Visitor *v, const char *name,
 923                               void *opaque, Error **errp)
 924{
 925    X86MachineState *x86ms = X86_MACHINE(obj);
 926
 927    visit_type_OnOffAuto(v, name, &x86ms->smm, errp);
 928}
 929
 930bool x86_machine_is_acpi_enabled(X86MachineState *x86ms)
 931{
 932    if (x86ms->acpi == ON_OFF_AUTO_OFF) {
 933        return false;
 934    }
 935    return true;
 936}
 937
 938static void x86_machine_get_acpi(Object *obj, Visitor *v, const char *name,
 939                                 void *opaque, Error **errp)
 940{
 941    X86MachineState *x86ms = X86_MACHINE(obj);
 942    OnOffAuto acpi = x86ms->acpi;
 943
 944    visit_type_OnOffAuto(v, name, &acpi, errp);
 945}
 946
 947static void x86_machine_set_acpi(Object *obj, Visitor *v, const char *name,
 948                                 void *opaque, Error **errp)
 949{
 950    X86MachineState *x86ms = X86_MACHINE(obj);
 951
 952    visit_type_OnOffAuto(v, name, &x86ms->acpi, errp);
 953}
 954
 955static void x86_machine_initfn(Object *obj)
 956{
 957    X86MachineState *x86ms = X86_MACHINE(obj);
 958
 959    x86ms->smm = ON_OFF_AUTO_AUTO;
 960    x86ms->acpi = ON_OFF_AUTO_AUTO;
 961    x86ms->max_ram_below_4g = 0; /* use default */
 962    x86ms->smp_dies = 1;
 963
 964    x86ms->apicid_from_cpu_idx = x86_apicid_from_cpu_idx;
 965    x86ms->topo_ids_from_apicid = x86_topo_ids_from_apicid;
 966    x86ms->apicid_from_topo_ids = x86_apicid_from_topo_ids;
 967    x86ms->apicid_pkg_offset = apicid_pkg_offset;
 968}
 969
 970static void x86_machine_class_init(ObjectClass *oc, void *data)
 971{
 972    MachineClass *mc = MACHINE_CLASS(oc);
 973    X86MachineClass *x86mc = X86_MACHINE_CLASS(oc);
 974    NMIClass *nc = NMI_CLASS(oc);
 975
 976    mc->cpu_index_to_instance_props = x86_cpu_index_to_props;
 977    mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
 978    mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
 979    x86mc->compat_apic_id_mode = false;
 980    x86mc->save_tsc_khz = true;
 981    nc->nmi_monitor_handler = x86_nmi;
 982
 983    object_class_property_add(oc, X86_MACHINE_MAX_RAM_BELOW_4G, "size",
 984        x86_machine_get_max_ram_below_4g, x86_machine_set_max_ram_below_4g,
 985        NULL, NULL, &error_abort);
 986    object_class_property_set_description(oc, X86_MACHINE_MAX_RAM_BELOW_4G,
 987        "Maximum ram below the 4G boundary (32bit boundary)", &error_abort);
 988
 989    object_class_property_add(oc, X86_MACHINE_SMM, "OnOffAuto",
 990        x86_machine_get_smm, x86_machine_set_smm,
 991        NULL, NULL, &error_abort);
 992    object_class_property_set_description(oc, X86_MACHINE_SMM,
 993        "Enable SMM", &error_abort);
 994
 995    object_class_property_add(oc, X86_MACHINE_ACPI, "OnOffAuto",
 996        x86_machine_get_acpi, x86_machine_set_acpi,
 997        NULL, NULL, &error_abort);
 998    object_class_property_set_description(oc, X86_MACHINE_ACPI,
 999        "Enable ACPI", &error_abort);
1000}
1001
1002static const TypeInfo x86_machine_info = {
1003    .name = TYPE_X86_MACHINE,
1004    .parent = TYPE_MACHINE,
1005    .abstract = true,
1006    .instance_size = sizeof(X86MachineState),
1007    .instance_init = x86_machine_initfn,
1008    .class_size = sizeof(X86MachineClass),
1009    .class_init = x86_machine_class_init,
1010    .interfaces = (InterfaceInfo[]) {
1011         { TYPE_NMI },
1012         { }
1013    },
1014};
1015
1016static void x86_machine_register_types(void)
1017{
1018    type_register_static(&x86_machine_info);
1019}
1020
1021type_init(x86_machine_register_types)
1022