qemu/hw/ppc/spapr.c
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/datadir.h"
#include "qemu/memalign.h"
#include "qapi/error.h"
#include "qapi/qapi-events-machine.h"
#include "qapi/qapi-events-qdev.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/hostmem.h"
#include "sysemu/numa.h"
#include "sysemu/qtest.h"
#include "sysemu/reset.h"
#include "sysemu/runstate.h"
#include "qemu/log.h"
#include "hw/fw-path-provider.h"
#include "elf.h"
#include "net/net.h"
#include "sysemu/device_tree.h"
#include "sysemu/cpus.h"
#include "sysemu/hw_accel.h"
#include "kvm_ppc.h"
#include "migration/misc.h"
#include "migration/qemu-file-types.h"
#include "migration/global_state.h"
#include "migration/register.h"
#include "migration/blocker.h"
#include "mmu-hash64.h"
#include "mmu-book3s-v3.h"
#include "cpu-models.h"
#include "hw/core/cpu.h"

#include "hw/ppc/ppc.h"
#include "hw/loader.h"

#include "hw/ppc/fdt.h"
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/qdev-properties.h"
#include "hw/pci-host/spapr.h"
#include "hw/pci/msi.h"

#include "hw/pci/pci.h"
#include "hw/scsi/scsi.h"
#include "hw/virtio/virtio-scsi.h"
#include "hw/virtio/vhost-scsi-common.h"

#include "exec/ram_addr.h"
#include "hw/usb.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/nmi.h"
#include "hw/intc/intc.h"

#include "hw/ppc/spapr_cpu_core.h"
#include "hw/mem/memory-device.h"
#include "hw/ppc/spapr_tpm_proxy.h"
#include "hw/ppc/spapr_nvdimm.h"
#include "hw/ppc/spapr_numa.h"
#include "hw/ppc/pef.h"

#include "monitor/monitor.h"

#include <libfdt.h>

/* SLOF memory layout:
 *
 * The SLOF raw image is loaded at 0, copies its romfs right below the
 * flat device-tree, then positions SLOF itself 31M below that
 *
 * So we set FW_OVERHEAD to 40MB, which should account for all of that
 * and more
 *
 * We load our kernel at 4M, leaving space for the SLOF initial image
 */
#define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
#define FW_FILE_NAME_VOF        "vof.bin"
#define FW_OVERHEAD             0x2800000
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE
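
/*
 * A worked example of the layout above: FW_MAX_SIZE = 0x400000 puts the
 * kernel load address at 4 MiB, leaving 0..4 MiB for the raw SLOF image,
 * while FW_OVERHEAD = 0x2800000 (40 MiB) budgets for SLOF itself plus
 * the romfs copy it makes below the flat device-tree.
 */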

#define MIN_RMA_SLOF            (128 * MiB)

#define PHANDLE_INTC            0x00001111

/* These two functions implement the VCPU id numbering: one to compute them
 * all and one to identify thread 0 of a VCORE. Any change to the first one
 * is likely to have an impact on the second one, so let's keep them close.
 */
static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
{
    MachineState *ms = MACHINE(spapr);
    unsigned int smp_threads = ms->smp.threads;

    assert(spapr->vsmt);
    return
        (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
}
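
/*
 * For instance, with smp_threads = 4 and vsmt = 8, guest cpu_index 0..3
 * (the first core) maps to VCPU ids 0..3, while cpu_index 4..7 (the
 * second core) maps to 8..11: thread 0 of each virtual core lands on a
 * multiple of vsmt, which is what the check below relies on.
 */
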
static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
                                      PowerPCCPU *cpu)
{
    assert(spapr->vsmt);
    return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
}

static bool pre_2_10_vmstate_dummy_icp_needed(void *opaque)
{
    /* Dummy entries correspond to unused ICPState objects in older QEMUs,
     * and newer QEMUs don't even have them. In both cases, we don't want
     * to send anything on the wire.
     */
    return false;
}

static const VMStateDescription pre_2_10_vmstate_dummy_icp = {
    .name = "icp/server",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = pre_2_10_vmstate_dummy_icp_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UNUSED(4), /* uint32_t xirr */
        VMSTATE_UNUSED(1), /* uint8_t pending_priority */
        VMSTATE_UNUSED(1), /* uint8_t mfrr */
        VMSTATE_END_OF_LIST()
    },
};

static void pre_2_10_vmstate_register_dummy_icp(int i)
{
    vmstate_register(NULL, i, &pre_2_10_vmstate_dummy_icp,
                     (void *)(uintptr_t) i);
}

static void pre_2_10_vmstate_unregister_dummy_icp(int i)
{
    vmstate_unregister(NULL, &pre_2_10_vmstate_dummy_icp,
                       (void *)(uintptr_t) i);
}

int spapr_max_server_number(SpaprMachineState *spapr)
{
    MachineState *ms = MACHINE(spapr);

    assert(spapr->vsmt);
    return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads);
}
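
/*
 * Example: max_cpus = 16, threads = 4, vsmt = 8 gives
 * DIV_ROUND_UP(16 * 8, 4) = 32 -- four virtual cores whose thread-0
 * interrupt servers sit at ids 0, 8, 16 and 24.
 */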

static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
                                  int smt_threads)
{
    int i, ret = 0;
    uint32_t servers_prop[smt_threads];
    uint32_t gservers_prop[smt_threads * 2];
    int index = spapr_get_vcpu_id(cpu);

    if (cpu->compat_pvr) {
        ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
        if (ret < 0) {
            return ret;
        }
    }

    /* Build interrupt servers and gservers properties */
    for (i = 0; i < smt_threads; i++) {
        servers_prop[i] = cpu_to_be32(index + i);
        /* Hack, direct the group queues back to cpu 0 */
        gservers_prop[i*2] = cpu_to_be32(index + i);
        gservers_prop[i*2 + 1] = 0;
    }
    ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
                      servers_prop, sizeof(servers_prop));
    if (ret < 0) {
        return ret;
    }
    ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
                      gservers_prop, sizeof(gservers_prop));

    return ret;
}
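
/*
 * For a thread-0 VCPU with id 8 and smt_threads = 2, this emits
 * "ibm,ppc-interrupt-server#s" = <8 9> and
 * "ibm,ppc-interrupt-gserver#s" = <8 0 9 0>: each server is paired
 * with gserver 0, per the hack noted above.
 */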

static void spapr_dt_pa_features(SpaprMachineState *spapr,
                                 PowerPCCPU *cpu,
                                 void *fdt, int offset)
{
    uint8_t pa_features_206[] = { 6, 0,
        0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
    uint8_t pa_features_207[] = { 24, 0,
        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
        0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
    uint8_t pa_features_300[] = { 66, 0,
        /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
        /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, SSO, 5: LE|CFAR|EB|LSQ */
        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /* 0 - 5 */
        /* 6: DS207 */
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
        /* 16: Vector */
        0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
        /* 18: Vec. Scalar, 20: Vec. XOR, 22: HTM */
        0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
        /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
        /* 30: MMR, 32: LE atomic, 34: EBB + ext EBB */
        0x80, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
        /* 36: SPR SO, 38: Copy/Paste, 40: Radix MMU */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 36 - 41 */
        /* 42: PM, 44: PC RA, 46: SC vec'd */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
        /* 48: SIMD, 50: QP BFP, 52: String */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
        /* 54: DecFP, 56: DecI, 58: SHA */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
        /* 60: NM atomic, 62: RNG */
        0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
    };
    uint8_t *pa_features = NULL;
    size_t pa_size;

    if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
        pa_features = pa_features_206;
        pa_size = sizeof(pa_features_206);
    }
    if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
        pa_features = pa_features_207;
        pa_size = sizeof(pa_features_207);
    }
    if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
        pa_features = pa_features_300;
        pa_size = sizeof(pa_features_300);
    }
    if (!pa_features) {
        return;
    }

    if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
        /*
         * Note: we keep CI large pages off by default because a 64K capable
         * guest provisioned with large pages might otherwise try to map a qemu
         * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
         * even if that qemu runs on a 4k host.
         * We add this bit back here if we are confident this is not an issue.
         */
        pa_features[3] |= 0x20;
    }
    if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
        pa_features[24] |= 0x80;    /* Transactional memory support */
    }
    if (spapr->cas_pre_isa3_guest && pa_size > 40) {
        /* Workaround for broken kernels that attempt (guest) radix
         * mode when they can't handle it, if they see the radix bit set
         * in pa-features. So hide it from them. */
        pa_features[40 + 2] &= ~0x80; /* Radix MMU */
    }

    _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
}
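
/*
 * Indexing note: each pa_features array starts with a two-byte header
 * (payload length and descriptor number), so PAPR attribute byte N of
 * the payload lives at array index N + 2.  That is why the radix bit
 * ("40: Radix MMU" in the comments above) is cleared via
 * pa_features[40 + 2], while the HTM bit ("22: HTM") is set via
 * pa_features[24].
 */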

static hwaddr spapr_node0_size(MachineState *machine)
{
    if (machine->numa_state->num_nodes) {
        int i;
        for (i = 0; i < machine->numa_state->num_nodes; ++i) {
            if (machine->numa_state->nodes[i].node_mem) {
                return MIN(pow2floor(machine->numa_state->nodes[i].node_mem),
                           machine->ram_size);
            }
        }
    }
    return machine->ram_size;
}

static void add_str(GString *s, const gchar *s1)
{
    g_string_append_len(s, s1, strlen(s1) + 1);
}

static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid,
                                hwaddr start, hwaddr size)
{
    char mem_name[32];
    uint64_t mem_reg_property[2];
    int off;

    mem_reg_property[0] = cpu_to_be64(start);
    mem_reg_property[1] = cpu_to_be64(size);

    sprintf(mem_name, "memory@%" HWADDR_PRIx, start);
    off = fdt_add_subnode(fdt, 0, mem_name);
    _FDT(off);
    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                      sizeof(mem_reg_property))));
    spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid);
    return off;
}
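
/*
 * E.g. spapr_dt_memory_node(spapr, fdt, 0, 0x80000000, 0x40000000)
 * yields a node along the lines of:
 *
 *   memory@80000000 {
 *       device_type = "memory";
 *       reg = <0x0 0x80000000 0x0 0x40000000>;
 *       ibm,associativity = <...>;   // filled in by the spapr_numa code
 *   };
 */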

static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
{
    MemoryDeviceInfoList *info;

    for (info = list; info; info = info->next) {
        MemoryDeviceInfo *value = info->value;

        if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
            PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;

            if (addr >= pcdimm_info->addr &&
                addr < (pcdimm_info->addr + pcdimm_info->size)) {
                return pcdimm_info->node;
            }
        }
    }

    return -1;
}

struct sPAPRDrconfCellV2 {
     uint32_t seq_lmbs;
     uint64_t base_addr;
     uint32_t drc_index;
     uint32_t aa_index;
     uint32_t flags;
} QEMU_PACKED;

typedef struct DrconfCellQueue {
    struct sPAPRDrconfCellV2 cell;
    QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
} DrconfCellQueue;

static DrconfCellQueue *
spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
                      uint32_t drc_index, uint32_t aa_index,
                      uint32_t flags)
{
    DrconfCellQueue *elem;

    elem = g_malloc0(sizeof(*elem));
    elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
    elem->cell.base_addr = cpu_to_be64(base_addr);
    elem->cell.drc_index = cpu_to_be32(drc_index);
    elem->cell.aa_index = cpu_to_be32(aa_index);
    elem->cell.flags = cpu_to_be32(flags);

    return elem;
}

static int spapr_dt_dynamic_memory_v2(SpaprMachineState *spapr, void *fdt,
                                      int offset, MemoryDeviceInfoList *dimms)
{
    MachineState *machine = MACHINE(spapr);
    uint8_t *int_buf, *cur_index;
    int ret;
    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    uint64_t addr, cur_addr, size;
    uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
    uint64_t mem_end = machine->device_memory->base +
                       memory_region_size(&machine->device_memory->mr);
    uint32_t node, buf_len, nr_entries = 0;
    SpaprDrc *drc;
    DrconfCellQueue *elem, *next;
    MemoryDeviceInfoList *info;
    QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
        = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);

    /* Entry to cover RAM and the gap area */
    elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
                                 SPAPR_LMB_FLAGS_RESERVED |
                                 SPAPR_LMB_FLAGS_DRC_INVALID);
    QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
    nr_entries++;

    cur_addr = machine->device_memory->base;
    for (info = dimms; info; info = info->next) {
        PCDIMMDeviceInfo *di = info->value->u.dimm.data;

        addr = di->addr;
        size = di->size;
        node = di->node;

        /*
         * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The
         * area is marked hotpluggable in the next iteration for the bigger
         * chunk including the NVDIMM occupied area.
         */
        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
            continue;
        }

        /* Entry for hot-pluggable area */
        if (cur_addr < addr) {
            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
            g_assert(drc);
            elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
                                         cur_addr, spapr_drc_index(drc), -1, 0);
            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
            nr_entries++;
        }

        /* Entry for DIMM */
        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
        g_assert(drc);
        elem = spapr_get_drconf_cell(size / lmb_size, addr,
                                     spapr_drc_index(drc), node,
                                     (SPAPR_LMB_FLAGS_ASSIGNED |
                                      SPAPR_LMB_FLAGS_HOTREMOVABLE));
        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
        nr_entries++;
        cur_addr = addr + size;
    }

    /* Entry for remaining hotpluggable area */
    if (cur_addr < mem_end) {
        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
        g_assert(drc);
        elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
                                     cur_addr, spapr_drc_index(drc), -1, 0);
        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
        nr_entries++;
    }

    buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
    int_buf = cur_index = g_malloc0(buf_len);
    *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
    cur_index += sizeof(nr_entries);

    QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
        memcpy(cur_index, &elem->cell, sizeof(elem->cell));
        cur_index += sizeof(elem->cell);
        QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
        g_free(elem);
    }

    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
    g_free(int_buf);
    if (ret < 0) {
        return -1;
    }
    return 0;
}
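
/*
 * Sketch of the resulting property (values assumed for the example:
 * 256 MiB LMBs, 1 GiB of boot RAM, one 256 MiB DIMM plugged right at
 * device_memory->base):
 *
 *   ibm,dynamic-memory-v2 =
 *     <3>                                            // nr_entries
 *     <4, 0x0, 0, -1, RESERVED|DRC_INVALID>          // boot RAM + gap
 *     <1, base, drc, node, ASSIGNED|HOTREMOVABLE>    // the DIMM
 *     <N, base + 256 MiB, drc', -1, 0>               // rest of hotplug area
 *
 * where each entry is one big-endian sPAPRDrconfCellV2
 * (seq_lmbs, base_addr, drc_index, aa_index, flags).
 */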

static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt,
                                   int offset, MemoryDeviceInfoList *dimms)
{
    MachineState *machine = MACHINE(spapr);
    int i, ret;
    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
    uint32_t nr_lmbs = (machine->device_memory->base +
                       memory_region_size(&machine->device_memory->mr)) /
                       lmb_size;
    uint32_t *int_buf, *cur_index, buf_len;

    /*
     * Allocate enough buffer size to fit in ibm,dynamic-memory
     */
    buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
    cur_index = int_buf = g_malloc0(buf_len);
    int_buf[0] = cpu_to_be32(nr_lmbs);
    cur_index++;
    for (i = 0; i < nr_lmbs; i++) {
        uint64_t addr = i * lmb_size;
        uint32_t *dynamic_memory = cur_index;

        if (i >= device_lmb_start) {
            SpaprDrc *drc;

            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
            g_assert(drc);

            dynamic_memory[0] = cpu_to_be32(addr >> 32);
            dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
            dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
            dynamic_memory[3] = cpu_to_be32(0); /* reserved */
            dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
            if (memory_region_present(get_system_memory(), addr)) {
                dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
            } else {
                dynamic_memory[5] = cpu_to_be32(0);
            }
        } else {
            /*
             * LMB information for RMA, boot time RAM and gap b/n RAM and
             * device memory region -- all these are marked as reserved
             * and as having no valid DRC.
             */
            dynamic_memory[0] = cpu_to_be32(addr >> 32);
            dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
            dynamic_memory[2] = cpu_to_be32(0);
            dynamic_memory[3] = cpu_to_be32(0); /* reserved */
            dynamic_memory[4] = cpu_to_be32(-1);
            dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
                                            SPAPR_LMB_FLAGS_DRC_INVALID);
        }

        cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
    }
    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
    g_free(int_buf);
    if (ret < 0) {
        return -1;
    }
    return 0;
}
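
/*
 * Unlike the v2 format, v1 emits one fixed-size record per LMB:
 * SPAPR_DR_LMB_LIST_ENTRY_SIZE cells of
 * <addr-hi, addr-lo, drc-index, reserved, assoc-node, flags>,
 * preceded by a single cell holding nr_lmbs.
 */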

/*
 * Adds ibm,dynamic-reconfiguration-memory node.
 * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
 * of this device tree node.
 */
static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr,
                                                   void *fdt)
{
    MachineState *machine = MACHINE(spapr);
    int ret, offset;
    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32),
                                cpu_to_be32(lmb_size & 0xffffffff)};
    MemoryDeviceInfoList *dimms = NULL;

    /*
     * Don't create the node if there is no device memory
     */
    if (machine->ram_size == machine->maxram_size) {
        return 0;
    }

    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");

    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
                    sizeof(prop_lmb_size));
    if (ret < 0) {
        return ret;
    }

    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
    if (ret < 0) {
        return ret;
    }

    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
    if (ret < 0) {
        return ret;
    }

    /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
    dimms = qmp_memory_device_list();
    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
        ret = spapr_dt_dynamic_memory_v2(spapr, fdt, offset, dimms);
    } else {
        ret = spapr_dt_dynamic_memory(spapr, fdt, offset, dimms);
    }
    qapi_free_MemoryDeviceInfoList(dimms);

    if (ret < 0) {
        return ret;
    }

    ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset);

    return ret;
}

static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt)
{
    MachineState *machine = MACHINE(spapr);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    hwaddr mem_start, node_size;
    int i, nb_nodes = machine->numa_state->num_nodes;
    NodeInfo *nodes = machine->numa_state->nodes;

    for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
        if (!nodes[i].node_mem) {
            continue;
        }
        if (mem_start >= machine->ram_size) {
            node_size = 0;
        } else {
            node_size = nodes[i].node_mem;
            if (node_size > machine->ram_size - mem_start) {
                node_size = machine->ram_size - mem_start;
            }
        }
        if (!mem_start) {
            /* spapr_machine_init() checks for rma_size <= node0_size
             * already */
            spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size);
            mem_start += spapr->rma_size;
            node_size -= spapr->rma_size;
        }
        for ( ; node_size; ) {
            hwaddr sizetmp = pow2floor(node_size);

            /* mem_start != 0 here */
            if (ctzl(mem_start) < ctzl(sizetmp)) {
                sizetmp = 1ULL << ctzl(mem_start);
            }

            spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp);
            node_size -= sizetmp;
            mem_start += sizetmp;
        }
    }

    /* Generate ibm,dynamic-reconfiguration-memory node if required */
    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
        int ret;

        g_assert(smc->dr_lmb_enabled);
        ret = spapr_dt_dynamic_reconfiguration_memory(spapr, fdt);
        if (ret) {
            return ret;
        }
    }

    return 0;
}
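
/*
 * The inner loop above carves each node into power-of-two chunks no
 * larger than the natural alignment of mem_start.  For instance, a
 * 768 MiB node starting at 256 MiB is emitted as 256 MiB @ 256 MiB
 * (limited by ctzl(mem_start)) followed by 512 MiB @ 512 MiB.
 */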

static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset,
                         SpaprMachineState *spapr)
{
    MachineState *ms = MACHINE(spapr);
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
    int index = spapr_get_vcpu_id(cpu);
    uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                       0xffffffff, 0xffffffff};
    uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
        : SPAPR_TIMEBASE_FREQ;
    uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
    uint32_t page_sizes_prop[64];
    size_t page_sizes_prop_size;
    unsigned int smp_threads = ms->smp.threads;
    uint32_t vcpus_per_socket = smp_threads * ms->smp.cores;
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
    int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
    SpaprDrc *drc;
    int drc_index;
    uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
    int i;

    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index);
    if (drc) {
        drc_index = spapr_drc_index(drc);
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
    }

    _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
    _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));

    _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
    _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
                           env->dcache_line_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
                           env->dcache_line_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
                           env->icache_line_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
                           env->icache_line_size)));

    if (pcc->l1_dcache_size) {
        _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
                               pcc->l1_dcache_size)));
    } else {
        warn_report("Unknown L1 dcache size for cpu");
    }
    if (pcc->l1_icache_size) {
        _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
                               pcc->l1_icache_size)));
    } else {
        warn_report("Unknown L1 icache size for cpu");
    }

    _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
    _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
    _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
    _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
    _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));

    if (ppc_has_spr(cpu, SPR_PURR)) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
    }
    if (ppc_has_spr(cpu, SPR_SPURR)) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
    }

    if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
        _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
                          segs, sizeof(segs))));
    }

    /* Advertise VSX (vector extensions) if available
     *   1               == VMX / Altivec available
     *   2               == VSX available
     *
     * Only CPUs for which we create core types in spapr_cpu_core.c
     * are possible, and all of those have VMX */
    if (env->insns_flags & PPC_ALTIVEC) {
        if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
            _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
        } else {
            _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
        }
    }

    /* Advertise DFP (Decimal Floating Point) if available
     *   0 / no property == no DFP
     *   1               == DFP available */
    if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
    }

    page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
                                                      sizeof(page_sizes_prop));
    if (page_sizes_prop_size) {
        _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
                          page_sizes_prop, page_sizes_prop_size)));
    }

    spapr_dt_pa_features(spapr, cpu, fdt, offset);

    _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
                           cs->cpu_index / vcpus_per_socket)));

    _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
                      pft_size_prop, sizeof(pft_size_prop))));

    if (ms->numa_state->num_nodes > 1) {
        _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu));
    }

    _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));

    if (pcc->radix_page_info) {
        for (i = 0; i < pcc->radix_page_info->count; i++) {
            radix_AP_encodings[i] =
                cpu_to_be32(pcc->radix_page_info->entries[i]);
        }
        _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
                          radix_AP_encodings,
                          pcc->radix_page_info->count *
                          sizeof(radix_AP_encodings[0]))));
    }

    /*
     * We set this property to let the guest know that it can use the large
     * decrementer and its width in bits.
     */
    if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF) {
        _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
                              pcc->lrg_decr_bits)));
    }
}

static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
{
    CPUState **rev;
    CPUState *cs;
    int n_cpus;
    int cpus_offset;
    int i;

    cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
    _FDT(cpus_offset);
    _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
    _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));

    /*
     * We walk the CPUs in reverse order to ensure that the CPU DT nodes
     * created by fdt_add_subnode() end up in the right order in the FDT,
     * so that the guest kernel enumerates the CPUs correctly.
     *
     * The CPU list cannot be traversed in reverse order, so we need
     * to do extra work.
     */
    n_cpus = 0;
    rev = NULL;
    CPU_FOREACH(cs) {
        rev = g_renew(CPUState *, rev, n_cpus + 1);
        rev[n_cpus++] = cs;
    }

    for (i = n_cpus - 1; i >= 0; i--) {
        CPUState *cs = rev[i];
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        int index = spapr_get_vcpu_id(cpu);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        g_autofree char *nodename = NULL;
        int offset;

        if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
            continue;
        }

        nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
        offset = fdt_add_subnode(fdt, cpus_offset, nodename);
        _FDT(offset);
        spapr_dt_cpu(cs, fdt, offset, spapr);
    }

    g_free(rev);
}

static int spapr_dt_rng(void *fdt)
{
    int node;
    int ret;

    node = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
    if (node <= 0) {
        return -1;
    }
    ret = fdt_setprop_string(fdt, node, "device_type",
                             "ibm,platform-facilities");
    ret |= fdt_setprop_cell(fdt, node, "#address-cells", 0x1);
    ret |= fdt_setprop_cell(fdt, node, "#size-cells", 0x0);

    node = fdt_add_subnode(fdt, node, "ibm,random-v1");
    if (node <= 0) {
        return -1;
    }
    ret |= fdt_setprop_string(fdt, node, "compatible", "ibm,random");

    return ret ? -1 : 0;
}

static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
{
    MachineState *ms = MACHINE(spapr);
    int rtas;
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
        memory_region_size(&MACHINE(spapr)->device_memory->mr);
    uint32_t lrdr_capacity[] = {
        cpu_to_be32(max_device_addr >> 32),
        cpu_to_be32(max_device_addr & 0xffffffff),
        cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE >> 32),
        cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff),
        cpu_to_be32(ms->smp.max_cpus / ms->smp.threads),
    };

    _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));

    /* hypertas */
    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-join");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(hypertas, "hcall-sprg0");
    add_str(hypertas, "hcall-copy");
    add_str(hypertas, "hcall-debug");
    add_str(hypertas, "hcall-vphn");
    if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) {
        add_str(hypertas, "hcall-rpt-invalidate");
    }

    add_str(qemu_hypertas, "hcall-memop1");

    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }

    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
        add_str(hypertas, "hcall-hpt-resize");
    }

    _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
                     hypertas->str, hypertas->len));
    g_string_free(hypertas, TRUE);
    _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
                     qemu_hypertas->str, qemu_hypertas->len));
    g_string_free(qemu_hypertas, TRUE);

    spapr_numa_write_rtas_dt(spapr, fdt, rtas);

    /*
     * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log,
     * and 16 bytes per CPU for system reset error log plus an extra 8 bytes.
     *
     * The system reset requirements are driven by existing Linux and PowerVM
     * implementation which (contrary to PAPR) saves r3 in the error log
     * structure like machine check, so Linux expects to find the saved r3
     * value at the address in r3 upon FWNMI-enabled sreset interrupt (and
     * does not look at the error value).
     *
     * System reset interrupts are not subject to interlock like machine
     * check, so this memory area could be corrupted if the sreset is
     * interrupted by a machine check (or vice versa) if it was shared. To
     * prevent this, system reset uses per-CPU areas for the sreset save
     * area. A system reset that interrupts a system reset handler could
     * still overwrite this area, but Linux doesn't try to recover in that
     * case anyway.
     *
     * The extra 8 bytes is required because Linux's FWNMI error log check
     * is off-by-one.
     *
     * RTAS_MIN_SIZE is required for the RTAS blob itself.
     */
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE +
                          RTAS_ERROR_LOG_MAX +
                          ms->smp.max_cpus * sizeof(uint64_t) * 2 +
                          sizeof(uint64_t)));
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
                          RTAS_ERROR_LOG_MAX));
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
                          RTAS_EVENT_SCAN_RATE));

    g_assert(msi_nonbroken);
    _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));

    /*
     * According to PAPR, the RTAS ibm,os-term call does not guarantee a
     * return back to the guest cpu.  The additional
     * ibm,extended-os-term property indicates that the RTAS call will
     * always return.  Set this property.
     */
    _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));

    _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
                     lrdr_capacity, sizeof(lrdr_capacity)));

    spapr_dt_rtas_tokens(fdt, rtas);
}
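
/*
 * Worked example of the rtas-size computation above, for max_cpus = 8:
 * RTAS_MIN_SIZE + RTAS_ERROR_LOG_MAX + 8 * 16 + 8 bytes -- the RTAS
 * blob itself, the machine check error log, a 16-byte sreset save area
 * per CPU, and the 8 spare bytes for Linux's off-by-one log check.
 */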

/*
 * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
 * and the XIVE features that the guest may request and thus the valid
 * values for bytes 23..26 of option vector 5:
 */
static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
                                          int chosen)
{
    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);

    char val[2 * 4] = {
        23, 0x00, /* XICS / XIVE mode */
        24, 0x00, /* Hash/Radix, filled in below. */
        25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
        26, 0x40, /* Radix options: GTSE == yes. */
    };

    if (spapr->irq->xics && spapr->irq->xive) {
        val[1] = SPAPR_OV5_XIVE_BOTH;
    } else if (spapr->irq->xive) {
        val[1] = SPAPR_OV5_XIVE_EXPLOIT;
    } else {
        assert(spapr->irq->xics);
        val[1] = SPAPR_OV5_XIVE_LEGACY;
    }

    if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
                          first_ppc_cpu->compat_pvr)) {
        /*
         * If we're in a pre POWER9 compat mode then the guest should
         * do hash and use the legacy interrupt mode
         */
        val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */
        val[3] = 0x00; /* Hash */
        spapr_check_mmu_mode(false);
    } else if (kvm_enabled()) {
        if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
            val[3] = 0x80; /* OV5_MMU_BOTH */
        } else if (kvmppc_has_cap_mmu_radix()) {
            val[3] = 0x40; /* OV5_MMU_RADIX_300 */
        } else {
            val[3] = 0x00; /* Hash */
        }
    } else {
        /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
        val[3] = 0xC0;
    }
    _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
                     val, sizeof(val)));
}

static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
{
    MachineState *machine = MACHINE(spapr);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    int chosen;

    _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));

    if (reset) {
        const char *boot_device = spapr->boot_device;
        g_autofree char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
        size_t cb = 0;
        g_autofree char *bootlist = get_boot_devices_list(&cb);

        if (machine->kernel_cmdline && machine->kernel_cmdline[0]) {
            _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
                                    machine->kernel_cmdline));
        }

        if (spapr->initrd_size) {
            _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
                                  spapr->initrd_base));
            _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
                                  spapr->initrd_base + spapr->initrd_size));
        }

        if (spapr->kernel_size) {
            uint64_t kprop[2] = { cpu_to_be64(spapr->kernel_addr),
                                  cpu_to_be64(spapr->kernel_size) };

            _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
                         &kprop, sizeof(kprop)));
            if (spapr->kernel_le) {
                _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
            }
        }
        if (boot_menu) {
            _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", boot_menu)));
        }
        _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
        _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
        _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));

        if (cb && bootlist) {
            int i;

            for (i = 0; i < cb; i++) {
                if (bootlist[i] == '\n') {
                    bootlist[i] = ' ';
                }
            }
            _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
        }

        if (boot_device && strlen(boot_device)) {
            _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
        }

        if (!spapr->has_graphics && stdout_path) {
            /*
             * "linux,stdout-path" and "stdout" properties are
             * deprecated by linux kernel. New platforms should only
             * use the "stdout-path" property. Set the new property
             * and continue using older property to remain compatible
             * with the existing firmware.
             */
            _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
            _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
        }

        /*
         * We can deal with BAR reallocation just fine, advertise it
         * to the guest
         */
        if (smc->linux_pci_probe) {
            _FDT(fdt_setprop_cell(fdt, chosen, "linux,pci-probe-only", 0));
        }

        spapr_dt_ov5_platform_support(spapr, fdt, chosen);
    }

    _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
}

static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
{
    /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
     * KVM to work under pHyp with some guest co-operation */
    int hypervisor;
    uint8_t hypercall[16];

    _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
    /* indicate KVM hypercall interface */
    _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
    if (kvmppc_has_cap_fixup_hcalls()) {
        /*
         * Older KVM versions with older guest kernels were broken
         * with the magic page, don't allow the guest to map it.
         */
        if (!kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
                                  sizeof(hypercall))) {
            _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
                             hypercall, sizeof(hypercall)));
        }
    }
}

void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space)
{
    MachineState *machine = MACHINE(spapr);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    uint32_t root_drc_type_mask = 0;
    int ret;
    void *fdt;
    SpaprPhbState *phb;
    char *buf;

    fdt = g_malloc0(space);
    _FDT((fdt_create_empty_tree(fdt, space)));

    /* Root node */
    _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
    _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
    _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));

    /* Guest UUID & Name */
    buf = qemu_uuid_unparse_strdup(&qemu_uuid);
    _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
    if (qemu_uuid_set) {
        _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
    }
    g_free(buf);

    if (qemu_get_vm_name()) {
        _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
                                qemu_get_vm_name()));
    }

    /* Host Model & Serial Number */
    if (spapr->host_model) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
        g_free(buf);
    }

    if (spapr->host_serial) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
        g_free(buf);
    }

    _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
    _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));

    /* /interrupt controller */
    spapr_irq_dt(spapr, spapr_max_server_number(spapr), fdt, PHANDLE_INTC);

    ret = spapr_dt_memory(spapr, fdt);
    if (ret < 0) {
        error_report("couldn't setup memory nodes in fdt");
        exit(1);
    }

    /* /vdevice */
    spapr_dt_vdevice(spapr->vio_bus, fdt);

    if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
        ret = spapr_dt_rng(fdt);
        if (ret < 0) {
            error_report("could not set up rng device in the fdt");
            exit(1);
        }
    }

    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_dt_phb(spapr, phb, PHANDLE_INTC, fdt, NULL);
        if (ret < 0) {
            error_report("couldn't setup PCI devices in fdt");
            exit(1);
        }
    }

    spapr_dt_cpus(fdt, spapr);

    /* ibm,drc-indexes and friends */
    if (smc->dr_lmb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_LMB;
    }
    if (smc->dr_phb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PHB;
    }
    if (mc->nvdimm_supported) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PMEM;
    }
    if (root_drc_type_mask) {
        _FDT(spapr_dt_drc(fdt, 0, NULL, root_drc_type_mask));
    }

    if (mc->has_hotpluggable_cpus) {
        int offset = fdt_path_offset(fdt, "/cpus");
        ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU);
        if (ret < 0) {
            error_report("Couldn't set up CPU DR device tree properties");
            exit(1);
        }
    }

    /* /event-sources */
    spapr_dt_events(spapr, fdt);

    /* /rtas */
    spapr_dt_rtas(spapr, fdt);

    /* /chosen */
    spapr_dt_chosen(spapr, fdt, reset);

    /* /hypervisor */
    if (kvm_enabled()) {
        spapr_dt_hypervisor(spapr, fdt);
    }

    /* Build memory reserve map */
    if (reset) {
        if (spapr->kernel_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->kernel_addr,
                                  spapr->kernel_size)));
        }
        if (spapr->initrd_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base,
                                  spapr->initrd_size)));
        }
    }

    /* NVDIMM devices */
    if (mc->nvdimm_supported) {
        spapr_dt_persistent_memory(spapr, fdt);
    }

    return fdt;
}

static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
{
    SpaprMachineState *spapr = opaque;

    return (addr & 0x0fffffff) + spapr->kernel_addr;
}

static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
                                    PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;

    /* The TCG path should also be holding the BQL at this point */
    g_assert(qemu_mutex_iothread_locked());

    g_assert(!vhyp_cpu_in_nested(cpu));

    if (msr_pr) {
        hcall_dprintf("Hypercall made with MSR[PR]=1\n");
        env->gpr[3] = H_PRIVILEGE;
    } else {
        env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
    }
}

struct LPCRSyncState {
    target_ulong value;
    target_ulong mask;
};

static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
{
    struct LPCRSyncState *s = arg.host_ptr;
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    target_ulong lpcr;

    cpu_synchronize_state(cs);
    lpcr = env->spr[SPR_LPCR];
    lpcr &= ~s->mask;
    lpcr |= s->value;
    ppc_store_lpcr(cpu, lpcr);
}

void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
{
    CPUState *cs;
    struct LPCRSyncState s = {
        .value = value,
        .mask = mask
    };
    CPU_FOREACH(cs) {
        run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
    }
}

static bool spapr_get_pate(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu,
                           target_ulong lpid, ppc_v3_pate_t *entry)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);

    if (!spapr_cpu->in_nested) {
        assert(lpid == 0);

        /* Copy PATE1:GR into PATE0:HR */
        entry->dw0 = spapr->patb_entry & PATE0_HR;
        entry->dw1 = spapr->patb_entry;

    } else {
        uint64_t patb, pats;

        assert(lpid != 0);

        patb = spapr->nested_ptcr & PTCR_PATB;
        pats = spapr->nested_ptcr & PTCR_PATS;

        /* Calculate number of entries */
        pats = 1ull << (pats + 12 - 4);
        if (pats <= lpid) {
            return false;
        }

        /* Grab entry */
        patb += 16 * lpid;
        entry->dw0 = ldq_phys(CPU(cpu)->as, patb);
        entry->dw1 = ldq_phys(CPU(cpu)->as, patb + 8);
    }

    return true;
}

#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
#define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))
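
/*
 * Usage sketch: HPTE(table, i) addresses the i-th 16-byte PTE, and the
 * DIRTY/CLEAN/HPTE_DIRTY accessors manipulate the software-defined
 * HPTE64_V_HPTE_DIRTY bit in its first doubleword, e.g.
 *
 *   DIRTY_HPTE(HPTE(spapr->htab, ptex));  // mark PTE for migration
 */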

/*
 * Get the fd to access the kernel htab, re-opening it if necessary
 */
static int get_htab_fd(SpaprMachineState *spapr)
{
    Error *local_err = NULL;

    if (spapr->htab_fd >= 0) {
        return spapr->htab_fd;
    }

    spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
    if (spapr->htab_fd < 0) {
        error_report_err(local_err);
    }

    return spapr->htab_fd;
}

void close_htab_fd(SpaprMachineState *spapr)
{
    if (spapr->htab_fd >= 0) {
        close(spapr->htab_fd);
    }
    spapr->htab_fd = -1;
}

static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);

    return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
}
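
/*
 * E.g. a minimum-sized (order 18, 256 KiB) HPT holds
 * 256 KiB / 128 bytes = 2048 PTEGs, giving a hash mask of 0x7ff.
 */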
1387
1388static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
1389{
1390    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1391
1392    assert(kvm_enabled());
1393
1394    if (!spapr->htab) {
1395        return 0;
1396    }
1397
1398    return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
1399}
1400
1401static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
1402                                                hwaddr ptex, int n)
1403{
1404    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1405    hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;
1406
1407    if (!spapr->htab) {
1408        /*
1409         * HTAB is controlled by KVM. Fetch into temporary buffer
1410         */
1411        ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
1412        kvmppc_read_hptes(hptes, ptex, n);
1413        return hptes;
1414    }
1415
1416    /*
1417     * HTAB is controlled by QEMU. Just point to the internally
1418     * accessible PTEG.
1419     */
1420    return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
1421}
1422
1423static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
1424                              const ppc_hash_pte64_t *hptes,
1425                              hwaddr ptex, int n)
1426{
1427    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1428
1429    if (!spapr->htab) {
1430        g_free((void *)hptes);
1431    }
1432
1433    /* Nothing to do for qemu managed HPT */
1434}
1435
1436void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
1437                      uint64_t pte0, uint64_t pte1)
1438{
1439    SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
1440    hwaddr offset = ptex * HASH_PTE_SIZE_64;
1441
1442    if (!spapr->htab) {
1443        kvmppc_write_hpte(ptex, pte0, pte1);
1444    } else {
1445        if (pte0 & HPTE64_V_VALID) {
1446            stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
1447            /*
1448             * When setting valid, we write PTE1 first. This ensures
1449             * proper synchronization with the reading code in
1450             * ppc_hash64_pteg_search()
1451             */
1452            smp_wmb();
1453            stq_p(spapr->htab + offset, pte0);
1454        } else {
1455            stq_p(spapr->htab + offset, pte0);
1456            /*
1457             * When clearing it we set PTE0 first. This ensures proper
1458             * synchronization with the reading code in
1459             * ppc_hash64_pteg_search()
1460             */
1461            smp_wmb();
1462            stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
1463        }
1464    }
1465}
1466
1467static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1468                             uint64_t pte1)
1469{
1470    hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_C;
1471    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1472
1473    if (!spapr->htab) {
1474        /* There should always be a hash table when this is called */
1475        error_report("spapr_hpte_set_c called with no hash table !");
1476        return;
1477    }
1478
1479    /* The HW performs a non-atomic byte update */
1480    stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
1481}
1482
1483static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
1484                             uint64_t pte1)
1485{
1486    hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_R;
1487    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
1488
1489    if (!spapr->htab) {
1490        /* There should always be a hash table when this is called */
1491        error_report("spapr_hpte_set_r called with no hash table !");
1492        return;
1493    }
1494
1495    /* The HW performs a non-atomic byte update */
1496    stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
1497}
1498
1499int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
1500{
1501    int shift;
1502
1503    /* We aim for a hash table of size 1/128 the size of RAM (rounded
1504     * up).  The PAPR recommendation is actually 1/64 of RAM size, but
1505     * that's much more than is needed for Linux guests */
1506    shift = ctz64(pow2ceil(ramsize)) - 7;
1507    shift = MAX(shift, 18); /* Minimum architected size */
1508    shift = MIN(shift, 46); /* Maximum architected size */
1509    return shift;
1510}
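
/*
 * Worked example for the arithmetic above: for a 16 GiB guest,
 * pow2ceil() yields 2^34 and shift = 34 - 7 = 27, i.e. a 128 MiB hash
 * table -- exactly 1/128 of RAM.  The clamps only take effect below
 * 32 MiB of RAM (shift 18) or above 8 PiB (shift 46).
 */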
1511
1512void spapr_free_hpt(SpaprMachineState *spapr)
1513{
1514    g_free(spapr->htab);
1515    spapr->htab = NULL;
1516    spapr->htab_shift = 0;
1517    close_htab_fd(spapr);
1518}
1519
1520int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp)
1521{
1522    ERRP_GUARD();
1523    long rc;
1524
1525    /* Clean up any HPT info from a previous boot */
1526    spapr_free_hpt(spapr);
1527
1528    rc = kvmppc_reset_htab(shift);
1529
1530    if (rc == -EOPNOTSUPP) {
1531        error_setg(errp, "HPT not supported in nested guests");
1532        return -EOPNOTSUPP;
1533    }
1534
1535    if (rc < 0) {
1536        /* kernel-side HPT needed, but couldn't allocate one */
1537        error_setg_errno(errp, errno, "Failed to allocate KVM HPT of order %d",
1538                         shift);
1539        error_append_hint(errp, "Try smaller maxmem?\n");
1540        return -errno;
1541    } else if (rc > 0) {
1542        /* kernel-side HPT allocated */
1543        if (rc != shift) {
1544            error_setg(errp,
1545                       "Requested order %d HPT, but kernel allocated order %ld",
1546                       shift, rc);
1547            error_append_hint(errp, "Try smaller maxmem?\n");
1548            return -ENOSPC;
1549        }
1550
1551        spapr->htab_shift = shift;
1552        spapr->htab = NULL;
1553    } else {
1554        /* kernel-side HPT not needed, allocate in userspace instead */
1555        size_t size = 1ULL << shift;
1556        int i;
1557
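        /*
         * Alignment equals size here because hash page tables are
         * architecturally required to be naturally aligned (cf. the
         * HTABORG field of SDR1), hence qemu_memalign(size, size)
         * rather than a plain allocation.
         */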
1558        spapr->htab = qemu_memalign(size, size);
1559        memset(spapr->htab, 0, size);
1560        spapr->htab_shift = shift;
1561
1562        for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
1563            DIRTY_HPTE(HPTE(spapr->htab, i));
1564        }
1565    }
1566    /* We're setting up a hash table, so that means we're not radix */
1567    spapr->patb_entry = 0;
1568    spapr_set_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
1569    return 0;
1570}
1571
1572void spapr_setup_hpt(SpaprMachineState *spapr)
1573{
1574    int hpt_shift;
1575
1576    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
1577        hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
1578    } else {
1579        uint64_t current_ram_size;
1580
1581        current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
1582        hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
1583    }
1584    spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);
1585
1586    if (kvm_enabled()) {
1587        hwaddr vrma_limit = kvmppc_vrma_limit(spapr->htab_shift);
1588
1589        /* Check our RMA fits in the possible VRMA */
1590        if (vrma_limit < spapr->rma_size) {
1591            error_report("Unable to create %" HWADDR_PRIu
1592                         "MiB RMA (VRMA only allows %" HWADDR_PRIu "MiB)",
1593                         spapr->rma_size / MiB, vrma_limit / MiB);
1594            exit(EXIT_FAILURE);
1595        }
1596    }
1597}
1598
1599void spapr_check_mmu_mode(bool guest_radix)
1600{
1601    if (guest_radix) {
1602        if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) {
1603            error_report("Guest requested unavailable MMU mode (radix).");
1604            exit(EXIT_FAILURE);
1605        }
1606    } else {
1607        if (kvm_enabled() && kvmppc_has_cap_mmu_radix()
1608            && !kvmppc_has_cap_mmu_hash_v3()) {
1609            error_report("Guest requested unavailable MMU mode (hash).");
1610            exit(EXIT_FAILURE);
1611        }
1612    }
1613}
1614
1615static void spapr_machine_reset(MachineState *machine)
1616{
1617    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
1618    PowerPCCPU *first_ppc_cpu;
1619    hwaddr fdt_addr;
1620    void *fdt;
1621    int rc;
1622
1623    pef_kvm_reset(machine->cgs, &error_fatal);
1624    spapr_caps_apply(spapr);
1625
1626    first_ppc_cpu = POWERPC_CPU(first_cpu);
1627    if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
1628        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
1629                              spapr->max_compat_pvr)) {
1630        /*
1631         * If using KVM with radix mode available, VCPUs can be started
1632         * without a HPT because KVM will start them in radix mode.
1633         * Set the GR bit in PATE so that we know there is no HPT.
1634         */
1635        spapr->patb_entry = PATE1_GR;
1636        spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
1637    } else {
1638        spapr_setup_hpt(spapr);
1639    }
1640
1641    qemu_devices_reset();
1642
1643    spapr_ovec_cleanup(spapr->ov5_cas);
1644    spapr->ov5_cas = spapr_ovec_new();
1645
1646    ppc_set_compat_all(spapr->max_compat_pvr, &error_fatal);
1647
1648    /*
1649     * This fixes some of the default configuration of the XIVE
1650     * devices. To be called after the reset of the machine devices.
1651     */
1652    spapr_irq_reset(spapr, &error_fatal);
1653
1654    /*
1655     * There is no CAS under qtest. Simulate one to please the code that
1656     * depends on spapr->ov5_cas. This is especially needed to test device
1657     * unplug, so we do that before resetting the DRCs.
1658     */
1659    if (qtest_enabled()) {
1660        spapr_ovec_cleanup(spapr->ov5_cas);
1661        spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
1662    }
1663
1664    spapr_nvdimm_finish_flushes();
1665
1666    /* DRC reset may cause a device to be unplugged. This will cause trouble
1667     * if this device is used by another device (e.g., a running vhost backend
1668     * will crash QEMU if the DIMM holding the vring goes away). To avoid such
1669     * situations, we reset DRCs after all devices have been reset.
1670     */
1671    spapr_drc_reset_all(spapr);
1672
1673    spapr_clear_pending_events(spapr);
1674
1675    /*
1676     * We place the device tree just below either the top of the RMA,
1677     * or just below 2GB, whichever is lower, so that it can be
1678     * processed with 32-bit real mode code if necessary
1679     */
1680    fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
1681
1682    fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
1683    if (spapr->vof) {
1684        spapr_vof_reset(spapr, fdt, &error_fatal);
1685        /*
1686         * Do not pack the FDT as the client may change properties.
1687         * The VOF client does not expect the FDT, so we do not load it into the VM.
1688         */
1689    } else {
1690        rc = fdt_pack(fdt);
1691        /* Should only fail if we've built a corrupted tree */
1692        assert(rc == 0);
1693
1694        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
1695                                  0, fdt_addr, 0);
1696        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
1697    }
1698    qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
1699
1700    g_free(spapr->fdt_blob);
1701    spapr->fdt_size = fdt_totalsize(fdt);
1702    spapr->fdt_initial_size = spapr->fdt_size;
1703    spapr->fdt_blob = fdt;
1704
1705    /* Set up the entry state */
1706    first_ppc_cpu->env.gpr[5] = 0;
1707
1708    spapr->fwnmi_system_reset_addr = -1;
1709    spapr->fwnmi_machine_check_addr = -1;
1710    spapr->fwnmi_machine_check_interlock = -1;
1711
1712    /* Signal all vCPUs waiting on this condition */
1713    qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond);
1714
1715    migrate_del_blocker(spapr->fwnmi_migration_blocker);
1716}
1717
1718static void spapr_create_nvram(SpaprMachineState *spapr)
1719{
1720    DeviceState *dev = qdev_new("spapr-nvram");
1721    DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1722
1723    if (dinfo) {
1724        qdev_prop_set_drive_err(dev, "drive", blk_by_legacy_dinfo(dinfo),
1725                                &error_fatal);
1726    }
1727
1728    qdev_realize_and_unref(dev, &spapr->vio_bus->bus, &error_fatal);
1729
1730    spapr->nvram = (struct SpaprNvram *)dev;
1731}
1732
1733static void spapr_rtc_create(SpaprMachineState *spapr)
1734{
1735    object_initialize_child_with_props(OBJECT(spapr), "rtc", &spapr->rtc,
1736                                       sizeof(spapr->rtc), TYPE_SPAPR_RTC,
1737                                       &error_fatal, NULL);
1738    qdev_realize(DEVICE(&spapr->rtc), NULL, &error_fatal);
1739    object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
1740                              "date");
1741}
1742
1743/* Returns whether we want to use VGA or not */
1744static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
1745{
1746    switch (vga_interface_type) {
1747    case VGA_NONE:
1748        return false;
1749    case VGA_DEVICE:
1750        return true;
1751    case VGA_STD:
1752    case VGA_VIRTIO:
1753    case VGA_CIRRUS:
1754        return pci_vga_init(pci_bus) != NULL;
1755    default:
1756        error_setg(errp,
1757                   "Unsupported VGA mode, only -vga std or -vga virtio is supported");
1758        return false;
1759    }
1760}
1761
1762static int spapr_pre_load(void *opaque)
1763{
1764    int rc;
1765
1766    rc = spapr_caps_pre_load(opaque);
1767    if (rc) {
1768        return rc;
1769    }
1770
1771    return 0;
1772}
1773
1774static int spapr_post_load(void *opaque, int version_id)
1775{
1776    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1777    int err = 0;
1778
1779    err = spapr_caps_post_migration(spapr);
1780    if (err) {
1781        return err;
1782    }
1783
1784    /*
1785     * In earlier versions, there was no separate qdev for the PAPR
1786     * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1787     * When migrating from those versions, poke the incoming offset
1788     * value into the RTC device.
1789     */
1790    if (version_id < 3) {
1791        err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
1792        if (err) {
1793            return err;
1794        }
1795    }
1796
1797    if (kvm_enabled() && spapr->patb_entry) {
1798        PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
1799        bool radix = !!(spapr->patb_entry & PATE1_GR);
1800        bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
1801
1802        /*
1803         * Update LPCR:HR and UPRT as they may not be set properly in
1804         * the stream
1805         */
1806        spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
1807                            LPCR_HR | LPCR_UPRT);
1808
1809        err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
1810        if (err) {
1811            error_report("Process table config unsupported by the host");
1812            return -EINVAL;
1813        }
1814    }
1815
1816    err = spapr_irq_post_load(spapr, version_id);
1817    if (err) {
1818        return err;
1819    }
1820
1821    return err;
1822}
1823
1824static int spapr_pre_save(void *opaque)
1825{
1826    int rc;
1827
1828    rc = spapr_caps_pre_save(opaque);
1829    if (rc) {
1830        return rc;
1831    }
1832
1833    return 0;
1834}
1835
1836static bool version_before_3(void *opaque, int version_id)
1837{
1838    return version_id < 3;
1839}
1840
1841static bool spapr_pending_events_needed(void *opaque)
1842{
1843    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1844    return !QTAILQ_EMPTY(&spapr->pending_events);
1845}
1846
1847static const VMStateDescription vmstate_spapr_event_entry = {
1848    .name = "spapr_event_log_entry",
1849    .version_id = 1,
1850    .minimum_version_id = 1,
1851    .fields = (VMStateField[]) {
1852        VMSTATE_UINT32(summary, SpaprEventLogEntry),
1853        VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
1854        VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
1855                                     NULL, extended_length),
1856        VMSTATE_END_OF_LIST()
1857    },
1858};
1859
1860static const VMStateDescription vmstate_spapr_pending_events = {
1861    .name = "spapr_pending_events",
1862    .version_id = 1,
1863    .minimum_version_id = 1,
1864    .needed = spapr_pending_events_needed,
1865    .fields = (VMStateField[]) {
1866        VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
1867                         vmstate_spapr_event_entry, SpaprEventLogEntry, next),
1868        VMSTATE_END_OF_LIST()
1869    },
1870};
1871
1872static bool spapr_ov5_cas_needed(void *opaque)
1873{
1874    SpaprMachineState *spapr = opaque;
1875    SpaprOptionVector *ov5_mask = spapr_ovec_new();
1876    bool cas_needed;
1877
1878    /* Prior to the introduction of SpaprOptionVector, we had two option
1879     * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
1880     * Both of these options encode machine topology into the device-tree
1881     * in such a way that the now-booted OS should still be able to interact
1882     * appropriately with QEMU regardless of what options were actually
1883     * negotiated on the source side.
1884     *
1885     * As such, we can avoid migrating the CAS-negotiated options if these
1886     * are the only options available on the current machine/platform.
1887     * Since these are the only options available for pseries-2.7 and
1888     * earlier, this allows us to maintain old->new/new->old migration
1889     * compatibility.
1890     *
1891     * For QEMU 2.8+, there are additional CAS-negotiable options available
1892     * via default pseries-2.8 machines and explicit command-line parameters.
1893     * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
1894     * of the actual CAS-negotiated values to continue working properly. For
1895     * example, availability of memory unplug depends on knowing whether
1896     * OV5_HP_EVT was negotiated via CAS.
1897     *
1898     * Thus, for any cases where the set of available CAS-negotiable
1899     * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
1900     * include the CAS-negotiated options in the migration stream, unless
1901     * they affect boot-time behaviour only.
1902     */
1903    spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
1904    spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
1905    spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
1906
1907    /* We need extra information if we have any bits outside the mask
1908     * defined above */
1909    cas_needed = !spapr_ovec_subset(spapr->ov5, ov5_mask);
1910
1911    spapr_ovec_cleanup(ov5_mask);
1912
1913    return cas_needed;
1914}
1915
1916static const VMStateDescription vmstate_spapr_ov5_cas = {
1917    .name = "spapr_option_vector_ov5_cas",
1918    .version_id = 1,
1919    .minimum_version_id = 1,
1920    .needed = spapr_ov5_cas_needed,
1921    .fields = (VMStateField[]) {
1922        VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
1923                                 vmstate_spapr_ovec, SpaprOptionVector),
1924        VMSTATE_END_OF_LIST()
1925    },
1926};
1927
1928static bool spapr_patb_entry_needed(void *opaque)
1929{
1930    SpaprMachineState *spapr = opaque;
1931
1932    return !!spapr->patb_entry;
1933}
1934
1935static const VMStateDescription vmstate_spapr_patb_entry = {
1936    .name = "spapr_patb_entry",
1937    .version_id = 1,
1938    .minimum_version_id = 1,
1939    .needed = spapr_patb_entry_needed,
1940    .fields = (VMStateField[]) {
1941        VMSTATE_UINT64(patb_entry, SpaprMachineState),
1942        VMSTATE_END_OF_LIST()
1943    },
1944};
1945
1946static bool spapr_irq_map_needed(void *opaque)
1947{
1948    SpaprMachineState *spapr = opaque;
1949
1950    return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
1951}
1952
1953static const VMStateDescription vmstate_spapr_irq_map = {
1954    .name = "spapr_irq_map",
1955    .version_id = 1,
1956    .minimum_version_id = 1,
1957    .needed = spapr_irq_map_needed,
1958    .fields = (VMStateField[]) {
1959        VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
1960        VMSTATE_END_OF_LIST()
1961    },
1962};
1963
1964static bool spapr_dtb_needed(void *opaque)
1965{
1966    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
1967
1968    return smc->update_dt_enabled;
1969}
1970
1971static int spapr_dtb_pre_load(void *opaque)
1972{
1973    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1974
1975    g_free(spapr->fdt_blob);
1976    spapr->fdt_blob = NULL;
1977    spapr->fdt_size = 0;
1978
1979    return 0;
1980}
1981
1982static const VMStateDescription vmstate_spapr_dtb = {
1983    .name = "spapr_dtb",
1984    .version_id = 1,
1985    .minimum_version_id = 1,
1986    .needed = spapr_dtb_needed,
1987    .pre_load = spapr_dtb_pre_load,
1988    .fields = (VMStateField[]) {
1989        VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
1990        VMSTATE_UINT32(fdt_size, SpaprMachineState),
1991        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
1992                                     fdt_size),
1993        VMSTATE_END_OF_LIST()
1994    },
1995};
1996
1997static bool spapr_fwnmi_needed(void *opaque)
1998{
1999    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2000
2001    return spapr->fwnmi_machine_check_addr != -1;
2002}
2003
2004static int spapr_fwnmi_pre_save(void *opaque)
2005{
2006    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
2007
2008    /*
2009     * Check if machine check handling is in progress and print a
2010     * warning message.
2011     */
2012    if (spapr->fwnmi_machine_check_interlock != -1) {
2013        warn_report("A machine check is being handled during migration. The "
2014                    "handler may run and log a hardware error on the destination");
2015    }
2016
2017    return 0;
2018}
2019
2020static const VMStateDescription vmstate_spapr_fwnmi = {
2021    .name = "spapr_fwnmi",
2022    .version_id = 1,
2023    .minimum_version_id = 1,
2024    .needed = spapr_fwnmi_needed,
2025    .pre_save = spapr_fwnmi_pre_save,
2026    .fields = (VMStateField[]) {
2027        VMSTATE_UINT64(fwnmi_system_reset_addr, SpaprMachineState),
2028        VMSTATE_UINT64(fwnmi_machine_check_addr, SpaprMachineState),
2029        VMSTATE_INT32(fwnmi_machine_check_interlock, SpaprMachineState),
2030        VMSTATE_END_OF_LIST()
2031    },
2032};
2033
2034static const VMStateDescription vmstate_spapr = {
2035    .name = "spapr",
2036    .version_id = 3,
2037    .minimum_version_id = 1,
2038    .pre_load = spapr_pre_load,
2039    .post_load = spapr_post_load,
2040    .pre_save = spapr_pre_save,
2041    .fields = (VMStateField[]) {
2042        /* used to be @next_irq */
2043        VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),
2044
2045        /* RTC offset */
2046        VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),
2047
2048        VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
2049        VMSTATE_END_OF_LIST()
2050    },
2051    .subsections = (const VMStateDescription*[]) {
2052        &vmstate_spapr_ov5_cas,
2053        &vmstate_spapr_patb_entry,
2054        &vmstate_spapr_pending_events,
2055        &vmstate_spapr_cap_htm,
2056        &vmstate_spapr_cap_vsx,
2057        &vmstate_spapr_cap_dfp,
2058        &vmstate_spapr_cap_cfpc,
2059        &vmstate_spapr_cap_sbbc,
2060        &vmstate_spapr_cap_ibs,
2061        &vmstate_spapr_cap_hpt_maxpagesize,
2062        &vmstate_spapr_irq_map,
2063        &vmstate_spapr_cap_nested_kvm_hv,
2064        &vmstate_spapr_dtb,
2065        &vmstate_spapr_cap_large_decr,
2066        &vmstate_spapr_cap_ccf_assist,
2067        &vmstate_spapr_cap_fwnmi,
2068        &vmstate_spapr_fwnmi,
2069        &vmstate_spapr_cap_rpt_invalidate,
2070        NULL
2071    }
2072};
2073
2074static int htab_save_setup(QEMUFile *f, void *opaque)
2075{
2076    SpaprMachineState *spapr = opaque;
2077
2078    /* "Iteration" header */
2079    if (!spapr->htab_shift) {
2080        qemu_put_be32(f, -1);
2081    } else {
2082        qemu_put_be32(f, spapr->htab_shift);
2083    }
2084
2085    if (spapr->htab) {
2086        spapr->htab_save_index = 0;
2087        spapr->htab_first_pass = true;
2088    } else {
2089        if (spapr->htab_shift) {
2090            assert(kvm_enabled());
2091        }
2092    }
2093
2095    return 0;
2096}
2097
2098static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
2099                            int chunkstart, int n_valid, int n_invalid)
2100{
2101    qemu_put_be32(f, chunkstart);
2102    qemu_put_be16(f, n_valid);
2103    qemu_put_be16(f, n_invalid);
2104    qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
2105                    HASH_PTE_SIZE_64 * n_valid);
2106}
2107
2108static void htab_save_end_marker(QEMUFile *f)
2109{
2110    qemu_put_be32(f, 0);
2111    qemu_put_be16(f, 0);
2112    qemu_put_be16(f, 0);
2113}
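
/*
 * For reference, the stream format produced by the helpers above: each
 * section begins with a be32 header (the htab shift in the setup
 * section, 0 for an ordinary section, or -1 when there is no hash
 * table to migrate), followed by chunks of
 *
 *     be32 index      first HPTE slot covered by the chunk
 *     be16 n_valid    count of valid HPTEs, whose bodies follow
 *     be16 n_invalid  count of invalidated HPTEs after them (no body)
 *     n_valid * HASH_PTE_SIZE_64 bytes of HPTE contents
 *
 * and terminated by an all-zero end marker, as htab_load() below
 * expects.
 */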
2114
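/*
 * A QEMU-managed HPT is migrated live in two phases: a first pass that
 * walks the whole table and transmits every valid entry (clearing
 * dirty flags as it goes), and later passes that resend only entries
 * the guest has dirtied since they were last sent.  Both bound their
 * work by max_ns; -1 means no time limit and is used from
 * htab_save_complete().
 */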
2115static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
2116                                 int64_t max_ns)
2117{
2118    bool has_timeout = max_ns != -1;
2119    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2120    int index = spapr->htab_save_index;
2121    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2122
2123    assert(spapr->htab_first_pass);
2124
2125    do {
2126        int chunkstart;
2127
2128        /* Consume invalid HPTEs */
2129        while ((index < htabslots)
2130               && !HPTE_VALID(HPTE(spapr->htab, index))) {
2131            CLEAN_HPTE(HPTE(spapr->htab, index));
2132            index++;
2133        }
2134
2135        /* Consume valid HPTEs */
2136        chunkstart = index;
2137        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2138               && HPTE_VALID(HPTE(spapr->htab, index))) {
2139            CLEAN_HPTE(HPTE(spapr->htab, index));
2140            index++;
2141        }
2142
2143        if (index > chunkstart) {
2144            int n_valid = index - chunkstart;
2145
2146            htab_save_chunk(f, spapr, chunkstart, n_valid, 0);
2147
2148            if (has_timeout &&
2149                (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2150                break;
2151            }
2152        }
2153    } while ((index < htabslots) && !qemu_file_rate_limit(f));
2154
2155    if (index >= htabslots) {
2156        assert(index == htabslots);
2157        index = 0;
2158        spapr->htab_first_pass = false;
2159    }
2160    spapr->htab_save_index = index;
2161}
2162
2163static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
2164                                int64_t max_ns)
2165{
2166    bool final = max_ns < 0;
2167    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2168    int examined = 0, sent = 0;
2169    int index = spapr->htab_save_index;
2170    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2171
2172    assert(!spapr->htab_first_pass);
2173
2174    do {
2175        int chunkstart, invalidstart;
2176
2177        /* Consume non-dirty HPTEs */
2178        while ((index < htabslots)
2179               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
2180            index++;
2181            examined++;
2182        }
2183
2184        chunkstart = index;
2185        /* Consume valid dirty HPTEs */
2186        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2187               && HPTE_DIRTY(HPTE(spapr->htab, index))
2188               && HPTE_VALID(HPTE(spapr->htab, index))) {
2189            CLEAN_HPTE(HPTE(spapr->htab, index));
2190            index++;
2191            examined++;
2192        }
2193
2194        invalidstart = index;
2195        /* Consume invalid dirty HPTEs */
2196        while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
2197               && HPTE_DIRTY(HPTE(spapr->htab, index))
2198               && !HPTE_VALID(HPTE(spapr->htab, index))) {
2199            CLEAN_HPTE(HPTE(spapr->htab, index));
2200            index++;
2201            examined++;
2202        }
2203
2204        if (index > chunkstart) {
2205            int n_valid = invalidstart - chunkstart;
2206            int n_invalid = index - invalidstart;
2207
2208            htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
2209            sent += index - chunkstart;
2210
2211            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2212                break;
2213            }
2214        }
2215
2216        if (examined >= htabslots) {
2217            break;
2218        }
2219
2220        if (index >= htabslots) {
2221            assert(index == htabslots);
2222            index = 0;
2223        }
2224    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
2225
2226    if (index >= htabslots) {
2227        assert(index == htabslots);
2228        index = 0;
2229    }
2230
2231    spapr->htab_save_index = index;
2232
2233    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
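    /*
     * A return of 1 reports that the whole table was examined with
     * nothing left to send; htab_save_iterate() passes this on to the
     * migration core as "this device has finished iterating".
     */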
2234}
2235
2236#define MAX_ITERATION_NS    5000000 /* 5 ms */
2237#define MAX_KVM_BUF_SIZE    2048
2238
2239static int htab_save_iterate(QEMUFile *f, void *opaque)
2240{
2241    SpaprMachineState *spapr = opaque;
2242    int fd;
2243    int rc = 0;
2244
2245    /* Iteration header */
2246    if (!spapr->htab_shift) {
2247        qemu_put_be32(f, -1);
2248        return 1;
2249    } else {
2250        qemu_put_be32(f, 0);
2251    }
2252
2253    if (!spapr->htab) {
2254        assert(kvm_enabled());
2255
2256        fd = get_htab_fd(spapr);
2257        if (fd < 0) {
2258            return fd;
2259        }
2260
2261        rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
2262        if (rc < 0) {
2263            return rc;
2264        }
2265    } else if (spapr->htab_first_pass) {
2266        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
2267    } else {
2268        rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
2269    }
2270
2271    htab_save_end_marker(f);
2272
2273    return rc;
2274}
2275
2276static int htab_save_complete(QEMUFile *f, void *opaque)
2277{
2278    SpaprMachineState *spapr = opaque;
2279    int fd;
2280
2281    /* Iteration header */
2282    if (!spapr->htab_shift) {
2283        qemu_put_be32(f, -1);
2284        return 0;
2285    } else {
2286        qemu_put_be32(f, 0);
2287    }
2288
2289    if (!spapr->htab) {
2290        int rc;
2291
2292        assert(kvm_enabled());
2293
2294        fd = get_htab_fd(spapr);
2295        if (fd < 0) {
2296            return fd;
2297        }
2298
2299        rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
2300        if (rc < 0) {
2301            return rc;
2302        }
2303    } else {
2304        if (spapr->htab_first_pass) {
2305            htab_save_first_pass(f, spapr, -1);
2306        }
2307        htab_save_later_pass(f, spapr, -1);
2308    }
2309
2310    /* End marker */
2311    htab_save_end_marker(f);
2312
2313    return 0;
2314}
2315
2316static int htab_load(QEMUFile *f, void *opaque, int version_id)
2317{
2318    SpaprMachineState *spapr = opaque;
2319    uint32_t section_hdr;
2320    int fd = -1;
2321    Error *local_err = NULL;
2322
2323    if (version_id < 1 || version_id > 1) {
2324        error_report("htab_load() bad version");
2325        return -EINVAL;
2326    }
2327
2328    section_hdr = qemu_get_be32(f);
2329
2330    if (section_hdr == -1) {
2331        spapr_free_hpt(spapr);
2332        return 0;
2333    }
2334
2335    if (section_hdr) {
2336        int ret;
2337
2338        /* First section gives the htab size */
2339        ret = spapr_reallocate_hpt(spapr, section_hdr, &local_err);
2340        if (ret < 0) {
2341            error_report_err(local_err);
2342            return ret;
2343        }
2344        return 0;
2345    }
2346
2347    if (!spapr->htab) {
2348        assert(kvm_enabled());
2349
2350        fd = kvmppc_get_htab_fd(true, 0, &local_err);
2351        if (fd < 0) {
2352            error_report_err(local_err);
2353            return fd;
2354        }
2355    }
2356
2357    while (true) {
2358        uint32_t index;
2359        uint16_t n_valid, n_invalid;
2360
2361        index = qemu_get_be32(f);
2362        n_valid = qemu_get_be16(f);
2363        n_invalid = qemu_get_be16(f);
2364
2365        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
2366            /* End of Stream */
2367            break;
2368        }
2369
2370        if ((index + n_valid + n_invalid) >
2371            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
2372            /* Bad index in stream */
2373            error_report(
2374                "htab_load() bad index %u (%hu+%hu entries) in htab stream (htab_shift=%d)",
2375                index, n_valid, n_invalid, spapr->htab_shift);
2376            return -EINVAL;
2377        }
2378
2379        if (spapr->htab) {
2380            if (n_valid) {
2381                qemu_get_buffer(f, HPTE(spapr->htab, index),
2382                                HASH_PTE_SIZE_64 * n_valid);
2383            }
2384            if (n_invalid) {
2385                memset(HPTE(spapr->htab, index + n_valid), 0,
2386                       HASH_PTE_SIZE_64 * n_invalid);
2387            }
2388        } else {
2389            int rc;
2390
2391            assert(fd >= 0);
2392
2393            rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid,
2394                                        &local_err);
2395            if (rc < 0) {
2396                error_report_err(local_err);
2397                return rc;
2398            }
2399        }
2400    }
2401
2402    if (!spapr->htab) {
2403        assert(fd >= 0);
2404        close(fd);
2405    }
2406
2407    return 0;
2408}
2409
2410static void htab_save_cleanup(void *opaque)
2411{
2412    SpaprMachineState *spapr = opaque;
2413
2414    close_htab_fd(spapr);
2415}
2416
2417static SaveVMHandlers savevm_htab_handlers = {
2418    .save_setup = htab_save_setup,
2419    .save_live_iterate = htab_save_iterate,
2420    .save_live_complete_precopy = htab_save_complete,
2421    .save_cleanup = htab_save_cleanup,
2422    .load_state = htab_load,
2423};
2424
2425static void spapr_boot_set(void *opaque, const char *boot_device,
2426                           Error **errp)
2427{
2428    SpaprMachineState *spapr = SPAPR_MACHINE(opaque);
2429
2430    g_free(spapr->boot_device);
2431    spapr->boot_device = g_strdup(boot_device);
2432}
2433
2434static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
2435{
2436    MachineState *machine = MACHINE(spapr);
2437    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
2438    uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size) / lmb_size;
2439    int i;
2440
2441    for (i = 0; i < nr_lmbs; i++) {
2442        uint64_t addr;
2443
2444        addr = i * lmb_size + machine->device_memory->base;
2445        spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
2446                               addr / lmb_size);
2447    }
2448}
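
/*
 * Sketch of the numbers involved (illustrative values): with
 * -m 4G,maxmem=68G and the 256 MiB SPAPR_MEMORY_BLOCK_SIZE, this
 * creates (68G - 4G) / 256M = 256 LMB connectors.  If device memory
 * were based right at 4 GiB, the first connector would get DRC index
 * 4G / 256M = 16, counting up from there.
 */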
2449
2450/*
2451 * If RAM size, maxmem size and individual node mem sizes aren't aligned
2452 * to SPAPR_MEMORY_BLOCK_SIZE (256 MiB), then refuse to start the guest
2453 * since we can't support such unaligned sizes with DRCONF_MEMORY.
2454 */
2455static void spapr_validate_node_memory(MachineState *machine, Error **errp)
2456{
2457    int i;
2458
2459    if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2460        error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
2461                   " is not aligned to %" PRIu64 " MiB",
2462                   machine->ram_size,
2463                   SPAPR_MEMORY_BLOCK_SIZE / MiB);
2464        return;
2465    }
2466
2467    if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2468        error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
2469                   " is not aligned to %" PRIu64 " MiB",
2470                   machine->maxram_size,
2471                   SPAPR_MEMORY_BLOCK_SIZE / MiB);
2472        return;
2473    }
2474
2475    for (i = 0; i < machine->numa_state->num_nodes; i++) {
2476        if (machine->numa_state->nodes[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
2477            error_setg(errp,
2478                       "Node %d memory size 0x%" PRIx64
2479                       " is not aligned to %" PRIu64 " MiB",
2480                       i, machine->numa_state->nodes[i].node_mem,
2481                       SPAPR_MEMORY_BLOCK_SIZE / MiB);
2482            return;
2483        }
2484    }
2485}
2486
2487/* find cpu slot in machine->possible_cpus by core_id */
2488static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
2489{
2490    int index = id / ms->smp.threads;
2491
2492    if (index >= ms->possible_cpus->len) {
2493        return NULL;
2494    }
2495    if (idx) {
2496        *idx = index;
2497    }
2498    return &ms->possible_cpus->cpus[index];
2499}
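
/*
 * For example (illustrative): with ms->smp.threads == 8, core_id 24
 * lands in possible_cpus->cpus[3]; a core_id past the last possible
 * core yields NULL.
 */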
2500
2501static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
2502{
2503    MachineState *ms = MACHINE(spapr);
2504    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2505    Error *local_err = NULL;
2506    bool vsmt_user = !!spapr->vsmt;
2507    int kvm_smt = kvmppc_smt_threads();
2508    int ret;
2509    unsigned int smp_threads = ms->smp.threads;
2510
2511    if (!kvm_enabled() && (smp_threads > 1)) {
2512        error_setg(errp, "TCG cannot support more than 1 thread/core "
2513                   "on a pseries machine");
2514        return;
2515    }
2516    if (!is_power_of_2(smp_threads)) {
2517        error_setg(errp, "Cannot support %d threads/core on a pseries "
2518                   "machine because it must be a power of 2", smp_threads);
2519        return;
2520    }
2521
2522    /* Determine the VSMT mode to use: */
2523    if (vsmt_user) {
2524        if (spapr->vsmt < smp_threads) {
2525            error_setg(errp, "Cannot support VSMT mode %d"
2526                       " because it must be >= threads/core (%d)",
2527                       spapr->vsmt, smp_threads);
2528            return;
2529        }
2530        /* In this case, spapr->vsmt has been set by the command line */
2531    } else if (!smc->smp_threads_vsmt) {
2532        /*
2533         * Default VSMT value is tricky, because we need it to be as
2534         * consistent as possible (for migration), but this requires
2535         * changing it for at least some existing cases.  We pick 8 as
2536         * the value that we'd get with KVM on POWER8, the
2537         * overwhelmingly common case in production systems.
2538         */
2539        spapr->vsmt = MAX(8, smp_threads);
2540    } else {
2541        spapr->vsmt = smp_threads;
2542    }
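
    /*
     * E.g. (illustrative): smp_threads == 4 with no explicit vsmt on a
     * machine type that leaves smp_threads_vsmt unset gives
     * spapr->vsmt = MAX(8, 4) = 8, while a machine type with
     * smp_threads_vsmt set would use 4 directly.
     */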
2543
2544    /* KVM: If necessary, set the SMT mode: */
2545    if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
2546        ret = kvmppc_set_smt_threads(spapr->vsmt);
2547        if (ret) {
2548            /* Looks like KVM isn't able to change VSMT mode */
2549            error_setg(&local_err,
2550                       "Failed to set KVM's VSMT mode to %d (errno %d)",
2551                       spapr->vsmt, ret);
2552            /* We can live with that if the default one is big enough
2553             * for the number of threads, and a submultiple of the one
2554             * we want.  In this case we'll waste some vcpu ids, but
2555             * behaviour will be correct */
2556            if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
2557                warn_report_err(local_err);
2558            } else {
2559                if (!vsmt_user) {
2560                    error_append_hint(&local_err,
2561                                      "On PPC, a VM with %d threads/core"
2562                                      " on a host with %d threads/core"
2563                                      " requires the use of VSMT mode %d.\n",
2564                                      smp_threads, kvm_smt, spapr->vsmt);
2565                }
2566                kvmppc_error_append_smt_possible_hint(&local_err);
2567                error_propagate(errp, local_err);
2568            }
2569        }
2570    }
2571    /* else TCG: nothing to do currently */
2572}
2573
2574static void spapr_init_cpus(SpaprMachineState *spapr)
2575{
2576    MachineState *machine = MACHINE(spapr);
2577    MachineClass *mc = MACHINE_GET_CLASS(machine);
2578    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2579    const char *type = spapr_get_cpu_core_type(machine->cpu_type);
2580    const CPUArchIdList *possible_cpus;
2581    unsigned int smp_cpus = machine->smp.cpus;
2582    unsigned int smp_threads = machine->smp.threads;
2583    unsigned int max_cpus = machine->smp.max_cpus;
2584    int boot_cores_nr = smp_cpus / smp_threads;
2585    int i;
2586
2587    possible_cpus = mc->possible_cpu_arch_ids(machine);
2588    if (mc->has_hotpluggable_cpus) {
2589        if (smp_cpus % smp_threads) {
2590            error_report("smp_cpus (%u) must be a multiple of threads (%u)",
2591                         smp_cpus, smp_threads);
2592            exit(1);
2593        }
2594        if (max_cpus % smp_threads) {
2595            error_report("max_cpus (%u) must be a multiple of threads (%u)",
2596                         max_cpus, smp_threads);
2597            exit(1);
2598        }
2599    } else {
2600        if (max_cpus != smp_cpus) {
2601            error_report("This machine version does not support CPU hotplug");
2602            exit(1);
2603        }
2604        boot_cores_nr = possible_cpus->len;
2605    }
2606
2607    if (smc->pre_2_10_has_unused_icps) {
2608        int i;
2609
2610        for (i = 0; i < spapr_max_server_number(spapr); i++) {
2611            /* Dummy entries get deregistered when real ICPState objects
2612             * are registered during CPU core hotplug.
2613             */
2614            pre_2_10_vmstate_register_dummy_icp(i);
2615        }
2616    }
2617
2618    for (i = 0; i < possible_cpus->len; i++) {
2619        int core_id = i * smp_threads;
2620
2621        if (mc->has_hotpluggable_cpus) {
2622            spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
2623                                   spapr_vcpu_id(spapr, core_id));
2624        }
2625
2626        if (i < boot_cores_nr) {
2627            Object *core  = object_new(type);
2628            int nr_threads = smp_threads;
2629
2630            /* Handle the partially filled core for older machine types */
2631            if ((i + 1) * smp_threads >= smp_cpus) {
2632                nr_threads = smp_cpus - i * smp_threads;
2633            }
2634
2635            object_property_set_int(core, "nr-threads", nr_threads,
2636                                    &error_fatal);
2637            object_property_set_int(core, CPU_CORE_PROP_CORE_ID, core_id,
2638                                    &error_fatal);
2639            qdev_realize(DEVICE(core), NULL, &error_fatal);
2640
2641            object_unref(core);
2642        }
2643    }
2644}
2645
2646static PCIHostState *spapr_create_default_phb(void)
2647{
2648    DeviceState *dev;
2649
2650    dev = qdev_new(TYPE_SPAPR_PCI_HOST_BRIDGE);
2651    qdev_prop_set_uint32(dev, "index", 0);
2652    sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
2653
2654    return PCI_HOST_BRIDGE(dev);
2655}
2656
2657static hwaddr spapr_rma_size(SpaprMachineState *spapr, Error **errp)
2658{
2659    MachineState *machine = MACHINE(spapr);
2660    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2661    hwaddr rma_size = machine->ram_size;
2662    hwaddr node0_size = spapr_node0_size(machine);
2663
2664    /* RMA has to fit in the first NUMA node */
2665    rma_size = MIN(rma_size, node0_size);
2666
2667    /*
2668     * VRMA access is via a special 1TiB SLB mapping, so the RMA can
2669     * never exceed that
2670     */
2671    rma_size = MIN(rma_size, 1 * TiB);
2672
2673    /*
2674     * Clamp the RMA size based on machine type.  This is for
2675     * migration compatibility with older qemu versions, which limited
2676     * the RMA size for complicated and mostly bad reasons.
2677     */
2678    if (smc->rma_limit) {
2679        rma_size = MIN(rma_size, smc->rma_limit);
2680    }
2681
2682    if (rma_size < MIN_RMA_SLOF) {
2683        error_setg(errp,
2684                   "pSeries SLOF firmware requires >= %" HWADDR_PRIx
2685                   "MiB guest RMA (Real Mode Area memory)",
2686                   MIN_RMA_SLOF / MiB);
2687        return 0;
2688    }
2689
2690    return rma_size;
2691}
2692
2693static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
2694{
2695    MachineState *machine = MACHINE(spapr);
2696    int i;
2697
2698    for (i = 0; i < machine->ram_slots; i++) {
2699        spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);
2700    }
2701}
2702
2703/* pSeries LPAR / sPAPR hardware init */
2704static void spapr_machine_init(MachineState *machine)
2705{
2706    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
2707    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2708    MachineClass *mc = MACHINE_GET_CLASS(machine);
2709    const char *bios_default = spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
2710    const char *bios_name = machine->firmware ?: bios_default;
2711    g_autofree char *filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
2712    const char *kernel_filename = machine->kernel_filename;
2713    const char *initrd_filename = machine->initrd_filename;
2714    PCIHostState *phb;
2715    int i;
2716    MemoryRegion *sysmem = get_system_memory();
2717    long load_limit, fw_size;
2718    Error *resize_hpt_err = NULL;
2719
2720    if (!filename) {
2721        error_report("Could not find LPAR firmware '%s'", bios_name);
2722        exit(1);
2723    }
2724    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
2725    if (fw_size <= 0) {
2726        error_report("Could not load LPAR firmware '%s'", filename);
2727        exit(1);
2728    }
2729
2730    /*
2731     * if Secure VM (PEF) support is configured, then initialize it
2732     */
2733    pef_kvm_init(machine->cgs, &error_fatal);
2734
2735    msi_nonbroken = true;
2736
2737    QLIST_INIT(&spapr->phbs);
2738    QTAILQ_INIT(&spapr->pending_dimm_unplugs);
2739
2740    /* Determine capabilities to run with */
2741    spapr_caps_init(spapr);
2742
2743    kvmppc_check_papr_resize_hpt(&resize_hpt_err);
2744    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
2745        /*
2746         * If the user explicitly requested a mode we should either
2747         * supply it, or fail completely (which we do below).  But if
2748         * it's not set explicitly, we reset our mode to something
2749         * that works
2750         */
2751        if (resize_hpt_err) {
2752            spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
2753            error_free(resize_hpt_err);
2754            resize_hpt_err = NULL;
2755        } else {
2756            spapr->resize_hpt = smc->resize_hpt_default;
2757        }
2758    }
2759
2760    assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);
2761
2762    if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
2763        /*
2764         * User requested HPT resize, but this host can't supply it.  Bail out
2765         */
2766        error_report_err(resize_hpt_err);
2767        exit(1);
2768    }
2769    error_free(resize_hpt_err);
2770
2771    spapr->rma_size = spapr_rma_size(spapr, &error_fatal);
2772
2773    /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
2774    load_limit = MIN(spapr->rma_size, FDT_MAX_ADDR) - FW_OVERHEAD;
2775
2776    /*
2777     * VSMT must be set in order to be able to compute VCPU ids, i.e. to
2778     * call spapr_max_server_number() or spapr_vcpu_id().
2779     */
2780    spapr_set_vsmt_mode(spapr, &error_fatal);
2781
2782    /* Set up Interrupt Controller before we create the VCPUs */
2783    spapr_irq_init(spapr, &error_fatal);
2784
2785    /* Set up containers for ibm,client-architecture-support negotiated options
2786     */
2787    spapr->ov5 = spapr_ovec_new();
2788    spapr->ov5_cas = spapr_ovec_new();
2789
2790    if (smc->dr_lmb_enabled) {
2791        spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
2792        spapr_validate_node_memory(machine, &error_fatal);
2793    }
2794
2795    spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
2796
2797    /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */
2798    if (!smc->pre_6_2_numa_affinity) {
2799        spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY);
2800    }
2801
2802    /* advertise support for dedicated HP event source to guests */
2803    if (spapr->use_hotplug_event_source) {
2804        spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
2805    }
2806
2807    /* advertise support for HPT resizing */
2808    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
2809        spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
2810    }
2811
2812    /* advertise support for ibm,dynamic-memory-v2 */
2813    spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
2814
2815    /* advertise XIVE on POWER9 machines */
2816    if (spapr->irq->xive) {
2817        spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
2818    }
2819
2820    /* init CPUs */
2821    spapr_init_cpus(spapr);
2822
2823    spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
2824
2825    /* Init numa_assoc_array */
2826    spapr_numa_associativity_init(spapr, machine);
2827
2828    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
2829        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
2830                              spapr->max_compat_pvr)) {
2831        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
2832        /* KVM and TCG always allow GTSE with radix... */
2833        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
2834    }
2835    /* ... but not with hash (currently). */
2836
2837    if (kvm_enabled()) {
2838        /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
2839        kvmppc_enable_logical_ci_hcalls();
2840        kvmppc_enable_set_mode_hcall();
2841
2842        /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
2843        kvmppc_enable_clear_ref_mod_hcalls();
2844
2845        /* Enable H_PAGE_INIT */
2846        kvmppc_enable_h_page_init();
2847    }
2848
2849    /* map RAM */
2850    memory_region_add_subregion(sysmem, 0, machine->ram);
2851
2852    /* always allocate the device memory information */
2853    machine->device_memory = g_malloc0(sizeof(*machine->device_memory));
2854
2855    /* initialize hotplug memory address space */
2856    if (machine->ram_size < machine->maxram_size) {
2857        ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
2858        /*
2859         * Limit the number of hotpluggable memory slots to half the number
2860         * of slots that KVM supports, leaving the other half for PCI and other
2861         * devices. However ensure that number of slots doesn't drop below 32.
2862         */
2863        int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
2864                           SPAPR_MAX_RAM_SLOTS;
2865
2866        if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
2867            max_memslots = SPAPR_MAX_RAM_SLOTS;
2868        }
2869        if (machine->ram_slots > max_memslots) {
2870            error_report("Specified number of memory slots %"
2871                         PRIu64" exceeds max supported %d",
2872                         machine->ram_slots, max_memslots);
2873            exit(1);
2874        }
2875
2876        machine->device_memory->base = ROUND_UP(machine->ram_size,
2877                                                SPAPR_DEVICE_MEM_ALIGN);
2878        memory_region_init(&machine->device_memory->mr, OBJECT(spapr),
2879                           "device-memory", device_mem_size);
2880        memory_region_add_subregion(sysmem, machine->device_memory->base,
2881                                    &machine->device_memory->mr);
2882    }
2883
2884    if (smc->dr_lmb_enabled) {
2885        spapr_create_lmb_dr_connectors(spapr);
2886    }
2887
2888    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_ON) {
2889        /* Create the error string for live migration blocker */
2890        error_setg(&spapr->fwnmi_migration_blocker,
2891            "A machine check is being handled during migration. The handler "
2892            "may run and log a hardware error on the destination");
2893    }
2894
2895    if (mc->nvdimm_supported) {
2896        spapr_create_nvdimm_dr_connectors(spapr);
2897    }
2898
2899    /* Set up RTAS event infrastructure */
2900    spapr_events_init(spapr);
2901
2902    /* Set up the RTC RTAS interfaces */
2903    spapr_rtc_create(spapr);
2904
2905    /* Set up VIO bus */
2906    spapr->vio_bus = spapr_vio_bus_init();
2907
2908    for (i = 0; serial_hd(i); i++) {
2909        spapr_vty_create(spapr->vio_bus, serial_hd(i));
2910    }
2911
2912    /* We always have at least the nvram device on VIO */
2913    spapr_create_nvram(spapr);
2914
2915    /*
2916     * Set up hotplug / dynamic-reconfiguration connectors. Top-level
2917     * connectors (described in the root DT node's "ibm,drc-types" property)
2918     * are pre-initialized here. Additional child connectors (such as
2919     * connectors for a PHB's PCI slots) are added as needed during their
2920     * parent's realization.
2921     */
2922    if (smc->dr_phb_enabled) {
2923        for (i = 0; i < SPAPR_MAX_PHBS; i++) {
2924            spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
2925        }
2926    }
2927
2928    /* Set up PCI */
2929    spapr_pci_rtas_init();
2930
2931    phb = spapr_create_default_phb();
2932
2933    for (i = 0; i < nb_nics; i++) {
2934        NICInfo *nd = &nd_table[i];
2935
2936        if (!nd->model) {
2937            nd->model = g_strdup("spapr-vlan");
2938        }
2939
2940        if (g_str_equal(nd->model, "spapr-vlan") ||
2941            g_str_equal(nd->model, "ibmveth")) {
2942            spapr_vlan_create(spapr->vio_bus, nd);
2943        } else {
2944            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
2945        }
2946    }
2947
2948    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
2949        spapr_vscsi_create(spapr->vio_bus);
2950    }
2951
2952    /* Graphics */
2953    if (spapr_vga_init(phb->bus, &error_fatal)) {
2954        spapr->has_graphics = true;
2955        machine->usb |= defaults_enabled() && !machine->usb_disabled;
2956    }
2957
2958    if (machine->usb) {
2959        if (smc->use_ohci_by_default) {
2960            pci_create_simple(phb->bus, -1, "pci-ohci");
2961        } else {
2962            pci_create_simple(phb->bus, -1, "nec-usb-xhci");
2963        }
2964
2965        if (spapr->has_graphics) {
2966            USBBus *usb_bus = usb_bus_find(-1);
2967
2968            usb_create_simple(usb_bus, "usb-kbd");
2969            usb_create_simple(usb_bus, "usb-mouse");
2970        }
2971    }
2972
2973    if (kernel_filename) {
2974        spapr->kernel_size = load_elf(kernel_filename, NULL,
2975                                      translate_kernel_address, spapr,
2976                                      NULL, NULL, NULL, NULL, 1,
2977                                      PPC_ELF_MACHINE, 0, 0);
2978        if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
2979            spapr->kernel_size = load_elf(kernel_filename, NULL,
2980                                          translate_kernel_address, spapr,
2981                                          NULL, NULL, NULL, NULL, 0,
2982                                          PPC_ELF_MACHINE, 0, 0);
2983            spapr->kernel_le = spapr->kernel_size > 0;
2984        }
2985        if (spapr->kernel_size < 0) {
2986            error_report("error loading %s: %s", kernel_filename,
2987                         load_elf_strerror(spapr->kernel_size));
2988            exit(1);
2989        }
2990
2991        /* load initrd */
2992        if (initrd_filename) {
2993            /* Try to locate the initrd in the gap between the kernel
2994             * and the firmware. Add a bit of space just in case
2995             */
2996            spapr->initrd_base = (spapr->kernel_addr + spapr->kernel_size
2997                                  + 0x1ffff) & ~0xffff;
2998            spapr->initrd_size = load_image_targphys(initrd_filename,
2999                                                     spapr->initrd_base,
3000                                                     load_limit
3001                                                     - spapr->initrd_base);
3002            if (spapr->initrd_size < 0) {
3003                error_report("could not load initial ram disk '%s'",
3004                             initrd_filename);
3005                exit(1);
3006            }
3007        }
3008    }
3009
3010    /* FIXME: Should register things through the MachineState's qdev
3011     * interface, this is a legacy from the sPAPREnvironment structure
3012     * which predated MachineState but had a similar function */
3013    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
3014    register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
3015                         &savevm_htab_handlers, spapr);
3016
3017    qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine));
3018
3019    qemu_register_boot_set(spapr_boot_set, spapr);
3020
3021    /*
3022     * Nothing needs to be done to resume a suspended guest because
3023     * suspending does not change the machine state, so no need for
3024     * a ->wakeup method.
3025     */
3026    qemu_register_wakeup_support();
3027
3028    if (kvm_enabled()) {
3029        /* to stop and start vmclock */
3030        qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
3031                                         &spapr->tb);
3032
3033        kvmppc_spapr_enable_inkernel_multitce();
3034    }
3035
3036    qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
3037    if (spapr->vof) {
3038        spapr->vof->fw_size = fw_size; /* for claim() on itself */
3039        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
3040    }
3041}
3042
3043#define DEFAULT_KVM_TYPE "auto"
3044static int spapr_kvm_type(MachineState *machine, const char *vm_type)
3045{
3046    /*
3047     * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to
3048     * accommodate the 'HV' and 'PR' formats that exist in the
3049     * wild. The 'auto' mode is introduced as lower-case only,
3050     * so we don't need to bother checking for
3051     * "AUTO".
3052     */
3053    if (!vm_type || !strcmp(vm_type, DEFAULT_KVM_TYPE)) {
3054        return 0;
3055    }
3056
3057    if (!g_ascii_strcasecmp(vm_type, "hv")) {
3058        return 1;
3059    }
3060
3061    if (!g_ascii_strcasecmp(vm_type, "pr")) {
3062        return 2;
3063    }
3064
3065    error_report("Unknown kvm-type specified '%s'", vm_type);
3066    exit(1);
3067}
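
/*
 * The value returned above is what the generic KVM code hands to the
 * KVM_CREATE_VM ioctl as its type argument; on Book3S hosts, 1 and 2
 * correspond to the kernel's KVM_VM_PPC_HV and KVM_VM_PPC_PR (a kernel
 * ABI detail, not defined by this file).
 */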
3068
3069/*
3070 * Implementation of an interface to adjust firmware path
3071 * for the bootindex property handling.
3072 */
3073static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
3074                                   DeviceState *dev)
3075{
3076#define CAST(type, obj, name) \
3077    ((type *)object_dynamic_cast(OBJECT(obj), (name)))
3078    SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
3079    SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
3080    VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
3081    PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3082
3083    if (d && bus) {
3084        void *spapr = CAST(void, bus->parent, "spapr-vscsi");
3085        VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
3086        USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
3087
3088        if (spapr) {
3089            /*
3090             * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
3091             * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
3092             * 0x8000 | (target << 8) | (bus << 5) | lun
3093             * (see the "Logical unit addressing format" table in SAM5)
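             * e.g. (illustration) target 1, bus 0, lun 0 encodes as
             * 0x8100, giving a path like "disk@8100000000000000"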
3094             */
3095            unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
3096            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3097                                   (uint64_t)id << 48);
3098        } else if (virtio) {
3099            /*
3100             * We use SRP luns of the form 01000000 | (target << 8) | lun
3101             * in the top 32 bits of the 64-bit LUN
3102             * Note: the quote above is from SLOF and it is wrong,
3103             * the actual binding is:
3104             * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
3105             */
3106            unsigned id = 0x1000000 | (d->id << 16) | d->lun;
3107            if (d->lun >= 256) {
3108                /* Use the LUN "flat space addressing method" */
3109                id |= 0x4000;
3110            }
3111            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3112                                   (uint64_t)id << 32);
3113        } else if (usb) {
3114            /*
3115             * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
3116             * in the top 32 bits of the 64-bit LUN
3117             */
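                /*
                 * E.g. (illustrative): usb-port path "1", lun 0 gives
                 * id = 0x1000000 | (1 << 16) = 0x1010000, hence
                 * "disk@101000000000000".
                 */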
3118            unsigned usb_port = atoi(usb->port->path);
3119            unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
3120            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3121                                   (uint64_t)id << 32);
3122        }
3123    }
3124
3125    /*
3126     * SLOF probes the USB devices, and if it recognizes that the device is a
3127     * storage device, it changes its name to "storage" instead of "usb-host",
3128     * and additionally adds a child node for the SCSI LUN, so the correct
3129     * boot path in SLOF is something like ".../storage@1/disk@xxx" instead.
3130     */
3131    if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
3132        USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
3133        if (usb_device_is_scsi_storage(usbdev)) {
3134            return g_strdup_printf("storage@%s/disk", usbdev->port->path);
3135        }
3136    }
3137
3138    if (phb) {
3139        /* Replace "pci" with "pci@800000020000000" */
3140        return g_strdup_printf("pci@%"PRIX64, phb->buid);
3141    }
3142
3143    if (vsc) {
3144        /* Same logic as virtio above */
3145        unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
3146        return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
3147    }
3148
3149    if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
3150        /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
3151        PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3152        return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
3153    }
3154
3155    if (pcidev) {
3156        return spapr_pci_fw_dev_name(pcidev);
3157    }
3158
3159    return NULL;
3160}
3161
3162static char *spapr_get_kvm_type(Object *obj, Error **errp)
3163{
3164    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3165
3166    return g_strdup(spapr->kvm_type);
3167}
3168
3169static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
3170{
3171    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3172
3173    g_free(spapr->kvm_type);
3174    spapr->kvm_type = g_strdup(value);
3175}
3176
3177static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
3178{
3179    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3180
3181    return spapr->use_hotplug_event_source;
3182}
3183
3184static void spapr_set_modern_hotplug_events(Object *obj, bool value,
3185                                            Error **errp)
3186{
3187    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3188
3189    spapr->use_hotplug_event_source = value;
3190}
3191
3192static bool spapr_get_msix_emulation(Object *obj, Error **errp)
3193{
3194    return true;
3195}
3196
3197static char *spapr_get_resize_hpt(Object *obj, Error **errp)
3198{
3199    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3200
3201    switch (spapr->resize_hpt) {
3202    case SPAPR_RESIZE_HPT_DEFAULT:
3203        return g_strdup("default");
3204    case SPAPR_RESIZE_HPT_DISABLED:
3205        return g_strdup("disabled");
3206    case SPAPR_RESIZE_HPT_ENABLED:
3207        return g_strdup("enabled");
3208    case SPAPR_RESIZE_HPT_REQUIRED:
3209        return g_strdup("required");
3210    }
3211    g_assert_not_reached();
3212}
3213
3214static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
3215{
3216    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3217
3218    if (strcmp(value, "default") == 0) {
3219        spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
3220    } else if (strcmp(value, "disabled") == 0) {
3221        spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
3222    } else if (strcmp(value, "enabled") == 0) {
3223        spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
3224    } else if (strcmp(value, "required") == 0) {
3225        spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
3226    } else {
3227        error_setg(errp, "Bad value for \"resize-hpt\" property");
3228    }
3229}
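    /*
     * Usage sketch (illustrative): "-machine pseries,resize-hpt=required"
     * selects SPAPR_RESIZE_HPT_REQUIRED, making HPT resizing support
     * mandatory; any other string is rejected with the error above.
     */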
3230
3231static bool spapr_get_vof(Object *obj, Error **errp)
3232{
3233    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3234
3235    return spapr->vof != NULL;
3236}
3237
3238static void spapr_set_vof(Object *obj, bool value, Error **errp)
3239{
3240    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3241
3242    if (spapr->vof) {
3243        vof_cleanup(spapr->vof);
3244        g_free(spapr->vof);
3245        spapr->vof = NULL;
3246    }
3247    if (!value) {
3248        return;
3249    }
3250    spapr->vof = g_malloc0(sizeof(*spapr->vof));
3251}
3252
3253static char *spapr_get_ic_mode(Object *obj, Error **errp)
3254{
3255    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3256
3257    if (spapr->irq == &spapr_irq_xics_legacy) {
3258        return g_strdup("legacy");
3259    } else if (spapr->irq == &spapr_irq_xics) {
3260        return g_strdup("xics");
3261    } else if (spapr->irq == &spapr_irq_xive) {
3262        return g_strdup("xive");
3263    } else if (spapr->irq == &spapr_irq_dual) {
3264        return g_strdup("dual");
3265    }
3266    g_assert_not_reached();
3267}
3268
3269static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
3270{
3271    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3272
3273    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
3274        error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
3275        return;
3276    }
3277
3278    /* The legacy XICS backend cannot be selected via ic-mode */
3279    if (strcmp(value, "xics") == 0) {
3280        spapr->irq = &spapr_irq_xics;
3281    } else if (strcmp(value, "xive") == 0) {
3282        spapr->irq = &spapr_irq_xive;
3283    } else if (strcmp(value, "dual") == 0) {
3284        spapr->irq = &spapr_irq_dual;
3285    } else {
3286        error_setg(errp, "Bad value for \"ic-mode\" property");
3287    }
3288}
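    /*
     * Usage sketch (illustrative): "-machine pseries,ic-mode=xive"
     * selects the XIVE-only backend, while the default "dual" backend
     * lets the guest negotiate XICS or XIVE at CAS time.
     */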
3289
3290static char *spapr_get_host_model(Object *obj, Error **errp)
3291{
3292    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3293
3294    return g_strdup(spapr->host_model);
3295}
3296
3297static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
3298{
3299    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3300
3301    g_free(spapr->host_model);
3302    spapr->host_model = g_strdup(value);
3303}
3304
3305static char *spapr_get_host_serial(Object *obj, Error **errp)
3306{
3307    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3308
3309    return g_strdup(spapr->host_serial);
3310}
3311
3312static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
3313{
3314    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3315
3316    g_free(spapr->host_serial);
3317    spapr->host_serial = g_strdup(value);
3318}
3319
3320static void spapr_instance_init(Object *obj)
3321{
3322    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3323    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3324    MachineState *ms = MACHINE(spapr);
3325    MachineClass *mc = MACHINE_GET_CLASS(ms);
3326
3327    /*
3328     * NVDIMM support went live in 5.1 without considering that, in
3329     * other archs, the user needs to enable NVDIMM support with the
3330     * 'nvdimm' machine option and the default behavior is NVDIMM
3331     * support disabled. It is too late to roll back to the standard
3332     * behavior without breaking 5.1 guests.
3333     */
3334    if (mc->nvdimm_supported) {
3335        ms->nvdimms_state->is_enabled = true;
3336    }
3337
3338    spapr->htab_fd = -1;
3339    spapr->use_hotplug_event_source = true;
3340    spapr->kvm_type = g_strdup(DEFAULT_KVM_TYPE);
3341    object_property_add_str(obj, "kvm-type",
3342                            spapr_get_kvm_type, spapr_set_kvm_type);
3343    object_property_set_description(obj, "kvm-type",
3344                                    "Specifies the KVM virtualization mode (auto,"
3345                                    " hv, pr). Defaults to 'auto', which uses any"
3346                                    " available KVM module loaded in the host, with"
3347                                    " kvm_hv taking precedence if both kvm_hv and"
3348                                    " kvm_pr are loaded.");
3349    object_property_add_bool(obj, "modern-hotplug-events",
3350                            spapr_get_modern_hotplug_events,
3351                            spapr_set_modern_hotplug_events);
3352    object_property_set_description(obj, "modern-hotplug-events",
3353                                    "Use dedicated hotplug event mechanism in"
3354                                    " place of standard EPOW events when possible"
3355                                    " (required for memory hot-unplug support)");
3356    ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
3357                            "Maximum permitted CPU compatibility mode");
3358
3359    object_property_add_str(obj, "resize-hpt",
3360                            spapr_get_resize_hpt, spapr_set_resize_hpt);
3361    object_property_set_description(obj, "resize-hpt",
3362                                    "Resizing of the Hash Page Table (default, enabled, disabled, required)");
3363    object_property_add_uint32_ptr(obj, "vsmt",
3364                                   &spapr->vsmt, OBJ_PROP_FLAG_READWRITE);
3365    object_property_set_description(obj, "vsmt",
3366                                    "Virtual SMT: KVM behaves as if this were"
3367                                    " the host's SMT mode");
3368
3369    object_property_add_bool(obj, "vfio-no-msix-emulation",
3370                             spapr_get_msix_emulation, NULL);
3371
3372    object_property_add_uint64_ptr(obj, "kernel-addr",
3373                                   &spapr->kernel_addr, OBJ_PROP_FLAG_READWRITE);
3374    object_property_set_description(obj, "kernel-addr",
3375                                    stringify(KERNEL_LOAD_ADDR)
3376                                    " for -kernel is the default");
3377    spapr->kernel_addr = KERNEL_LOAD_ADDR;
3378
3379    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
3380    object_property_set_description(obj, "x-vof",
3381                                    "Enable Virtual Open Firmware (experimental)");
3382
3383    /* The machine class defines the default interrupt controller mode */
3384    spapr->irq = smc->irq;
3385    object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
3386                            spapr_set_ic_mode);
3387    object_property_set_description(obj, "ic-mode",
3388                 "Specifies the interrupt controller mode (xics, xive, dual)");
3389
3390    object_property_add_str(obj, "host-model",
3391        spapr_get_host_model, spapr_set_host_model);
3392    object_property_set_description(obj, "host-model",
3393        "Host model to advertise in guest device tree");
3394    object_property_add_str(obj, "host-serial",
3395        spapr_get_host_serial, spapr_set_host_serial);
3396    object_property_set_description(obj, "host-serial",
3397        "Host serial number to advertise in guest device tree");
3398}
3399
3400static void spapr_machine_finalizefn(Object *obj)
3401{
3402    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3403
3404    g_free(spapr->kvm_type);
3405}
3406
3407void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
3408{
3409    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
3410    PowerPCCPU *cpu = POWERPC_CPU(cs);
3411    CPUPPCState *env = &cpu->env;
3412
3413    cpu_synchronize_state(cs);
3414    /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */
3415    if (spapr->fwnmi_system_reset_addr != -1) {
3416        uint64_t rtas_addr, addr;
3417
3418        /* get rtas addr from fdt */
3419        rtas_addr = spapr_get_rtas_addr();
3420        if (!rtas_addr) {
3421            qemu_system_guest_panicked(NULL);
3422            return;
3423        }
3424
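            /*
             * Layout of the save area built below: after the RTAS
             * error log, each vCPU owns a 16-byte slot whose first 8
             * bytes keep the original r3 and whose last 8 bytes are
             * zeroed; r3 is then pointed at the slot for the FWNMI
             * vector to consume.
             */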
3425        addr = rtas_addr + RTAS_ERROR_LOG_MAX + cs->cpu_index * sizeof(uint64_t) * 2;
3426        stq_be_phys(&address_space_memory, addr, env->gpr[3]);
3427        stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0);
3428        env->gpr[3] = addr;
3429    }
3430    ppc_cpu_do_system_reset(cs);
3431    if (spapr->fwnmi_system_reset_addr != -1) {
3432        env->nip = spapr->fwnmi_system_reset_addr;
3433    }
3434}
3435
3436static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
3437{
3438    CPUState *cs;
3439
3440    CPU_FOREACH(cs) {
3441        async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
3442    }
3443}
3444
3445int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3446                          void *fdt, int *fdt_start_offset, Error **errp)
3447{
3448    uint64_t addr;
3449    uint32_t node;
3450
3451    addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
3452    node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
3453                                    &error_abort);
3454    *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr,
3455                                             SPAPR_MEMORY_BLOCK_SIZE);
3456    return 0;
3457}
3458
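    /*
     * A worked example (illustrative): plugging a 1 GiB DIMM yields
     * size / SPAPR_MEMORY_BLOCK_SIZE = 1 GiB / 256 MiB = 4 LMBs, so
     * four consecutive LMB DRCs are attached starting at the DIMM's
     * base address.
     */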
3459static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
3460                           bool dedicated_hp_event_source)
3461{
3462    SpaprDrc *drc;
3463    uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3464    int i;
3465    uint64_t addr = addr_start;
3466    bool hotplugged = spapr_drc_hotplugged(dev);
3467
3468    for (i = 0; i < nr_lmbs; i++) {
3469        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3470                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3471        g_assert(drc);
3472
3473        /*
3474         * memory_device_get_free_addr() provided a range of free addresses
3475         * that doesn't overlap with any existing mapping at pre-plug. The
3476         * corresponding LMB DRCs are thus assumed to be all attachable.
3477         */
3478        spapr_drc_attach(drc, dev);
3479        if (!hotplugged) {
3480            spapr_drc_reset(drc);
3481        }
3482        addr += SPAPR_MEMORY_BLOCK_SIZE;
3483    }
3484    /*
3485     * Send a hotplug notification to the guest only for hotplugged memory.
3486     */
3487    if (hotplugged) {
3488        if (dedicated_hp_event_source) {
3489            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3490                                  addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3491            g_assert(drc);
3492            spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3493                                                   nr_lmbs,
3494                                                   spapr_drc_index(drc));
3495        } else {
3496            spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
3497                                           nr_lmbs);
3498        }
3499    }
3500}
3501
3502static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3503{
3504    SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
3505    PCDIMMDevice *dimm = PC_DIMM(dev);
3506    uint64_t size, addr;
3507    int64_t slot;
3508    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3509
3510    size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
3511
3512    pc_dimm_plug(dimm, MACHINE(ms));
3513
3514    if (!is_nvdimm) {
3515        addr = object_property_get_uint(OBJECT(dimm),
3516                                        PC_DIMM_ADDR_PROP, &error_abort);
3517        spapr_add_lmbs(dev, addr, size,
3518                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT));
3519    } else {
3520        slot = object_property_get_int(OBJECT(dimm),
3521                                       PC_DIMM_SLOT_PROP, &error_abort);
3522        /* We should have a valid slot number at this point */
3523        g_assert(slot >= 0);
3524        spapr_add_nvdimm(dev, slot);
3525    }
3526}
3527
3528static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3529                                  Error **errp)
3530{
3531    const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
3532    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3533    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3534    PCDIMMDevice *dimm = PC_DIMM(dev);
3535    Error *local_err = NULL;
3536    uint64_t size;
3537    Object *memdev;
3538    hwaddr pagesize;
3539
3540    if (!smc->dr_lmb_enabled) {
3541        error_setg(errp, "Memory hotplug not supported for this machine");
3542        return;
3543    }
3544
3545    size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
3546    if (local_err) {
3547        error_propagate(errp, local_err);
3548        return;
3549    }
3550
3551    if (is_nvdimm) {
3552        if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) {
3553            return;
3554        }
3555    } else if (size % SPAPR_MEMORY_BLOCK_SIZE) {
3556        error_setg(errp, "Hotplugged memory size must be a multiple of "
3557                   "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
3558        return;
3559    }
3560
3561    memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
3562                                      &error_abort);
3563    pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
3564    if (!spapr_check_pagesize(spapr, pagesize, errp)) {
3565        return;
3566    }
3567
3568    pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), NULL, errp);
3569}
3570
3571struct SpaprDimmState {
3572    PCDIMMDevice *dimm;
3573    uint32_t nr_lmbs;
3574    QTAILQ_ENTRY(SpaprDimmState) next;
3575};
3576
3577static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
3578                                                       PCDIMMDevice *dimm)
3579{
3580    SpaprDimmState *dimm_state = NULL;
3581
3582    QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
3583        if (dimm_state->dimm == dimm) {
3584            break;
3585        }
3586    }
3587    return dimm_state;
3588}
3589
3590static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
3591                                                      uint32_t nr_lmbs,
3592                                                      PCDIMMDevice *dimm)
3593{
3594    SpaprDimmState *ds = NULL;
3595
3596    /*
3597     * If this request is for a DIMM whose removal had failed earlier
3598     * (due to guest's refusal to remove the LMBs), we would have this
3599     * dimm already in the pending_dimm_unplugs list. In that
3600     * case don't add again.
3601     */
3602    ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3603    if (!ds) {
3604        ds = g_new0(SpaprDimmState, 1);
3605        ds->nr_lmbs = nr_lmbs;
3606        ds->dimm = dimm;
3607        QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
3608    }
3609    return ds;
3610}
3611
3612static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
3613                                              SpaprDimmState *dimm_state)
3614{
3615    QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
3616    g_free(dimm_state);
3617}
3618
3619static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
3620                                                        PCDIMMDevice *dimm)
3621{
3622    SpaprDrc *drc;
3623    uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
3624                                                  &error_abort);
3625    uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3626    uint32_t avail_lmbs = 0;
3627    uint64_t addr_start, addr;
3628    int i;
3629
3630    addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3631                                          &error_abort);
3632
3633    addr = addr_start;
3634    for (i = 0; i < nr_lmbs; i++) {
3635        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3636                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3637        g_assert(drc);
3638        if (drc->dev) {
3639            avail_lmbs++;
3640        }
3641        addr += SPAPR_MEMORY_BLOCK_SIZE;
3642    }
3643
3644    return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
3645}
3646
3647void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev)
3648{
3649    SpaprDimmState *ds;
3650    PCDIMMDevice *dimm;
3651    SpaprDrc *drc;
3652    uint32_t nr_lmbs;
3653    uint64_t size, addr_start, addr;
3654    g_autofree char *qapi_error = NULL;
3655    int i;
3656
3657    if (!dev) {
3658        return;
3659    }
3660
3661    dimm = PC_DIMM(dev);
3662    ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3663
3664    /*
3665     * 'ds == NULL' would mean that the DIMM doesn't have a pending
3666     * unplug state, but one of its DRCs is marked as unplug_requested.
3667     * This is bad and weird enough to g_assert() out.
3668     */
3669    g_assert(ds);
3670
3671    spapr_pending_dimm_unplugs_remove(spapr, ds);
3672
3673    size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3674    nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3675
3676    addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3677                                          &error_abort);
3678
3679    addr = addr_start;
3680    for (i = 0; i < nr_lmbs; i++) {
3681        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3682                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3683        g_assert(drc);
3684
3685        drc->unplug_requested = false;
3686        addr += SPAPR_MEMORY_BLOCK_SIZE;
3687    }
3688
3689    /*
3690     * Tell QAPI that the memory hotunplug wasn't successful.
3691     * Keep sending MEM_UNPLUG_ERROR alongside the newer
3692     * DEVICE_UNPLUG_GUEST_ERROR until the deprecation period
3693     * of MEM_UNPLUG_ERROR is over and the event is finally
3694     * removed.
3695     */
3696    qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest "
3697                                 "for device %s", dev->id);
3698
3699    qapi_event_send_mem_unplug_error(dev->id ? : "", qapi_error);
3700
3701    qapi_event_send_device_unplug_guest_error(!!dev->id, dev->id,
3702                                              dev->canonical_path);
3703}
3704
3705/* Callback to be called during DRC release. */
3706void spapr_lmb_release(DeviceState *dev)
3707{
3708    HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3709    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
3710    SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3711
3712    /* This information will get lost if a migration occurs
3713     * during the unplug process. In this case recover it. */
3714    if (ds == NULL) {
3715        ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
3716        g_assert(ds);
3717        /* At least the DRC being examined by the caller must be counted */
3718        g_assert(ds->nr_lmbs);
3719    }
3720
3721    if (--ds->nr_lmbs) {
3722        return;
3723    }
3724
3725    /*
3726     * Now that all the LMBs have been removed by the guest, call the
3727     * unplug handler chain. This can never fail.
3728     */
3729    hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3730    object_unparent(OBJECT(dev));
3731}
3732
3733static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3734{
3735    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3736    SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3737
3738    /* We really shouldn't get this far without anything to unplug */
3739    g_assert(ds);
3740
3741    pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
3742    qdev_unrealize(dev);
3743    spapr_pending_dimm_unplugs_remove(spapr, ds);
3744}
3745
3746static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
3747                                        DeviceState *dev, Error **errp)
3748{
3749    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3750    PCDIMMDevice *dimm = PC_DIMM(dev);
3751    uint32_t nr_lmbs;
3752    uint64_t size, addr_start, addr;
3753    int i;
3754    SpaprDrc *drc;
3755
3756    if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
3757        error_setg(errp, "nvdimm device hot unplug is not supported yet.");
3758        return;
3759    }
3760
3761    size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3762    nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3763
3764    addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3765                                          &error_abort);
3766
3767    /*
3768     * An existing pending dimm state for this DIMM means that there is an
3769     * unplug operation in progress, waiting for the spapr_lmb_release
3770     * callback to complete the job (BQL can't cover that far). In this case,
3771     * bail out to avoid detaching DRCs that were already released.
3772     */
3773    if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3774        error_setg(errp, "Memory unplug already in progress for device %s",
3775                   dev->id);
3776        return;
3777    }
3778
3779    spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3780
3781    addr = addr_start;
3782    for (i = 0; i < nr_lmbs; i++) {
3783        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3784                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3785        g_assert(drc);
3786
3787        spapr_drc_unplug_request(drc);
3788        addr += SPAPR_MEMORY_BLOCK_SIZE;
3789    }
3790
3791    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3792                          addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3793    spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3794                                              nr_lmbs, spapr_drc_index(drc));
3795}
3796
3797/* Callback to be called during DRC release. */
3798void spapr_core_release(DeviceState *dev)
3799{
3800    HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3801
3802    /* Call the unplug handler chain. This can never fail. */
3803    hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3804    object_unparent(OBJECT(dev));
3805}
3806
3807static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3808{
3809    MachineState *ms = MACHINE(hotplug_dev);
3810    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
3811    CPUCore *cc = CPU_CORE(dev);
3812    CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3813
3814    if (smc->pre_2_10_has_unused_icps) {
3815        SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
3816        int i;
3817
3818        for (i = 0; i < cc->nr_threads; i++) {
3819            CPUState *cs = CPU(sc->threads[i]);
3820
3821            pre_2_10_vmstate_register_dummy_icp(cs->cpu_index);
3822        }
3823    }
3824
3825    assert(core_slot);
3826    core_slot->cpu = NULL;
3827    qdev_unrealize(dev);
3828}
3829
3830static
3831void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3832                               Error **errp)
3833{
3834    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3835    int index;
3836    SpaprDrc *drc;
3837    CPUCore *cc = CPU_CORE(dev);
3838
3839    if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3840        error_setg(errp, "Unable to find CPU core with core-id: %d",
3841                   cc->core_id);
3842        return;
3843    }
3844    if (index == 0) {
3845        error_setg(errp, "Boot CPU core may not be unplugged");
3846        return;
3847    }
3848
3849    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3850                          spapr_vcpu_id(spapr, cc->core_id));
3851    g_assert(drc);
3852
3853    if (!spapr_drc_unplug_requested(drc)) {
3854        spapr_drc_unplug_request(drc);
3855    }
3856
3857    /*
3858     * spapr_hotplug_req_remove_by_index is left unguarded, out of the
3859     * "!spapr_drc_unplug_requested" check, to allow for multiple IRQ
3860     * pulses removing the same CPU. Otherwise, after a failed hotunplug
3861     * attempt (e.g. the kernel will refuse to remove the last online
3862     * CPU), we will never attempt it again because unplug_requested
3863     * will still be 'true' in that case.
3864     */
3865    spapr_hotplug_req_remove_by_index(drc);
3866}
3867
3868int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3869                           void *fdt, int *fdt_start_offset, Error **errp)
3870{
3871    SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3872    CPUState *cs = CPU(core->threads[0]);
3873    PowerPCCPU *cpu = POWERPC_CPU(cs);
3874    DeviceClass *dc = DEVICE_GET_CLASS(cs);
3875    int id = spapr_get_vcpu_id(cpu);
3876    g_autofree char *nodename = NULL;
3877    int offset;
3878
3879    nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3880    offset = fdt_add_subnode(fdt, 0, nodename);
3881
3882    spapr_dt_cpu(cs, fdt, offset, spapr);
3883
3884    /*
3885     * spapr_dt_cpu() does not fill the 'name' property in the
3886     * CPU node. The function is called during the boot process, before
3887     * and after CAS, and overwriting the 'name' property written
3888     * by SLOF is not allowed.
3889     *
3890     * Write it manually after spapr_dt_cpu(). This makes the hotplug
3891     * CPUs more compatible with the coldplugged ones, which have
3892     * the 'name' property. Linux Kernel also relies on this
3893     * property to identify CPU nodes.
3894     */
3895    _FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
3896
3897    *fdt_start_offset = offset;
3898    return 0;
3899}
3900
3901static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3902{
3903    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3904    MachineClass *mc = MACHINE_GET_CLASS(spapr);
3905    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
3906    SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
3907    CPUCore *cc = CPU_CORE(dev);
3908    CPUState *cs;
3909    SpaprDrc *drc;
3910    CPUArchId *core_slot;
3911    int index;
3912    bool hotplugged = spapr_drc_hotplugged(dev);
3913    int i;
3914
3915    core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3916    g_assert(core_slot); /* Already checked in spapr_core_pre_plug() */
3917
3918    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3919                          spapr_vcpu_id(spapr, cc->core_id));
3920
3921    g_assert(drc || !mc->has_hotpluggable_cpus);
3922
3923    if (drc) {
3924        /*
3925         * spapr_core_pre_plug() already guarantees that this is a
3926         * brand new core being plugged into a free slot. Nothing
3927         * should already be attached to the corresponding DRC.
3928         */
3929        spapr_drc_attach(drc, dev);
3930
3931        if (hotplugged) {
3932            /*
3933             * Send hotplug notification interrupt to the guest only
3934             * in case of hotplugged CPUs.
3935             */
3936            spapr_hotplug_req_add_by_index(drc);
3937        } else {
3938            spapr_drc_reset(drc);
3939        }
3940    }
3941
3942    core_slot->cpu = OBJECT(dev);
3943
3944    /*
3945     * Set compatibility mode to match the boot CPU, which was either set
3946     * by the machine reset code or by CAS. This really shouldn't fail at
3947     * this point.
3948     */
3949    if (hotplugged) {
3950        for (i = 0; i < cc->nr_threads; i++) {
3951            ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr,
3952                           &error_abort);
3953        }
3954    }
3955
3956    if (smc->pre_2_10_has_unused_icps) {
3957        for (i = 0; i < cc->nr_threads; i++) {
3958            cs = CPU(core->threads[i]);
3959            pre_2_10_vmstate_unregister_dummy_icp(cs->cpu_index);
3960        }
3961    }
3962}
3963
3964static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3965                                Error **errp)
3966{
3967    MachineState *machine = MACHINE(OBJECT(hotplug_dev));
3968    MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
3969    CPUCore *cc = CPU_CORE(dev);
3970    const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
3971    const char *type = object_get_typename(OBJECT(dev));
3972    CPUArchId *core_slot;
3973    int index;
3974    unsigned int smp_threads = machine->smp.threads;
3975
3976    if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
3977        error_setg(errp, "CPU hotplug not supported for this machine");
3978        return;
3979    }
3980
3981    if (strcmp(base_core_type, type)) {
3982        error_setg(errp, "CPU core type should be %s", base_core_type);
3983        return;
3984    }
3985
3986    if (cc->core_id % smp_threads) {
3987        error_setg(errp, "invalid core id %d", cc->core_id);
3988        return;
3989    }
3990
3991    /*
3992     * In general we should have homogeneous threads-per-core, but old
3993     * (pre hotplug support) machine types allow the last core to have
3994     * reduced threads as a compatibility hack for when we allowed
3995     * total vcpus not a multiple of threads-per-core.
3996     */
3997    if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
3998        error_setg(errp, "invalid nr-threads %d, must be %d", cc->nr_threads,
3999                   smp_threads);
4000        return;
4001    }
4002
4003    core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
4004    if (!core_slot) {
4005        error_setg(errp, "core id %d out of range", cc->core_id);
4006        return;
4007    }
4008
4009    if (core_slot->cpu) {
4010        error_setg(errp, "core %d already populated", cc->core_id);
4011        return;
4012    }
4013
4014    numa_cpu_pre_plug(core_slot, dev, errp);
4015}
4016
4017int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
4018                          void *fdt, int *fdt_start_offset, Error **errp)
4019{
4020    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
4021    int intc_phandle;
4022
4023    intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
4024    if (intc_phandle <= 0) {
4025        return -1;
4026    }
4027
4028    if (spapr_dt_phb(spapr, sphb, intc_phandle, fdt, fdt_start_offset)) {
4029        error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
4030        return -1;
4031    }
4032
4033    /* Generally SLOF creates these; for hotplug it's up to QEMU */
4034    _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
4035
4036    return 0;
4037}
4038
4039static bool spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4040                               Error **errp)
4041{
4042    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4043    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4044    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4045    const unsigned windows_supported = spapr_phb_windows_supported(sphb);
4046    SpaprDrc *drc;
4047
4048    if (dev->hotplugged && !smc->dr_phb_enabled) {
4049        error_setg(errp, "PHB hotplug not supported for this machine");
4050        return false;
4051    }
4052
4053    if (sphb->index == (uint32_t)-1) {
4054        error_setg(errp, "\"index\" for PAPR PHB is mandatory");
4055        return false;
4056    }
4057
4058    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4059    if (drc && drc->dev) {
4060        error_setg(errp, "PHB %d already attached", sphb->index);
4061        return false;
4062    }
4063
4064    /*
4065     * This will check that sphb->index doesn't exceed the maximum number of
4066     * PHBs for the current machine type.
4067     */
4068    return
4069        smc->phb_placement(spapr, sphb->index,
4070                           &sphb->buid, &sphb->io_win_addr,
4071                           &sphb->mem_win_addr, &sphb->mem64_win_addr,
4072                           windows_supported, sphb->dma_liobn,
4073                           &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
4074                           errp);
4075}
4076
4077static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4078{
4079    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4080    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4081    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4082    SpaprDrc *drc;
4083    bool hotplugged = spapr_drc_hotplugged(dev);
4084
4085    if (!smc->dr_phb_enabled) {
4086        return;
4087    }
4088
4089    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4090    /* hotplug hooks should check it's enabled before getting this far */
4091    assert(drc);
4092
4093    /* spapr_phb_pre_plug() already checked the DRC is attachable */
4094    spapr_drc_attach(drc, dev);
4095
4096    if (hotplugged) {
4097        spapr_hotplug_req_add_by_index(drc);
4098    } else {
4099        spapr_drc_reset(drc);
4100    }
4101}
4102
4103void spapr_phb_release(DeviceState *dev)
4104{
4105    HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
4106
4107    hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
4108    object_unparent(OBJECT(dev));
4109}
4110
4111static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4112{
4113    qdev_unrealize(dev);
4114}
4115
4116static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
4117                                     DeviceState *dev, Error **errp)
4118{
4119    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4120    SpaprDrc *drc;
4121
4122    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4123    assert(drc);
4124
4125    if (!spapr_drc_unplug_requested(drc)) {
4126        spapr_drc_unplug_request(drc);
4127        spapr_hotplug_req_remove_by_index(drc);
4128    } else {
4129        error_setg(errp,
4130                   "PCI Host Bridge unplug already in progress for device %s",
4131                   dev->id);
4132    }
4133}
4134
4135static
4136bool spapr_tpm_proxy_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4137                              Error **errp)
4138{
4139    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4140
4141    if (spapr->tpm_proxy != NULL) {
4142        error_setg(errp, "Only one TPM proxy can be specified for this machine");
4143        return false;
4144    }
4145
4146    return true;
4147}
4148
4149static void spapr_tpm_proxy_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4150{
4151    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4152    SpaprTpmProxy *tpm_proxy = SPAPR_TPM_PROXY(dev);
4153
4154    /* Already checked in spapr_tpm_proxy_pre_plug() */
4155    g_assert(spapr->tpm_proxy == NULL);
4156
4157    spapr->tpm_proxy = tpm_proxy;
4158}
4159
4160static void spapr_tpm_proxy_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4161{
4162    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4163
4164    qdev_unrealize(dev);
4165    object_unparent(OBJECT(dev));
4166    spapr->tpm_proxy = NULL;
4167}
4168
4169static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
4170                                      DeviceState *dev, Error **errp)
4171{
4172    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4173        spapr_memory_plug(hotplug_dev, dev);
4174    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4175        spapr_core_plug(hotplug_dev, dev);
4176    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4177        spapr_phb_plug(hotplug_dev, dev);
4178    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4179        spapr_tpm_proxy_plug(hotplug_dev, dev);
4180    }
4181}
4182
4183static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
4184                                        DeviceState *dev, Error **errp)
4185{
4186    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4187        spapr_memory_unplug(hotplug_dev, dev);
4188    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4189        spapr_core_unplug(hotplug_dev, dev);
4190    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4191        spapr_phb_unplug(hotplug_dev, dev);
4192    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4193        spapr_tpm_proxy_unplug(hotplug_dev, dev);
4194    }
4195}
4196
4197bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr)
4198{
4199    return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) ||
4200        /*
4201         * CAS will process all pending unplug requests.
4202         *
4203         * HACK: a guest could theoretically have cleared all bits in OV5,
4204         * but none of the guests we care for do.
4205         */
4206        spapr_ovec_empty(spapr->ov5_cas);
4207}
4208
4209static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
4210                                                DeviceState *dev, Error **errp)
4211{
4212    SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
4213    MachineClass *mc = MACHINE_GET_CLASS(sms);
4214    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4215
4216    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4217        if (spapr_memory_hot_unplug_supported(sms)) {
4218            spapr_memory_unplug_request(hotplug_dev, dev, errp);
4219        } else {
4220            error_setg(errp, "Memory hot unplug not supported for this guest");
4221        }
4222    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4223        if (!mc->has_hotpluggable_cpus) {
4224            error_setg(errp, "CPU hot unplug not supported on this machine");
4225            return;
4226        }
4227        spapr_core_unplug_request(hotplug_dev, dev, errp);
4228    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4229        if (!smc->dr_phb_enabled) {
4230            error_setg(errp, "PHB hot unplug not supported on this machine");
4231            return;
4232        }
4233        spapr_phb_unplug_request(hotplug_dev, dev, errp);
4234    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4235        spapr_tpm_proxy_unplug(hotplug_dev, dev);
4236    }
4237}
4238
4239static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
4240                                          DeviceState *dev, Error **errp)
4241{
4242    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4243        spapr_memory_pre_plug(hotplug_dev, dev, errp);
4244    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4245        spapr_core_pre_plug(hotplug_dev, dev, errp);
4246    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4247        spapr_phb_pre_plug(hotplug_dev, dev, errp);
4248    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4249        spapr_tpm_proxy_pre_plug(hotplug_dev, dev, errp);
4250    }
4251}
4252
4253static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
4254                                                 DeviceState *dev)
4255{
4256    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
4257        object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
4258        object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE) ||
4259        object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4260        return HOTPLUG_HANDLER(machine);
4261    }
4262    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
4263        PCIDevice *pcidev = PCI_DEVICE(dev);
4264        PCIBus *root = pci_device_root_bus(pcidev);
4265        SpaprPhbState *phb =
4266            (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent),
4267                                                 TYPE_SPAPR_PCI_HOST_BRIDGE);
4268
4269        if (phb) {
4270            return HOTPLUG_HANDLER(phb);
4271        }
4272    }
4273    return NULL;
4274}
4275
4276static CpuInstanceProperties
4277spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
4278{
4279    CPUArchId *core_slot;
4280    MachineClass *mc = MACHINE_GET_CLASS(machine);
4281
4282    /* make sure possible_cpus is initialized */
4283    mc->possible_cpu_arch_ids(machine);
4284    /* get CPU core slot containing thread that matches cpu_index */
4285    core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
4286    assert(core_slot);
4287    return core_slot->props;
4288}
4289
4290static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
4291{
4292    return idx / ms->smp.cores % ms->numa_state->num_nodes;
4293}
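    /*
     * E.g. (illustrative): with "-smp cores=2" and two NUMA nodes,
     * core indexes 0 and 1 map to node 0, indexes 2 and 3 to node 1,
     * and the mapping then wraps around.
     */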
4294
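    /*
     * Sketch of the result (illustrative): "-smp 8,threads=4,maxcpus=16"
     * yields 16 / 4 = 4 possible cores whose arch_ids (core ids) are
     * 0, 4, 8 and 12.
     */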
4295static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
4296{
4297    int i;
4298    unsigned int smp_threads = machine->smp.threads;
4299    unsigned int smp_cpus = machine->smp.cpus;
4300    const char *core_type;
4301    int spapr_max_cores = machine->smp.max_cpus / smp_threads;
4302    MachineClass *mc = MACHINE_GET_CLASS(machine);
4303
4304    if (!mc->has_hotpluggable_cpus) {
4305        spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
4306    }
4307    if (machine->possible_cpus) {
4308        assert(machine->possible_cpus->len == spapr_max_cores);
4309        return machine->possible_cpus;
4310    }
4311
4312    core_type = spapr_get_cpu_core_type(machine->cpu_type);
4313    if (!core_type) {
4314        error_report("Unable to find sPAPR CPU Core definition");
4315        exit(1);
4316    }
4317
4318    machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
4319                             sizeof(CPUArchId) * spapr_max_cores);
4320    machine->possible_cpus->len = spapr_max_cores;
4321    for (i = 0; i < machine->possible_cpus->len; i++) {
4322        int core_id = i * smp_threads;
4323
4324        machine->possible_cpus->cpus[i].type = core_type;
4325        machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
4326        machine->possible_cpus->cpus[i].arch_id = core_id;
4327        machine->possible_cpus->cpus[i].props.has_core_id = true;
4328        machine->possible_cpus->cpus[i].props.core_id = core_id;
4329    }
4330    return machine->possible_cpus;
4331}
4332
4333static bool spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
4334                                uint64_t *buid, hwaddr *pio,
4335                                hwaddr *mmio32, hwaddr *mmio64,
4336                                unsigned n_dma, uint32_t *liobns,
4337                                hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4338{
4339    /*
4340     * New-style PHB window placement.
4341     *
4342     * Goals: Give each PHB a large (1TiB), naturally aligned 64-bit
4343     * MMIO window, in addition to 2GiB 32-bit MMIO and 64kiB PIO
4344     * windows.
4345     *
4346     * Some guest kernels can't work with MMIO windows above 1<<46
4347     * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
4348     *
4349     * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each
4350     * PHB stacked together.  (32TiB+2GiB)..(32TiB+64GiB) contains the
4351     * 2GiB 32-bit MMIO windows for each PHB.  Then 33..64TiB has the
4352     * 1TiB 64-bit MMIO windows for each PHB.
4353     */
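        /*
         * Worked example (illustrative, with SPAPR_PCI_BASE at 32TiB):
         * index 0 gets BUID 0x800000020000000, its PIO window at
         * 32TiB, its 32-bit MMIO window at 32TiB + 2GiB and its 64-bit
         * MMIO window at 33TiB; index 1 is offset by one window size
         * in each region.
         */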
4354    const uint64_t base_buid = 0x800000020000000ULL;
4355    int i;
4356
4357    /* Sanity check natural alignments */
4358    QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4359    QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4360    QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
4361    QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
4362    /* Sanity check bounds */
4363    QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
4364                      SPAPR_PCI_MEM32_WIN_SIZE);
4365    QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
4366                      SPAPR_PCI_MEM64_WIN_SIZE);
4367
4368    if (index >= SPAPR_MAX_PHBS) {
4369        error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
4370                   SPAPR_MAX_PHBS - 1);
4371        return false;
4372    }
4373
4374    *buid = base_buid + index;
4375    for (i = 0; i < n_dma; ++i) {
4376        liobns[i] = SPAPR_PCI_LIOBN(index, i);
4377    }
4378
4379    *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
4380    *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
4381    *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
4382
4383    *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
4384    *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
4385    return true;
4386}
4387
4388static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
4389{
4390    SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4391
4392    return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
4393}
4394
4395static void spapr_ics_resend(XICSFabric *dev)
4396{
4397    SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4398
4399    ics_resend(spapr->ics);
4400}
4401
4402static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
4403{
4404    PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
4405
4406    return cpu ? spapr_cpu_state(cpu)->icp : NULL;
4407}
4408
4409static void spapr_pic_print_info(InterruptStatsProvider *obj,
4410                                 Monitor *mon)
4411{
4412    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
4413
4414    spapr_irq_print_info(spapr, mon);
4415    monitor_printf(mon, "irqchip: %s\n",
4416                   kvm_irqchip_in_kernel() ? "in-kernel" : "emulated");
4417}
4418
4419/*
4420 * This is a XIVE only operation
4421 */
4422static int spapr_match_nvt(XiveFabric *xfb, uint8_t format,
4423                           uint8_t nvt_blk, uint32_t nvt_idx,
4424                           bool cam_ignore, uint8_t priority,
4425                           uint32_t logic_serv, XiveTCTXMatch *match)
4426{
4427    SpaprMachineState *spapr = SPAPR_MACHINE(xfb);
4428    XivePresenter *xptr = XIVE_PRESENTER(spapr->active_intc);
4429    XivePresenterClass *xpc = XIVE_PRESENTER_GET_CLASS(xptr);
4430    int count;
4431
4432    count = xpc->match_nvt(xptr, format, nvt_blk, nvt_idx, cam_ignore,
4433                           priority, logic_serv, match);
4434    if (count < 0) {
4435        return count;
4436    }
4437
4438    /*
4439     * When we implement the save and restore of the thread interrupt
4440     * contexts in the enter/exit CPU handlers of the machine and the
4441     * escalations in QEMU, we should be able to handle non-dispatched
4442     * vCPUs.
4443     *
4444     * Until this is done, the sPAPR machine should always find at
4445     * least one matching context.
4446     */
4447    if (count == 0) {
4448        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: NVT %x/%x is not dispatched\n",
4449                      nvt_blk, nvt_idx);
4450    }
4451
4452    return count;
4453}
4454
4455int spapr_get_vcpu_id(PowerPCCPU *cpu)
4456{
4457    return cpu->vcpu_id;
4458}
4459
4460bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
4461{
4462    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
4463    MachineState *ms = MACHINE(spapr);
4464    int vcpu_id;
4465
4466    vcpu_id = spapr_vcpu_id(spapr, cpu_index);
4467
4468    if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
4469        error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
4470        error_append_hint(errp, "Adjust the number of cpus to %d "
4471                          "or try to raise the number of threads per core\n",
4472                          vcpu_id * ms->smp.threads / spapr->vsmt);
4473        return false;
4474    }
4475
4476    cpu->vcpu_id = vcpu_id;
4477    return true;
4478}
4479
4480PowerPCCPU *spapr_find_cpu(int vcpu_id)
4481{
4482    CPUState *cs;
4483
4484    CPU_FOREACH(cs) {
4485        PowerPCCPU *cpu = POWERPC_CPU(cs);
4486
4487        if (spapr_get_vcpu_id(cpu) == vcpu_id) {
4488            return cpu;
4489        }
4490    }
4491
4492    return NULL;
4493}
4494
4495static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
4496{
4497    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4498
4499    return spapr_cpu->in_nested;
4500}
4501
4502static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4503{
4504    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4505
4506    /* These are only called by TCG, KVM maintains dispatch state */
4507
4508    spapr_cpu->prod = false;
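        /*
         * The VPA dispatch counter changes parity on every dispatch
         * and every preemption; an unexpected parity after the
         * increment below means the counter went out of sync, so it
         * is nudged by one more.
         */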
4509    if (spapr_cpu->vpa_addr) {
4510        CPUState *cs = CPU(cpu);
4511        uint32_t dispatch;
4512
4513        dispatch = ldl_be_phys(cs->as,
4514                               spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4515        dispatch++;
4516        if ((dispatch & 1) != 0) {
4517            qemu_log_mask(LOG_GUEST_ERROR,
4518                          "VPA: incorrect dispatch counter value for "
4519                          "dispatched partition %u, correcting.\n", dispatch);
4520            dispatch++;
4521        }
4522        stl_be_phys(cs->as,
4523                    spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4524    }
4525}
4526
4527static void spapr_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4528{
4529    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4530
4531    if (spapr_cpu->vpa_addr) {
4532        CPUState *cs = CPU(cpu);
4533        uint32_t dispatch;
4534
4535        dispatch = ldl_be_phys(cs->as,
4536                               spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4537        dispatch++;
4538        if ((dispatch & 1) != 1) {
4539            qemu_log_mask(LOG_GUEST_ERROR,
4540                          "VPA: incorrect dispatch counter value for "
4541                          "preempted partition %u, correcting.\n", dispatch);
4542            dispatch++;
4543        }
4544        stl_be_phys(cs->as,
4545                    spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4546    }
4547}
4548
4549static void spapr_machine_class_init(ObjectClass *oc, void *data)
4550{
4551    MachineClass *mc = MACHINE_CLASS(oc);
4552    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
4553    FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
4554    NMIClass *nc = NMI_CLASS(oc);
4555    HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
4556    PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
4557    XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
4558    InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
4559    XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
4560    VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
4561
4562    mc->desc = "pSeries Logical Partition (PAPR compliant)";
4563    mc->ignore_boot_device_suffixes = true;
4564
4565    /*
4566     * We set up the default / latest behaviour here.  The class_init
4567     * functions for the specific versioned machine types can override
4568     * these details for backwards compatibility
4569     */
4570    mc->init = spapr_machine_init;
4571    mc->reset = spapr_machine_reset;
4572    mc->block_default_type = IF_SCSI;
4573
4574    /*
4575     * Setting max_cpus to INT32_MAX. Both KVM and TCG max_cpus values
4576     * should be limited by the host capability rather than hardcoded.
4577     * max_cpus for KVM guests will be checked in kvm_init(), and TCG
4578     * guests are welcome to have as many CPUs as the host is capable
4579     * of emulating.
4580     */
    mc->max_cpus = INT32_MAX;

    mc->no_parallel = 1;
    mc->default_boot_order = "";
    mc->default_ram_size = 512 * MiB;
    mc->default_ram_id = "ppc_spapr.ram";
    mc->default_display = "std";
    mc->kvm_type = spapr_kvm_type;
    machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
    mc->pci_allow_0_address = true;
    assert(!mc->get_hotplug_handler);
    mc->get_hotplug_handler = spapr_get_hotplug_handler;
    hc->pre_plug = spapr_machine_device_pre_plug;
    hc->plug = spapr_machine_device_plug;
    mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
    mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
    mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
    hc->unplug_request = spapr_machine_device_unplug_request;
    hc->unplug = spapr_machine_device_unplug;

    smc->dr_lmb_enabled = true;
    smc->update_dt_enabled = true;
    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.0");
    mc->has_hotpluggable_cpus = true;
    mc->nvdimm_supported = true;
    smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
    fwc->get_dev_path = spapr_get_fw_dev_path;
    nc->nmi_monitor_handler = spapr_nmi;
    smc->phb_placement = spapr_phb_placement;
    vhc->cpu_in_nested = spapr_cpu_in_nested;
    vhc->deliver_hv_excp = spapr_exit_nested;
    vhc->hypercall = emulate_spapr_hypercall;
    vhc->hpt_mask = spapr_hpt_mask;
    vhc->map_hptes = spapr_map_hptes;
    vhc->unmap_hptes = spapr_unmap_hptes;
    vhc->hpte_set_c = spapr_hpte_set_c;
    vhc->hpte_set_r = spapr_hpte_set_r;
    vhc->get_pate = spapr_get_pate;
    vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
    vhc->cpu_exec_enter = spapr_cpu_exec_enter;
    vhc->cpu_exec_exit = spapr_cpu_exec_exit;
    xic->ics_get = spapr_ics_get;
    xic->ics_resend = spapr_ics_resend;
    xic->icp_get = spapr_icp_get;
    ispc->print_info = spapr_pic_print_info;
    /*
     * Force NUMA node memory size to be a multiple of
     * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity at
     * which LMBs are represented and hot-added.
     */
    mc->numa_mem_align_shift = 28;
    mc->auto_enable_numa = true;

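    /*
     * Default sPAPR capability values for the latest machine type.
     * Each cap is also exposed as a machine property (registered by
     * spapr_caps_add_properties() below), so the default can be
     * overridden on the command line, e.g. "-machine pseries,cap-htm=on".
     */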
    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64kiB */
    smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
    spapr_caps_add_properties(smc);
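    /*
     * The "dual" interrupt controller mode makes both XICS and XIVE
     * available; the guest selects one at CAS time.
     */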
    smc->irq = &spapr_irq_dual;
    smc->dr_phb_enabled = true;
    smc->linux_pci_probe = true;
    smc->smp_threads_vsmt = true;
    smc->nr_xirqs = SPAPR_NR_XIRQS;
    xfc->match_nvt = spapr_match_nvt;
    vmc->client_architecture_support = spapr_vof_client_architecture_support;
    vmc->quiesce = spapr_vof_quiesce;
    vmc->setprop = spapr_vof_setprop;
}

static const TypeInfo spapr_machine_info = {
    .name          = TYPE_SPAPR_MACHINE,
    .parent        = TYPE_MACHINE,
    .abstract      = true,
    .instance_size = sizeof(SpaprMachineState),
    .instance_init = spapr_instance_init,
    .instance_finalize = spapr_machine_finalizefn,
    .class_size    = sizeof(SpaprMachineClass),
    .class_init    = spapr_machine_class_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_FW_PATH_PROVIDER },
        { TYPE_NMI },
        { TYPE_HOTPLUG_HANDLER },
        { TYPE_PPC_VIRTUAL_HYPERVISOR },
        { TYPE_XICS_FABRIC },
        { TYPE_INTERRUPT_STATS_PROVIDER },
        { TYPE_XIVE_FABRIC },
        { TYPE_VOF_MACHINE_IF },
        { }
    },
};

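/*
 * The newest versioned machine type additionally claims the "pseries"
 * alias and becomes the default machine.
 */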
static void spapr_machine_latest_class_options(MachineClass *mc)
{
    mc->alias = "pseries";
    mc->is_default = true;
}

#define DEFINE_SPAPR_MACHINE(suffix, verstr, latest)                 \
    static void spapr_machine_##suffix##_class_init(ObjectClass *oc, \
                                                    void *data)      \
    {                                                                \
        MachineClass *mc = MACHINE_CLASS(oc);                        \
        spapr_machine_##suffix##_class_options(mc);                  \
        if (latest) {                                                \
            spapr_machine_latest_class_options(mc);                  \
        }                                                            \
    }                                                                \
    static const TypeInfo spapr_machine_##suffix##_info = {          \
        .name = MACHINE_TYPE_NAME("pseries-" verstr),                \
        .parent = TYPE_SPAPR_MACHINE,                                \
        .class_init = spapr_machine_##suffix##_class_init,           \
    };                                                               \
    static void spapr_machine_register_##suffix(void)                \
    {                                                                \
        type_register(&spapr_machine_##suffix##_info);               \
    }                                                                \
    type_init(spapr_machine_register_##suffix)
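
/*
 * For instance, DEFINE_SPAPR_MACHINE(7_0, "7.0", true) below registers
 * a "pseries-7.0" machine type whose class_init applies
 * spapr_machine_7_0_class_options() and, since @latest is true, also
 * spapr_machine_latest_class_options().
 */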

/*
 * pseries-7.0
 */
static void spapr_machine_7_0_class_options(MachineClass *mc)
{
    /* Defaults for the latest behaviour inherited from the base class */
}

DEFINE_SPAPR_MACHINE(7_0, "7.0", true);

/*
 * pseries-6.2
 */
static void spapr_machine_6_2_class_options(MachineClass *mc)
{
    spapr_machine_7_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_2, hw_compat_6_2_len);
}

DEFINE_SPAPR_MACHINE(6_2, "6.2", false);
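
/*
 * Each versioned class_options function first applies the options of
 * the next newer machine type, so compat properties accumulate going
 * back in time: pseries-6.1 picks up hw_compat_6_2 as well as
 * hw_compat_6_1.
 */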

/*
 * pseries-6.1
 */
static void spapr_machine_6_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_6_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
    smc->pre_6_2_numa_affinity = true;
    mc->smp_props.prefer_sockets = true;
}

DEFINE_SPAPR_MACHINE(6_1, "6.1", false);

/*
 * pseries-6.0
 */
static void spapr_machine_6_0_class_options(MachineClass *mc)
{
    spapr_machine_6_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
}

DEFINE_SPAPR_MACHINE(6_0, "6.0", false);

/*
 * pseries-5.2
 */
static void spapr_machine_5_2_class_options(MachineClass *mc)
{
    spapr_machine_6_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
}

DEFINE_SPAPR_MACHINE(5_2, "5.2", false);

/*
 * pseries-5.1
 */
static void spapr_machine_5_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_5_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len);
    smc->pre_5_2_numa_associativity = true;
}

DEFINE_SPAPR_MACHINE(5_1, "5.1", false);

/*
 * pseries-5.0
 */
static void spapr_machine_5_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
    };

    spapr_machine_5_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->numa_mem_supported = true;
    smc->pre_5_1_assoc_refpoints = true;
}

DEFINE_SPAPR_MACHINE(5_0, "5.0", false);

/*
 * pseries-4.2
 */
static void spapr_machine_4_2_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_5_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len);
    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_OFF;
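    /* pseries-4.2 and older keep the historical 16 GiB clamp on the RMA */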
    smc->rma_limit = 16 * GiB;
    mc->nvdimm_supported = false;
}

DEFINE_SPAPR_MACHINE(4_2, "4.2", false);

/*
 * pseries-4.1
 */
static void spapr_machine_4_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        /* Only allow 4kiB and 64kiB IOMMU pagesizes */
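        /* i.e. pgsz mask 0x11000 == (1 << 12) | (1 << 16) */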
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" },
    };

    spapr_machine_4_2_class_options(mc);
    smc->linux_pci_probe = false;
    smc->smp_threads_vsmt = false;
    compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(4_1, "4.1", false);

/*
 * pseries-4.0
 */
static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
                              uint64_t *buid, hwaddr *pio,
                              hwaddr *mmio32, hwaddr *mmio64,
                              unsigned n_dma, uint32_t *liobns,
                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
    if (!spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma,
                             liobns, nv2gpa, nv2atsd, errp)) {
        return false;
    }

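    /*
     * Older machine types don't reserve NVLink2 GPA/ATSD windows, so
     * clear whatever the current placement assigned.
     */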
    *nv2gpa = 0;
    *nv2atsd = 0;
    return true;
}

static void spapr_machine_4_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_4_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
    smc->phb_placement = phb_placement_4_0;
    smc->irq = &spapr_irq_xics;
    smc->pre_4_1_migration = true;
}

DEFINE_SPAPR_MACHINE(4_0, "4.0", false);

/*
 * pseries-3.1
 */
static void spapr_machine_3_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_4_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);

    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
    smc->update_dt_enabled = false;
    smc->dr_phb_enabled = false;
    smc->broken_host_serial_model = true;
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
}

DEFINE_SPAPR_MACHINE(3_1, "3.1", false);

/*
 * pseries-3.0
 */

static void spapr_machine_3_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_3_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);

    smc->legacy_irq_allocation = true;
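    /* 0x400 (1024) XICS interrupts: the fixed allocation of pseries-3.0
     * and older */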
    smc->nr_xirqs = 0x400;
    smc->irq = &spapr_irq_xics_legacy;
}

DEFINE_SPAPR_MACHINE(3_0, "3.0", false);

/*
 * pseries-2.12
 */
static void spapr_machine_2_12_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" },
        { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" },
    };

    spapr_machine_3_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_12, hw_compat_2_12_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));

    /*
     * We depend on kvm_enabled() to choose a default value for the
     * hpt-max-page-size capability.  Of course we can't do it here
     * because this is too early and the HW accelerator isn't initialized
     * yet.  Postpone this to machine init (see default_caps_with_cpu()).
     */
    smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0;
}

DEFINE_SPAPR_MACHINE(2_12, "2.12", false);

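/*
 * pseries-2.12-sxxm ("speculative execution exploit mitigated"): the
 * same as pseries-2.12, but with the Spectre/Meltdown mitigations
 * enabled by default.
 */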
static void spapr_machine_2_12_sxxm_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_12_class_options(mc);
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD;
}

DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);

/*
 * pseries-2.11
 */

static void spapr_machine_2_11_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_12_class_options(mc);
    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON;
    compat_props_add(mc->compat_props, hw_compat_2_11, hw_compat_2_11_len);
}

DEFINE_SPAPR_MACHINE(2_11, "2.11", false);

/*
 * pseries-2.10
 */

static void spapr_machine_2_10_class_options(MachineClass *mc)
{
    spapr_machine_2_11_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_10, hw_compat_2_10_len);
}

DEFINE_SPAPR_MACHINE(2_10, "2.10", false);

/*
 * pseries-2.9
 */

static void spapr_machine_2_9_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_POWERPC_CPU, "pre-2.10-migration", "on" },
    };

    spapr_machine_2_10_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    smc->pre_2_10_has_unused_icps = true;
    smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
}

DEFINE_SPAPR_MACHINE(2_9, "2.9", false);

/*
 * pseries-2.8
 */

static void spapr_machine_2_8_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" },
    };

    spapr_machine_2_9_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_8, hw_compat_2_8_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->numa_mem_align_shift = 23;
}

DEFINE_SPAPR_MACHINE(2_8, "2.8", false);

/*
 * pseries-2.7
 */

static bool phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
                              uint64_t *buid, hwaddr *pio,
                              hwaddr *mmio32, hwaddr *mmio64,
                              unsigned n_dma, uint32_t *liobns,
                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
    /* Legacy PHB placement for pseries-2.7 and earlier machine types */
    const uint64_t base_buid = 0x800000020000000ULL;
    const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */
    const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */
    const hwaddr pio_offset = 0x80000000; /* 2 GiB */
    const uint32_t max_index = 255;
    const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */
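
    /*
     * Worked example: with 1 GiB of RAM and no device memory, ram_top
     * is 1 GiB, so phb0_base aligns up to 1 TiB.  PHB index 1 then gets
     * phb_base = 1 TiB + 64 GiB, its PIO window at phb_base + 2 GiB and
     * its 32-bit MMIO window at phb_base + 2.5 GiB.
     */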

    uint64_t ram_top = MACHINE(spapr)->ram_size;
    hwaddr phb0_base, phb_base;
    int i;

    /* Do we have device memory? */
    if (MACHINE(spapr)->maxram_size > ram_top) {
        /* Can't just use maxram_size, because there may be an
         * alignment gap between normal and device memory regions
         */
        ram_top = MACHINE(spapr)->device_memory->base +
            memory_region_size(&MACHINE(spapr)->device_memory->mr);
    }

    phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);

    if (index > max_index) {
        error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
                   max_index);
        return false;
    }

    *buid = base_buid + index;
    for (i = 0; i < n_dma; ++i) {
        liobns[i] = SPAPR_PCI_LIOBN(index, i);
    }

    phb_base = phb0_base + index * phb_spacing;
    *pio = phb_base + pio_offset;
    *mmio32 = phb_base + mmio_offset;
    /*
     * We don't set the 64-bit MMIO window, relying on the PHB's
     * fallback behaviour of automatically splitting a large "32-bit"
     * window into contiguous 32-bit and 64-bit windows
     */

    *nv2gpa = 0;
    *nv2atsd = 0;
    return true;
}

static void spapr_machine_2_7_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000", },
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0", },
        { TYPE_POWERPC_CPU, "pre-2.8-migration", "on", },
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on", },
    };

    spapr_machine_2_8_class_options(mc);
    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3");
    mc->default_machine_opts = "modern-hotplug-events=off";
    compat_props_add(mc->compat_props, hw_compat_2_7, hw_compat_2_7_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    smc->phb_placement = phb_placement_2_7;
}

DEFINE_SPAPR_MACHINE(2_7, "2.7", false);

/*
 * pseries-2.6
 */

static void spapr_machine_2_6_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" },
    };

    spapr_machine_2_7_class_options(mc);
    mc->has_hotpluggable_cpus = false;
    compat_props_add(mc->compat_props, hw_compat_2_6, hw_compat_2_6_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_6, "2.6", false);

/*
 * pseries-2.5
 */

static void spapr_machine_2_5_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { "spapr-vlan", "use-rx-buffer-pools", "off" },
    };

    spapr_machine_2_6_class_options(mc);
    smc->use_ohci_by_default = true;
    compat_props_add(mc->compat_props, hw_compat_2_5, hw_compat_2_5_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_5, "2.5", false);

/*
 * pseries-2.4
 */

static void spapr_machine_2_4_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_5_class_options(mc);
    smc->dr_lmb_enabled = false;
    compat_props_add(mc->compat_props, hw_compat_2_4, hw_compat_2_4_len);
}

DEFINE_SPAPR_MACHINE(2_4, "2.4", false);

/*
 * pseries-2.3
 */

static void spapr_machine_2_3_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { "spapr-pci-host-bridge", "dynamic-reconfiguration", "off" },
    };

    spapr_machine_2_4_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_3, hw_compat_2_3_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_3, "2.3", false);

/*
 * pseries-2.2
 */

static void spapr_machine_2_2_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0x20000000" },
    };

    spapr_machine_2_3_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_2, hw_compat_2_2_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->default_machine_opts = "modern-hotplug-events=off,suppress-vmdesc=on";
}

DEFINE_SPAPR_MACHINE(2_2, "2.2", false);

/*
 * pseries-2.1
 */

static void spapr_machine_2_1_class_options(MachineClass *mc)
{
    spapr_machine_2_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len);
}

DEFINE_SPAPR_MACHINE(2_1, "2.1", false);

static void spapr_machine_register_types(void)
{
    type_register_static(&spapr_machine_info);
}

type_init(spapr_machine_register_types)