qemu/hw/ppc/spapr.c
/*
 * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
 *
 * Copyright (c) 2004-2007 Fabrice Bellard
 * Copyright (c) 2007 Jocelyn Mayer
 * Copyright (c) 2010 David Gibson, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/datadir.h"
#include "qapi/error.h"
#include "qapi/qapi-events-machine.h"
#include "qapi/qapi-events-qdev.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/hostmem.h"
#include "sysemu/numa.h"
#include "sysemu/qtest.h"
#include "sysemu/reset.h"
#include "sysemu/runstate.h"
#include "qemu/log.h"
#include "hw/fw-path-provider.h"
#include "elf.h"
#include "net/net.h"
#include "sysemu/device_tree.h"
#include "sysemu/cpus.h"
#include "sysemu/hw_accel.h"
#include "kvm_ppc.h"
#include "migration/misc.h"
#include "migration/qemu-file-types.h"
#include "migration/global_state.h"
#include "migration/register.h"
#include "migration/blocker.h"
#include "mmu-hash64.h"
#include "mmu-book3s-v3.h"
#include "cpu-models.h"
#include "hw/core/cpu.h"

#include "hw/ppc/ppc.h"
#include "hw/loader.h"

#include "hw/ppc/fdt.h"
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "hw/qdev-properties.h"
#include "hw/pci-host/spapr.h"
#include "hw/pci/msi.h"

#include "hw/pci/pci.h"
#include "hw/scsi/scsi.h"
#include "hw/virtio/virtio-scsi.h"
#include "hw/virtio/vhost-scsi-common.h"

#include "exec/ram_addr.h"
#include "hw/usb.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/nmi.h"
#include "hw/intc/intc.h"

#include "hw/ppc/spapr_cpu_core.h"
#include "hw/mem/memory-device.h"
#include "hw/ppc/spapr_tpm_proxy.h"
#include "hw/ppc/spapr_nvdimm.h"
#include "hw/ppc/spapr_numa.h"
#include "hw/ppc/pef.h"

#include "monitor/monitor.h"

#include <libfdt.h>

/* SLOF memory layout:
 *
 * SLOF raw image loaded at 0, copies its romfs right below the flat
 * device-tree, then positions SLOF itself 31M below that
 *
 * So we set FW_OVERHEAD to 40MB which should account for all of that
 * and more
 *
 * We load our kernel at 4M, leaving space for SLOF initial image
 */
#define FDT_MAX_ADDR            0x80000000 /* FDT must stay below that */
#define FW_MAX_SIZE             0x400000
#define FW_FILE_NAME            "slof.bin"
#define FW_FILE_NAME_VOF        "vof.bin"
#define FW_OVERHEAD             0x2800000
#define KERNEL_LOAD_ADDR        FW_MAX_SIZE

#define MIN_RMA_SLOF            (128 * MiB)

#define PHANDLE_INTC            0x00001111

/* These two functions implement the VCPU id numbering: one to compute them
 * all and one to identify thread 0 of a VCORE. Any change to the first one
 * is likely to have an impact on the second one, so let's keep them close.
 */
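/*
 * For example (a sketch of the arithmetic below): with ms->smp.threads == 2
 * and spapr->vsmt == 8, cpu_index 3 (thread 1 of core 1) maps to VCPU id
 * (3 / 2) * 8 + 3 % 2 == 9, i.e. cores are spaced spapr->vsmt ids apart,
 * whatever their actual thread count.
 */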
static int spapr_vcpu_id(SpaprMachineState *spapr, int cpu_index)
{
    MachineState *ms = MACHINE(spapr);
    unsigned int smp_threads = ms->smp.threads;

    assert(spapr->vsmt);
    return
        (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
}

static bool spapr_is_thread0_in_vcore(SpaprMachineState *spapr,
                                      PowerPCCPU *cpu)
{
    assert(spapr->vsmt);
    return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
}

static bool pre_2_10_vmstate_dummy_icp_needed(void *opaque)
{
    /* Dummy entries correspond to unused ICPState objects in older QEMUs,
     * and newer QEMUs don't even have them. In both cases, we don't want
     * to send anything on the wire.
     */
    return false;
}

static const VMStateDescription pre_2_10_vmstate_dummy_icp = {
    .name = "icp/server",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = pre_2_10_vmstate_dummy_icp_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UNUSED(4), /* uint32_t xirr */
        VMSTATE_UNUSED(1), /* uint8_t pending_priority */
        VMSTATE_UNUSED(1), /* uint8_t mfrr */
        VMSTATE_END_OF_LIST()
    },
};

static void pre_2_10_vmstate_register_dummy_icp(int i)
{
    vmstate_register(NULL, i, &pre_2_10_vmstate_dummy_icp,
                     (void *)(uintptr_t) i);
}

static void pre_2_10_vmstate_unregister_dummy_icp(int i)
{
    vmstate_unregister(NULL, &pre_2_10_vmstate_dummy_icp,
                       (void *)(uintptr_t) i);
}

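/*
 * For example (a sketch): with smp.max_cpus == 8, smp.threads == 2 and
 * vsmt == 8, this returns DIV_ROUND_UP(8 * 8, 2) == 32, an upper bound on
 * the interrupt server numbers the machine will ever hand out.
 */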
int spapr_max_server_number(SpaprMachineState *spapr)
{
    MachineState *ms = MACHINE(spapr);

    assert(spapr->vsmt);
    return DIV_ROUND_UP(ms->smp.max_cpus * spapr->vsmt, ms->smp.threads);
}

static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
                                  int smt_threads)
{
    int i, ret = 0;
    uint32_t servers_prop[smt_threads];
    uint32_t gservers_prop[smt_threads * 2];
    int index = spapr_get_vcpu_id(cpu);

    if (cpu->compat_pvr) {
        ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
        if (ret < 0) {
            return ret;
        }
    }

    /* Build interrupt servers and gservers properties */
    for (i = 0; i < smt_threads; i++) {
        servers_prop[i] = cpu_to_be32(index + i);
        /* Hack, direct the group queues back to cpu 0 */
        gservers_prop[i*2] = cpu_to_be32(index + i);
        gservers_prop[i*2 + 1] = 0;
    }
    ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-server#s",
                      servers_prop, sizeof(servers_prop));
    if (ret < 0) {
        return ret;
    }
    ret = fdt_setprop(fdt, offset, "ibm,ppc-interrupt-gserver#s",
                      gservers_prop, sizeof(gservers_prop));

    return ret;
}

static void spapr_dt_pa_features(SpaprMachineState *spapr,
                                 PowerPCCPU *cpu,
                                 void *fdt, int offset)
{
    uint8_t pa_features_206[] = { 6, 0,
        0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
    uint8_t pa_features_207[] = { 24, 0,
        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0,
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x80, 0x00,
        0x80, 0x00, 0x80, 0x00, 0x00, 0x00 };
    uint8_t pa_features_300[] = { 66, 0,
        /* 0: MMU|FPU|SLB|RUN|DABR|NX, 1: fri[nzpm]|DABRX|SPRG3|SLB0|PP110 */
        /* 2: VPM|DS205|PPR|DS202|DS206, 3: LSD|URG, SSO, 5: LE|CFAR|EB|LSQ */
        0xf6, 0x1f, 0xc7, 0xc0, 0x80, 0xf0, /* 0 - 5 */
        /* 6: DS207 */
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, /* 6 - 11 */
        /* 16: Vector */
        0x00, 0x00, 0x00, 0x00, 0x80, 0x00, /* 12 - 17 */
        /* 18: Vec. Scalar, 20: Vec. XOR, 22: HTM */
        0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 18 - 23 */
        /* 24: Ext. Dec, 26: 64 bit ftrs, 28: PM ftrs */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 24 - 29 */
        /* 30: MMR, 32: LE atomic, 34: EBB + ext EBB */
        0x80, 0x00, 0x80, 0x00, 0xC0, 0x00, /* 30 - 35 */
        /* 36: SPR SO, 38: Copy/Paste, 40: Radix MMU */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 36 - 41 */
        /* 42: PM, 44: PC RA, 46: SC vec'd */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 42 - 47 */
        /* 48: SIMD, 50: QP BFP, 52: String */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
        /* 54: DecFP, 56: DecI, 58: SHA */
        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
        /* 60: NM atomic, 62: RNG */
        0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
    };
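    /*
     * Note: each pa-features blob above starts with a two byte header
     * (descriptor size, attribute byte), so PAPR descriptor byte N lives
     * at array index N + 2 -- which is why the radix bit is cleared
     * further down with pa_features[40 + 2] &= ~0x80.
     */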
    uint8_t *pa_features = NULL;
    size_t pa_size;

    if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06, 0, cpu->compat_pvr)) {
        pa_features = pa_features_206;
        pa_size = sizeof(pa_features_206);
    }
    if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07, 0, cpu->compat_pvr)) {
        pa_features = pa_features_207;
        pa_size = sizeof(pa_features_207);
    }
    if (ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0, cpu->compat_pvr)) {
        pa_features = pa_features_300;
        pa_size = sizeof(pa_features_300);
    }
    if (!pa_features) {
        return;
    }

    if (ppc_hash64_has(cpu, PPC_HASH64_CI_LARGEPAGE)) {
        /*
         * Note: we keep CI large pages off by default because a 64K capable
         * guest provisioned with large pages might otherwise try to map a qemu
         * framebuffer (or other kind of memory mapped PCI BAR) using 64K pages
         * even if that qemu runs on a 4k host.
         * We add this bit back here if we are confident this is not an issue.
         */
        pa_features[3] |= 0x20;
    }
    if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
        pa_features[24] |= 0x80;    /* Transactional memory support */
    }
    if (spapr->cas_pre_isa3_guest && pa_size > 40) {
        /* Workaround for broken kernels that attempt (guest) radix
         * mode when they can't handle it, if they see the radix bit set
         * in pa-features. So hide it from them. */
        pa_features[40 + 2] &= ~0x80; /* Radix MMU */
    }

    _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
}

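/*
 * Size of the first NUMA node that actually has memory, rounded down to a
 * power of two and clamped to ram_size; spapr_machine_init() checks the
 * RMA size against this.
 */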
static hwaddr spapr_node0_size(MachineState *machine)
{
    if (machine->numa_state->num_nodes) {
        int i;
        for (i = 0; i < machine->numa_state->num_nodes; ++i) {
            if (machine->numa_state->nodes[i].node_mem) {
                return MIN(pow2floor(machine->numa_state->nodes[i].node_mem),
                           machine->ram_size);
            }
        }
    }
    return machine->ram_size;
}

static void add_str(GString *s, const gchar *s1)
{
    g_string_append_len(s, s1, strlen(s1) + 1);
}

static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid,
                                hwaddr start, hwaddr size)
{
    char mem_name[32];
    uint64_t mem_reg_property[2];
    int off;

    mem_reg_property[0] = cpu_to_be64(start);
    mem_reg_property[1] = cpu_to_be64(size);

    sprintf(mem_name, "memory@%" HWADDR_PRIx, start);
    off = fdt_add_subnode(fdt, 0, mem_name);
    _FDT(off);
    _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
    _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property,
                      sizeof(mem_reg_property))));
    spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid);
    return off;
}

static uint32_t spapr_pc_dimm_node(MemoryDeviceInfoList *list, ram_addr_t addr)
{
    MemoryDeviceInfoList *info;

    for (info = list; info; info = info->next) {
        MemoryDeviceInfo *value = info->value;

        if (value && value->type == MEMORY_DEVICE_INFO_KIND_DIMM) {
            PCDIMMDeviceInfo *pcdimm_info = value->u.dimm.data;

            if (addr >= pcdimm_info->addr &&
                addr < (pcdimm_info->addr + pcdimm_info->size)) {
                return pcdimm_info->node;
            }
        }
    }

    return -1;
}

struct sPAPRDrconfCellV2 {
     uint32_t seq_lmbs;
     uint64_t base_addr;
     uint32_t drc_index;
     uint32_t aa_index;
     uint32_t flags;
} QEMU_PACKED;
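/*
 * On the wire, ibm,dynamic-memory-v2 is a big-endian uint32 entry count
 * followed by that many packed sPAPRDrconfCellV2 records, each describing
 * seq_lmbs contiguous LMBs that share a base address, starting DRC index,
 * associativity index and flags (see spapr_dt_dynamic_memory_v2() below).
 */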

typedef struct DrconfCellQueue {
    struct sPAPRDrconfCellV2 cell;
    QSIMPLEQ_ENTRY(DrconfCellQueue) entry;
} DrconfCellQueue;

static DrconfCellQueue *
spapr_get_drconf_cell(uint32_t seq_lmbs, uint64_t base_addr,
                      uint32_t drc_index, uint32_t aa_index,
                      uint32_t flags)
{
    DrconfCellQueue *elem;

    elem = g_malloc0(sizeof(*elem));
    elem->cell.seq_lmbs = cpu_to_be32(seq_lmbs);
    elem->cell.base_addr = cpu_to_be64(base_addr);
    elem->cell.drc_index = cpu_to_be32(drc_index);
    elem->cell.aa_index = cpu_to_be32(aa_index);
    elem->cell.flags = cpu_to_be32(flags);

    return elem;
}

static int spapr_dt_dynamic_memory_v2(SpaprMachineState *spapr, void *fdt,
                                      int offset, MemoryDeviceInfoList *dimms)
{
    MachineState *machine = MACHINE(spapr);
    uint8_t *int_buf, *cur_index;
    int ret;
    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    uint64_t addr, cur_addr, size;
    uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
    uint64_t mem_end = machine->device_memory->base +
                       memory_region_size(&machine->device_memory->mr);
    uint32_t node, buf_len, nr_entries = 0;
    SpaprDrc *drc;
    DrconfCellQueue *elem, *next;
    MemoryDeviceInfoList *info;
    QSIMPLEQ_HEAD(, DrconfCellQueue) drconf_queue
        = QSIMPLEQ_HEAD_INITIALIZER(drconf_queue);

    /* Entry to cover RAM and the gap area */
    elem = spapr_get_drconf_cell(nr_boot_lmbs, 0, 0, -1,
                                 SPAPR_LMB_FLAGS_RESERVED |
                                 SPAPR_LMB_FLAGS_DRC_INVALID);
    QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
    nr_entries++;

    cur_addr = machine->device_memory->base;
    for (info = dimms; info; info = info->next) {
        PCDIMMDeviceInfo *di = info->value->u.dimm.data;

        addr = di->addr;
        size = di->size;
        node = di->node;

        /*
         * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The
         * area is marked hotpluggable in the next iteration for the bigger
         * chunk including the NVDIMM occupied area.
         */
        if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) {
            continue;
        }

        /* Entry for hot-pluggable area */
        if (cur_addr < addr) {
            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
            g_assert(drc);
            elem = spapr_get_drconf_cell((addr - cur_addr) / lmb_size,
                                         cur_addr, spapr_drc_index(drc), -1, 0);
            QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
            nr_entries++;
        }

        /* Entry for DIMM */
        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size);
        g_assert(drc);
        elem = spapr_get_drconf_cell(size / lmb_size, addr,
                                     spapr_drc_index(drc), node,
                                     (SPAPR_LMB_FLAGS_ASSIGNED |
                                      SPAPR_LMB_FLAGS_HOTREMOVABLE));
        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
        nr_entries++;
        cur_addr = addr + size;
    }

    /* Entry for remaining hotpluggable area */
    if (cur_addr < mem_end) {
        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size);
        g_assert(drc);
        elem = spapr_get_drconf_cell((mem_end - cur_addr) / lmb_size,
                                     cur_addr, spapr_drc_index(drc), -1, 0);
        QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
        nr_entries++;
    }

    buf_len = nr_entries * sizeof(struct sPAPRDrconfCellV2) + sizeof(uint32_t);
    int_buf = cur_index = g_malloc0(buf_len);
    *(uint32_t *)int_buf = cpu_to_be32(nr_entries);
    cur_index += sizeof(nr_entries);

    QSIMPLEQ_FOREACH_SAFE(elem, &drconf_queue, entry, next) {
        memcpy(cur_index, &elem->cell, sizeof(elem->cell));
        cur_index += sizeof(elem->cell);
        QSIMPLEQ_REMOVE(&drconf_queue, elem, DrconfCellQueue, entry);
        g_free(elem);
    }

    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory-v2", int_buf, buf_len);
    g_free(int_buf);
    if (ret < 0) {
        return -1;
    }
    return 0;
}

static int spapr_dt_dynamic_memory(SpaprMachineState *spapr, void *fdt,
                                   int offset, MemoryDeviceInfoList *dimms)
{
    MachineState *machine = MACHINE(spapr);
    int i, ret;
    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    uint32_t device_lmb_start = machine->device_memory->base / lmb_size;
    uint32_t nr_lmbs = (machine->device_memory->base +
                       memory_region_size(&machine->device_memory->mr)) /
                       lmb_size;
    uint32_t *int_buf, *cur_index, buf_len;

    /*
     * Allocate enough buffer size to fit in ibm,dynamic-memory
     */
    buf_len = (nr_lmbs * SPAPR_DR_LMB_LIST_ENTRY_SIZE + 1) * sizeof(uint32_t);
    cur_index = int_buf = g_malloc0(buf_len);
    int_buf[0] = cpu_to_be32(nr_lmbs);
    cur_index++;
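    /*
     * Each entry below takes SPAPR_DR_LMB_LIST_ENTRY_SIZE (6) cells:
     * address hi, address lo, DRC index, reserved, associativity node
     * and flags -- see docs/specs/ppc-spapr-hotplug.txt.
     */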
    for (i = 0; i < nr_lmbs; i++) {
        uint64_t addr = i * lmb_size;
        uint32_t *dynamic_memory = cur_index;

        if (i >= device_lmb_start) {
            SpaprDrc *drc;

            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, i);
            g_assert(drc);

            dynamic_memory[0] = cpu_to_be32(addr >> 32);
            dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
            dynamic_memory[2] = cpu_to_be32(spapr_drc_index(drc));
            dynamic_memory[3] = cpu_to_be32(0); /* reserved */
            dynamic_memory[4] = cpu_to_be32(spapr_pc_dimm_node(dimms, addr));
            if (memory_region_present(get_system_memory(), addr)) {
                dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_ASSIGNED);
            } else {
                dynamic_memory[5] = cpu_to_be32(0);
            }
        } else {
            /*
             * LMB information for RMA, boot time RAM and gap b/n RAM and
             * device memory region -- all these are marked as reserved
             * and as having no valid DRC.
             */
            dynamic_memory[0] = cpu_to_be32(addr >> 32);
            dynamic_memory[1] = cpu_to_be32(addr & 0xffffffff);
            dynamic_memory[2] = cpu_to_be32(0);
            dynamic_memory[3] = cpu_to_be32(0); /* reserved */
            dynamic_memory[4] = cpu_to_be32(-1);
            dynamic_memory[5] = cpu_to_be32(SPAPR_LMB_FLAGS_RESERVED |
                                            SPAPR_LMB_FLAGS_DRC_INVALID);
        }

        cur_index += SPAPR_DR_LMB_LIST_ENTRY_SIZE;
    }
    ret = fdt_setprop(fdt, offset, "ibm,dynamic-memory", int_buf, buf_len);
    g_free(int_buf);
    if (ret < 0) {
        return -1;
    }
    return 0;
}

/*
 * Adds ibm,dynamic-reconfiguration-memory node.
 * Refer to docs/specs/ppc-spapr-hotplug.txt for the documentation
 * of this device tree node.
 */
static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr,
                                                   void *fdt)
{
    MachineState *machine = MACHINE(spapr);
    int ret, offset;
    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
    uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32),
                                cpu_to_be32(lmb_size & 0xffffffff)};
    MemoryDeviceInfoList *dimms = NULL;

    /*
     * Don't create the node if there is no device memory
     */
    if (machine->ram_size == machine->maxram_size) {
        return 0;
    }

    offset = fdt_add_subnode(fdt, 0, "ibm,dynamic-reconfiguration-memory");

    ret = fdt_setprop(fdt, offset, "ibm,lmb-size", prop_lmb_size,
                    sizeof(prop_lmb_size));
    if (ret < 0) {
        return ret;
    }

    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-flags-mask", 0xff);
    if (ret < 0) {
        return ret;
    }

    ret = fdt_setprop_cell(fdt, offset, "ibm,memory-preservation-time", 0x0);
    if (ret < 0) {
        return ret;
    }

    /* ibm,dynamic-memory or ibm,dynamic-memory-v2 */
    dimms = qmp_memory_device_list();
    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRMEM_V2)) {
        ret = spapr_dt_dynamic_memory_v2(spapr, fdt, offset, dimms);
    } else {
        ret = spapr_dt_dynamic_memory(spapr, fdt, offset, dimms);
    }
    qapi_free_MemoryDeviceInfoList(dimms);

    if (ret < 0) {
        return ret;
    }

    ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset);

    return ret;
}

static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt)
{
    MachineState *machine = MACHINE(spapr);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
    hwaddr mem_start, node_size;
    int i, nb_nodes = machine->numa_state->num_nodes;
    NodeInfo *nodes = machine->numa_state->nodes;

    for (i = 0, mem_start = 0; i < nb_nodes; ++i) {
        if (!nodes[i].node_mem) {
            continue;
        }
        if (mem_start >= machine->ram_size) {
            node_size = 0;
        } else {
            node_size = nodes[i].node_mem;
            if (node_size > machine->ram_size - mem_start) {
                node_size = machine->ram_size - mem_start;
            }
        }
        if (!mem_start) {
            /* spapr_machine_init() checks for rma_size <= node0_size
             * already */
            spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size);
            mem_start += spapr->rma_size;
            node_size -= spapr->rma_size;
        }
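        /*
         * Carve the remainder into power-of-two chunks that stay naturally
         * aligned on their start address: e.g. 3 GiB of node memory starting
         * at 1 GiB is emitted as 1 GiB @ 1 GiB followed by 2 GiB @ 2 GiB.
         */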
        for ( ; node_size; ) {
            hwaddr sizetmp = pow2floor(node_size);

            /* mem_start != 0 here */
            if (ctzl(mem_start) < ctzl(sizetmp)) {
                sizetmp = 1ULL << ctzl(mem_start);
            }

            spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp);
            node_size -= sizetmp;
            mem_start += sizetmp;
        }
    }

    /* Generate ibm,dynamic-reconfiguration-memory node if required */
    if (spapr_ovec_test(spapr->ov5_cas, OV5_DRCONF_MEMORY)) {
        int ret;

        g_assert(smc->dr_lmb_enabled);
        ret = spapr_dt_dynamic_reconfiguration_memory(spapr, fdt);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset,
                         SpaprMachineState *spapr)
{
    MachineState *ms = MACHINE(spapr);
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
    int index = spapr_get_vcpu_id(cpu);
    uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                       0xffffffff, 0xffffffff};
    uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
        : SPAPR_TIMEBASE_FREQ;
    uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000;
    uint32_t page_sizes_prop[64];
    size_t page_sizes_prop_size;
    unsigned int smp_threads = ms->smp.threads;
    uint32_t vcpus_per_socket = smp_threads * ms->smp.cores;
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
    int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
    SpaprDrc *drc;
    int drc_index;
    uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
    int i;

    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index);
    if (drc) {
        drc_index = spapr_drc_index(drc);
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
    }

    _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
    _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));

    _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR])));
    _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size",
                           env->dcache_line_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size",
                           env->dcache_line_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size",
                           env->icache_line_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size",
                           env->icache_line_size)));

    if (pcc->l1_dcache_size) {
        _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size",
                               pcc->l1_dcache_size)));
    } else {
        warn_report("Unknown L1 dcache size for cpu");
    }
    if (pcc->l1_icache_size) {
        _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size",
                               pcc->l1_icache_size)));
    } else {
        warn_report("Unknown L1 icache size for cpu");
    }

    _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq)));
    _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq)));
    _FDT((fdt_setprop_cell(fdt, offset, "slb-size", cpu->hash64_opts->slb_size)));
    _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", cpu->hash64_opts->slb_size)));
    _FDT((fdt_setprop_string(fdt, offset, "status", "okay")));
    _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0)));

    if (ppc_has_spr(cpu, SPR_PURR)) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1)));
    }
    if (ppc_has_spr(cpu, SPR_SPURR)) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1)));
    }

    if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) {
        _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes",
                          segs, sizeof(segs))));
    }

    /* Advertise VSX (vector extensions) if available
     *   1               == VMX / Altivec available
     *   2               == VSX available
     *
     * Only CPUs for which we create core types in spapr_cpu_core.c
     * are possible, and all of those have VMX */
    if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
    } else {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
    }

    /* Advertise DFP (Decimal Floating Point) if available
     *   0 / no property == no DFP
     *   1               == DFP available */
    if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
    }

    page_sizes_prop_size = ppc_create_page_sizes_prop(cpu, page_sizes_prop,
                                                      sizeof(page_sizes_prop));
    if (page_sizes_prop_size) {
        _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes",
                          page_sizes_prop, page_sizes_prop_size)));
    }

    spapr_dt_pa_features(spapr, cpu, fdt, offset);

    _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
                           cs->cpu_index / vcpus_per_socket)));

    _FDT((fdt_setprop(fdt, offset, "ibm,pft-size",
                      pft_size_prop, sizeof(pft_size_prop))));

    if (ms->numa_state->num_nodes > 1) {
        _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu));
    }

    _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt));

    if (pcc->radix_page_info) {
        for (i = 0; i < pcc->radix_page_info->count; i++) {
            radix_AP_encodings[i] =
                cpu_to_be32(pcc->radix_page_info->entries[i]);
        }
        _FDT((fdt_setprop(fdt, offset, "ibm,processor-radix-AP-encodings",
                          radix_AP_encodings,
                          pcc->radix_page_info->count *
                          sizeof(radix_AP_encodings[0]))));
    }

    /*
     * We set this property to let the guest know that it can use the large
     * decrementer and its width in bits.
     */
    if (spapr_get_cap(spapr, SPAPR_CAP_LARGE_DECREMENTER) != SPAPR_CAP_OFF) {
        _FDT((fdt_setprop_u32(fdt, offset, "ibm,dec-bits",
                              pcc->lrg_decr_bits)));
    }
}

static void spapr_dt_cpus(void *fdt, SpaprMachineState *spapr)
{
    CPUState **rev;
    CPUState *cs;
    int n_cpus;
    int cpus_offset;
    int i;

    cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
    _FDT(cpus_offset);
    _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1)));
    _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0)));

    /*
     * We walk the CPUs in reverse order to ensure that CPU DT nodes
     * created by fdt_add_subnode() end up in the right order in the FDT,
     * so that the guest kernel enumerates the CPUs correctly.
     *
     * The CPU list cannot be traversed in reverse order, so we need
     * to do extra work.
     */
    n_cpus = 0;
    rev = NULL;
    CPU_FOREACH(cs) {
        rev = g_renew(CPUState *, rev, n_cpus + 1);
        rev[n_cpus++] = cs;
    }

    for (i = n_cpus - 1; i >= 0; i--) {
        CPUState *cs = rev[i];
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        int index = spapr_get_vcpu_id(cpu);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        g_autofree char *nodename = NULL;
        int offset;

        if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
            continue;
        }

        nodename = g_strdup_printf("%s@%x", dc->fw_name, index);
        offset = fdt_add_subnode(fdt, cpus_offset, nodename);
        _FDT(offset);
        spapr_dt_cpu(cs, fdt, offset, spapr);
    }

    g_free(rev);
}

static int spapr_dt_rng(void *fdt)
{
    int node;
    int ret;

    node = qemu_fdt_add_subnode(fdt, "/ibm,platform-facilities");
    if (node <= 0) {
        return -1;
    }
    ret = fdt_setprop_string(fdt, node, "device_type",
                             "ibm,platform-facilities");
    ret |= fdt_setprop_cell(fdt, node, "#address-cells", 0x1);
    ret |= fdt_setprop_cell(fdt, node, "#size-cells", 0x0);

    node = fdt_add_subnode(fdt, node, "ibm,random-v1");
    if (node <= 0) {
        return -1;
    }
    ret |= fdt_setprop_string(fdt, node, "compatible", "ibm,random");

    return ret ? -1 : 0;
}

static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt)
{
    MachineState *ms = MACHINE(spapr);
    int rtas;
    GString *hypertas = g_string_sized_new(256);
    GString *qemu_hypertas = g_string_sized_new(256);
    uint64_t max_device_addr = MACHINE(spapr)->device_memory->base +
        memory_region_size(&MACHINE(spapr)->device_memory->mr);
    uint32_t lrdr_capacity[] = {
        cpu_to_be32(max_device_addr >> 32),
        cpu_to_be32(max_device_addr & 0xffffffff),
        cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE >> 32),
        cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff),
        cpu_to_be32(ms->smp.max_cpus / ms->smp.threads),
    };

    _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));

    /* hypertas */
    add_str(hypertas, "hcall-pft");
    add_str(hypertas, "hcall-term");
    add_str(hypertas, "hcall-dabr");
    add_str(hypertas, "hcall-interrupt");
    add_str(hypertas, "hcall-tce");
    add_str(hypertas, "hcall-vio");
    add_str(hypertas, "hcall-splpar");
    add_str(hypertas, "hcall-join");
    add_str(hypertas, "hcall-bulk");
    add_str(hypertas, "hcall-set-mode");
    add_str(hypertas, "hcall-sprg0");
    add_str(hypertas, "hcall-copy");
    add_str(hypertas, "hcall-debug");
    add_str(hypertas, "hcall-vphn");
    if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) {
        add_str(hypertas, "hcall-rpt-invalidate");
    }

    add_str(qemu_hypertas, "hcall-memop1");

    if (!kvm_enabled() || kvmppc_spapr_use_multitce()) {
        add_str(hypertas, "hcall-multi-tce");
    }

    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
        add_str(hypertas, "hcall-hpt-resize");
    }

    _FDT(fdt_setprop(fdt, rtas, "ibm,hypertas-functions",
                     hypertas->str, hypertas->len));
    g_string_free(hypertas, TRUE);
    _FDT(fdt_setprop(fdt, rtas, "qemu,hypertas-functions",
                     qemu_hypertas->str, qemu_hypertas->len));
    g_string_free(qemu_hypertas, TRUE);

    spapr_numa_write_rtas_dt(spapr, fdt, rtas);

    /*
     * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log,
     * and 16 bytes per CPU for system reset error log plus an extra 8 bytes.
     *
     * The system reset requirements are driven by existing Linux and PowerVM
     * implementation which (contrary to PAPR) saves r3 in the error log
     * structure like machine check, so Linux expects to find the saved r3
     * value at the address in r3 upon FWNMI-enabled sreset interrupt (and
     * does not look at the error value).
     *
     * System reset interrupts are not subject to interlock like machine
     * check, so this memory area could be corrupted if the sreset is
     * interrupted by a machine check (or vice versa) if it was shared. To
     * prevent this, system reset uses per-CPU areas for the sreset save
     * area. A system reset that interrupts a system reset handler could
     * still overwrite this area, but Linux doesn't try to recover in that
     * case anyway.
     *
     * The extra 8 bytes is required because Linux's FWNMI error log check
     * is off-by-one.
     *
     * RTAS_MIN_SIZE is required for the RTAS blob itself.
     */
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE +
                          RTAS_ERROR_LOG_MAX +
                          ms->smp.max_cpus * sizeof(uint64_t) * 2 +
                          sizeof(uint64_t)));
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max",
                          RTAS_ERROR_LOG_MAX));
    _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate",
                          RTAS_EVENT_SCAN_RATE));

    g_assert(msi_nonbroken);
    _FDT(fdt_setprop(fdt, rtas, "ibm,change-msix-capable", NULL, 0));

    /*
     * According to PAPR, the RTAS ibm,os-term call does not guarantee a
     * return back to the guest CPU.
     *
     * The additional ibm,extended-os-term property indicates that the
     * RTAS call will always return, so set it.
     */
    _FDT(fdt_setprop(fdt, rtas, "ibm,extended-os-term", NULL, 0));

    _FDT(fdt_setprop(fdt, rtas, "ibm,lrdr-capacity",
                     lrdr_capacity, sizeof(lrdr_capacity)));

    spapr_dt_rtas_tokens(fdt, rtas);
}

/*
 * Prepare ibm,arch-vec-5-platform-support, which indicates the MMU
 * and the XIVE features that the guest may request and thus the valid
 * values for bytes 23..26 of option vector 5:
 */
static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
                                          int chosen)
{
    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);

    char val[2 * 4] = {
        23, 0x00, /* XICS / XIVE mode */
        24, 0x00, /* Hash/Radix, filled in below. */
        25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
        26, 0x40, /* Radix options: GTSE == yes. */
    };
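    /*
     * Each pair above is (option vector 5 byte index, supported values).
     * Byte 24 is filled in below: 0x00 hash only, 0x40 radix only
     * (OV5_MMU_RADIX_300), 0x80 hash or radix (OV5_MMU_BOTH), and 0xC0
     * both flags together, as used for TCG.
     */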

    if (spapr->irq->xics && spapr->irq->xive) {
        val[1] = SPAPR_OV5_XIVE_BOTH;
    } else if (spapr->irq->xive) {
        val[1] = SPAPR_OV5_XIVE_EXPLOIT;
    } else {
        assert(spapr->irq->xics);
        val[1] = SPAPR_OV5_XIVE_LEGACY;
    }

    if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
                          first_ppc_cpu->compat_pvr)) {
        /*
         * If we're in a pre POWER9 compat mode then the guest should
         * do hash and use the legacy interrupt mode
         */
        val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */
        val[3] = 0x00; /* Hash */
        spapr_check_mmu_mode(false);
    } else if (kvm_enabled()) {
        if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
            val[3] = 0x80; /* OV5_MMU_BOTH */
        } else if (kvmppc_has_cap_mmu_radix()) {
            val[3] = 0x40; /* OV5_MMU_RADIX_300 */
        } else {
            val[3] = 0x00; /* Hash */
        }
    } else {
        /* V3 MMU supports both hash and radix in tcg (with dynamic switching) */
        val[3] = 0xC0;
    }
    _FDT(fdt_setprop(fdt, chosen, "ibm,arch-vec-5-platform-support",
                     val, sizeof(val)));
}

static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt, bool reset)
{
    MachineState *machine = MACHINE(spapr);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    int chosen;

    _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));

    if (reset) {
        const char *boot_device = spapr->boot_device;
        char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus);
        size_t cb = 0;
        char *bootlist = get_boot_devices_list(&cb);

        if (machine->kernel_cmdline && machine->kernel_cmdline[0]) {
            _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
                                    machine->kernel_cmdline));
        }

        if (spapr->initrd_size) {
            _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
                                  spapr->initrd_base));
            _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
                                  spapr->initrd_base + spapr->initrd_size));
        }

        if (spapr->kernel_size) {
            uint64_t kprop[2] = { cpu_to_be64(spapr->kernel_addr),
                                  cpu_to_be64(spapr->kernel_size) };

            _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel",
                         &kprop, sizeof(kprop)));
            if (spapr->kernel_le) {
                _FDT(fdt_setprop(fdt, chosen, "qemu,boot-kernel-le", NULL, 0));
            }
        }
        if (boot_menu) {
            _FDT((fdt_setprop_cell(fdt, chosen, "qemu,boot-menu", boot_menu)));
        }
        _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-width", graphic_width));
        _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-height", graphic_height));
        _FDT(fdt_setprop_cell(fdt, chosen, "qemu,graphic-depth", graphic_depth));

        if (cb && bootlist) {
            int i;

            for (i = 0; i < cb; i++) {
                if (bootlist[i] == '\n') {
                    bootlist[i] = ' ';
                }
            }
            _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-list", bootlist));
        }

        if (boot_device && strlen(boot_device)) {
            _FDT(fdt_setprop_string(fdt, chosen, "qemu,boot-device", boot_device));
        }

        if (!spapr->has_graphics && stdout_path) {
            /*
             * "linux,stdout-path" and "stdout" properties are
             * deprecated by linux kernel. New platforms should only
             * use the "stdout-path" property. Set the new property
             * and continue using older property to remain compatible
             * with the existing firmware.
             */
            _FDT(fdt_setprop_string(fdt, chosen, "linux,stdout-path", stdout_path));
            _FDT(fdt_setprop_string(fdt, chosen, "stdout-path", stdout_path));
        }

        /*
         * We can deal with BAR reallocation just fine, advertise it
         * to the guest
         */
        if (smc->linux_pci_probe) {
            _FDT(fdt_setprop_cell(fdt, chosen, "linux,pci-probe-only", 0));
        }

        spapr_dt_ov5_platform_support(spapr, fdt, chosen);

        g_free(stdout_path);
        g_free(bootlist);
    }

    _FDT(spapr_dt_ovec(fdt, chosen, spapr->ov5_cas, "ibm,architecture-vec-5"));
}

static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
{
    /* The /hypervisor node isn't in PAPR - this is a hack to allow PR
     * KVM to work under pHyp with some guest co-operation */
    int hypervisor;
    uint8_t hypercall[16];

    _FDT(hypervisor = fdt_add_subnode(fdt, 0, "hypervisor"));
    /* indicate KVM hypercall interface */
    _FDT(fdt_setprop_string(fdt, hypervisor, "compatible", "linux,kvm"));
    if (kvmppc_has_cap_fixup_hcalls()) {
        /*
         * Older KVM versions with older guest kernels were broken
         * with the magic page, don't allow the guest to map it.
         */
        if (!kvmppc_get_hypercall(first_cpu->env_ptr, hypercall,
                                  sizeof(hypercall))) {
            _FDT(fdt_setprop(fdt, hypervisor, "hcall-instructions",
                             hypercall, sizeof(hypercall)));
        }
    }
}

void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space)
{
    MachineState *machine = MACHINE(spapr);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    uint32_t root_drc_type_mask = 0;
    int ret;
    void *fdt;
    SpaprPhbState *phb;
    char *buf;

    fdt = g_malloc0(space);
    _FDT((fdt_create_empty_tree(fdt, space)));

    /* Root node */
    _FDT(fdt_setprop_string(fdt, 0, "device_type", "chrp"));
    _FDT(fdt_setprop_string(fdt, 0, "model", "IBM pSeries (emulated by qemu)"));
    _FDT(fdt_setprop_string(fdt, 0, "compatible", "qemu,pseries"));

    /* Guest UUID & Name */
    buf = qemu_uuid_unparse_strdup(&qemu_uuid);
    _FDT(fdt_setprop_string(fdt, 0, "vm,uuid", buf));
    if (qemu_uuid_set) {
        _FDT(fdt_setprop_string(fdt, 0, "system-id", buf));
    }
    g_free(buf);

    if (qemu_get_vm_name()) {
        _FDT(fdt_setprop_string(fdt, 0, "ibm,partition-name",
                                qemu_get_vm_name()));
    }

    /* Host Model & Serial Number */
    if (spapr->host_model) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", spapr->host_model));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_model(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-model", buf));
        g_free(buf);
    }

    if (spapr->host_serial) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", spapr->host_serial));
    } else if (smc->broken_host_serial_model && kvmppc_get_host_serial(&buf)) {
        _FDT(fdt_setprop_string(fdt, 0, "host-serial", buf));
        g_free(buf);
    }

    _FDT(fdt_setprop_cell(fdt, 0, "#address-cells", 2));
    _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));

    /* /interrupt controller */
    spapr_irq_dt(spapr, spapr_max_server_number(spapr), fdt, PHANDLE_INTC);

    ret = spapr_dt_memory(spapr, fdt);
    if (ret < 0) {
        error_report("couldn't setup memory nodes in fdt");
        exit(1);
    }

    /* /vdevice */
    spapr_dt_vdevice(spapr->vio_bus, fdt);

    if (object_resolve_path_type("", TYPE_SPAPR_RNG, NULL)) {
        ret = spapr_dt_rng(fdt);
        if (ret < 0) {
            error_report("could not set up rng device in the fdt");
            exit(1);
        }
    }

    QLIST_FOREACH(phb, &spapr->phbs, list) {
        ret = spapr_dt_phb(spapr, phb, PHANDLE_INTC, fdt, NULL);
        if (ret < 0) {
            error_report("couldn't setup PCI devices in fdt");
            exit(1);
        }
    }

    spapr_dt_cpus(fdt, spapr);

    /* ibm,drc-indexes and friends */
    if (smc->dr_lmb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_LMB;
    }
    if (smc->dr_phb_enabled) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PHB;
    }
    if (mc->nvdimm_supported) {
        root_drc_type_mask |= SPAPR_DR_CONNECTOR_TYPE_PMEM;
    }
    if (root_drc_type_mask) {
        _FDT(spapr_dt_drc(fdt, 0, NULL, root_drc_type_mask));
    }

    if (mc->has_hotpluggable_cpus) {
        int offset = fdt_path_offset(fdt, "/cpus");
        ret = spapr_dt_drc(fdt, offset, NULL, SPAPR_DR_CONNECTOR_TYPE_CPU);
        if (ret < 0) {
            error_report("Couldn't set up CPU DR device tree properties");
            exit(1);
        }
    }

    /* /event-sources */
    spapr_dt_events(spapr, fdt);

    /* /rtas */
    spapr_dt_rtas(spapr, fdt);

    /* /chosen */
    spapr_dt_chosen(spapr, fdt, reset);

    /* /hypervisor */
    if (kvm_enabled()) {
        spapr_dt_hypervisor(spapr, fdt);
    }

    /* Build memory reserve map */
    if (reset) {
        if (spapr->kernel_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->kernel_addr,
                                  spapr->kernel_size)));
        }
        if (spapr->initrd_size) {
            _FDT((fdt_add_mem_rsv(fdt, spapr->initrd_base,
                                  spapr->initrd_size)));
        }
    }

    /* NVDIMM devices */
    if (mc->nvdimm_supported) {
        spapr_dt_persistent_memory(spapr, fdt);
    }

    return fdt;
}

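/*
 * Kernels may be linked at a high virtual address (e.g. 0xc000000000000000
 * for Linux ppc64); keep only the low 28 bits (a 256 MiB window) and rebase
 * the result at the configured kernel_addr so the image lands in guest RAM.
 */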
static uint64_t translate_kernel_address(void *opaque, uint64_t addr)
{
    SpaprMachineState *spapr = opaque;

    return (addr & 0x0fffffff) + spapr->kernel_addr;
}

static void emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
                                    PowerPCCPU *cpu)
{
    CPUPPCState *env = &cpu->env;

    /* The TCG path should also be holding the BQL at this point */
    g_assert(qemu_mutex_iothread_locked());

    if (msr_pr) {
        hcall_dprintf("Hypercall made with MSR[PR]=1\n");
        env->gpr[3] = H_PRIVILEGE;
    } else {
        env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
    }
}

struct LPCRSyncState {
    target_ulong value;
    target_ulong mask;
};

static void do_lpcr_sync(CPUState *cs, run_on_cpu_data arg)
{
    struct LPCRSyncState *s = arg.host_ptr;
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    target_ulong lpcr;

    cpu_synchronize_state(cs);
    lpcr = env->spr[SPR_LPCR];
    lpcr &= ~s->mask;
    lpcr |= s->value;
    ppc_store_lpcr(cpu, lpcr);
}

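/*
 * Apply a masked read-modify-write of the LPCR on every vCPU, using
 * run_on_cpu() so that do_lpcr_sync() executes in each vCPU's own thread.
 */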
void spapr_set_all_lpcrs(target_ulong value, target_ulong mask)
{
    CPUState *cs;
    struct LPCRSyncState s = {
        .value = value,
        .mask = mask
    };
    CPU_FOREACH(cs) {
        run_on_cpu(cs, do_lpcr_sync, RUN_ON_CPU_HOST_PTR(&s));
    }
}

static void spapr_get_pate(PPCVirtualHypervisor *vhyp, ppc_v3_pate_t *entry)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);

    /* Copy PATE1:GR into PATE0:HR */
    entry->dw0 = spapr->patb_entry & PATE0_HR;
    entry->dw1 = spapr->patb_entry;
}

#define HPTE(_table, _i)   (void *)(((uint64_t *)(_table)) + ((_i) * 2))
#define HPTE_VALID(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_VALID)
#define HPTE_DIRTY(_hpte)  (tswap64(*((uint64_t *)(_hpte))) & HPTE64_V_HPTE_DIRTY)
#define CLEAN_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY))
#define DIRTY_HPTE(_hpte)  ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY))
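/*
 * Each hash PTE is two 64-bit doublewords; HPTE() yields a pointer to
 * entry _i of the table, and the VALID/DIRTY/CLEAN macros test or update
 * bits in the first doubleword, with tswap64() converting between host
 * order and the guest's big-endian table layout.
 */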

/*
 * Get the fd to access the kernel htab, re-opening it if necessary
 */
static int get_htab_fd(SpaprMachineState *spapr)
{
    Error *local_err = NULL;

    if (spapr->htab_fd >= 0) {
        return spapr->htab_fd;
    }

    spapr->htab_fd = kvmppc_get_htab_fd(false, 0, &local_err);
    if (spapr->htab_fd < 0) {
        error_report_err(local_err);
    }

    return spapr->htab_fd;
}

void close_htab_fd(SpaprMachineState *spapr)
{
    if (spapr->htab_fd >= 0) {
        close(spapr->htab_fd);
    }
    spapr->htab_fd = -1;
}

static hwaddr spapr_hpt_mask(PPCVirtualHypervisor *vhyp)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);

    return HTAB_SIZE(spapr) / HASH_PTEG_SIZE_64 - 1;
}

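/*
 * For KVM PR, hand the userspace hash table to the kernel as an SDR1-style
 * value: the host address of the table OR'ed with the HTABSIZE encoding
 * (htab_shift - 18, 2^18 being the minimum architected HPT size).
 */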
static target_ulong spapr_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);

    assert(kvm_enabled());

    if (!spapr->htab) {
        return 0;
    }

    return (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18);
}

static const ppc_hash_pte64_t *spapr_map_hptes(PPCVirtualHypervisor *vhyp,
                                                hwaddr ptex, int n)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);
    hwaddr pte_offset = ptex * HASH_PTE_SIZE_64;

    if (!spapr->htab) {
        /*
         * HTAB is controlled by KVM. Fetch into temporary buffer
         */
        ppc_hash_pte64_t *hptes = g_malloc(n * HASH_PTE_SIZE_64);
        kvmppc_read_hptes(hptes, ptex, n);
        return hptes;
    }

    /*
     * HTAB is controlled by QEMU. Just point to the internally
     * accessible PTEG.
     */
    return (const ppc_hash_pte64_t *)(spapr->htab + pte_offset);
}

static void spapr_unmap_hptes(PPCVirtualHypervisor *vhyp,
                              const ppc_hash_pte64_t *hptes,
                              hwaddr ptex, int n)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);

    if (!spapr->htab) {
        g_free((void *)hptes);
    }

    /* Nothing to do for qemu managed HPT */
}

void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
                      uint64_t pte0, uint64_t pte1)
{
    SpaprMachineState *spapr = SPAPR_MACHINE(cpu->vhyp);
    hwaddr offset = ptex * HASH_PTE_SIZE_64;

    if (!spapr->htab) {
        kvmppc_write_hpte(ptex, pte0, pte1);
    } else {
        if (pte0 & HPTE64_V_VALID) {
            stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
            /*
             * When setting valid, we write PTE1 first. This ensures
             * proper synchronization with the reading code in
             * ppc_hash64_pteg_search()
             */
            smp_wmb();
            stq_p(spapr->htab + offset, pte0);
        } else {
            stq_p(spapr->htab + offset, pte0);
            /*
             * When clearing it we set PTE0 first. This ensures proper
             * synchronization with the reading code in
             * ppc_hash64_pteg_search()
             */
            smp_wmb();
            stq_p(spapr->htab + offset + HPTE64_DW1, pte1);
        }
    }
}

static void spapr_hpte_set_c(PPCVirtualHypervisor *vhyp, hwaddr ptex,
                             uint64_t pte1)
{
    hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_C;
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);

    if (!spapr->htab) {
        /* There should always be a hash table when this is called */
        error_report("spapr_hpte_set_c called with no hash table !");
        return;
    }

    /* The HW performs a non-atomic byte update */
    stb_p(spapr->htab + offset, (pte1 & 0xff) | 0x80);
}

static void spapr_hpte_set_r(PPCVirtualHypervisor *vhyp, hwaddr ptex,
                             uint64_t pte1)
{
    hwaddr offset = ptex * HASH_PTE_SIZE_64 + HPTE64_DW1_R;
    SpaprMachineState *spapr = SPAPR_MACHINE(vhyp);

    if (!spapr->htab) {
        /* There should always be a hash table when this is called */
        error_report("spapr_hpte_set_r called with no hash table !");
        return;
    }

    /* The HW performs a non-atomic byte update */
    stb_p(spapr->htab + offset, ((pte1 >> 8) & 0xff) | 0x01);
}

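/*
 * For example (a sketch of the arithmetic below): a 4 GiB guest gives
 * ctz64(pow2ceil(4 GiB)) == 32, so the shift comes out as 32 - 7 == 25,
 * i.e. a 32 MiB hash table (1/128 of RAM), within the architected
 * [18, 46] range enforced below.
 */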
int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
{
    int shift;

    /* We aim for a hash table of size 1/128 the size of RAM (rounded
     * up).  The PAPR recommendation is actually 1/64 of RAM size, but
     * that's much more than is needed for Linux guests */
    shift = ctz64(pow2ceil(ramsize)) - 7;
    shift = MAX(shift, 18); /* Minimum architected size */
    shift = MIN(shift, 46); /* Maximum architected size */
    return shift;
}

void spapr_free_hpt(SpaprMachineState *spapr)
{
    g_free(spapr->htab);
    spapr->htab = NULL;
    spapr->htab_shift = 0;
    close_htab_fd(spapr);
}

int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp)
{
    ERRP_GUARD();
    long rc;

    /* Clean up any HPT info from a previous boot */
    spapr_free_hpt(spapr);

    rc = kvmppc_reset_htab(shift);

    if (rc == -EOPNOTSUPP) {
        error_setg(errp, "HPT not supported in nested guests");
        return -EOPNOTSUPP;
    }

    if (rc < 0) {
        /* kernel-side HPT needed, but couldn't allocate one */
        error_setg_errno(errp, errno, "Failed to allocate KVM HPT of order %d",
                         shift);
        error_append_hint(errp, "Try smaller maxmem?\n");
        return -errno;
    } else if (rc > 0) {
        /* kernel-side HPT allocated */
        if (rc != shift) {
            error_setg(errp,
                       "Requested order %d HPT, but kernel allocated order %ld",
                       shift, rc);
            error_append_hint(errp, "Try smaller maxmem?\n");
            return -ENOSPC;
        }

        spapr->htab_shift = shift;
        spapr->htab = NULL;
    } else {
        /* kernel-side HPT not needed, allocate in userspace instead */
        size_t size = 1ULL << shift;
        int i;

        spapr->htab = qemu_memalign(size, size);
        memset(spapr->htab, 0, size);
        spapr->htab_shift = shift;

        for (i = 0; i < size / HASH_PTE_SIZE_64; i++) {
            DIRTY_HPTE(HPTE(spapr->htab, i));
        }
    }
    /* We're setting up a hash table, so that means we're not radix */
    spapr->patb_entry = 0;
    spapr_set_all_lpcrs(0, LPCR_HR | LPCR_UPRT);
    return 0;
}

void spapr_setup_hpt(SpaprMachineState *spapr)
{
    int hpt_shift;

    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
        hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
    } else {
        uint64_t current_ram_size;

        current_ram_size = MACHINE(spapr)->ram_size + get_plugged_memory_size();
        hpt_shift = spapr_hpt_shift_for_ramsize(current_ram_size);
    }
    spapr_reallocate_hpt(spapr, hpt_shift, &error_fatal);

    if (kvm_enabled()) {
        hwaddr vrma_limit = kvmppc_vrma_limit(spapr->htab_shift);

        /* Check our RMA fits in the possible VRMA */
        if (vrma_limit < spapr->rma_size) {
1562            error_report("Unable to create %" HWADDR_PRIu
1563                         "MiB RMA (VRMA only allows %" HWADDR_PRIu "MiB",
1564                         spapr->rma_size / MiB, vrma_limit / MiB);
1565            exit(EXIT_FAILURE);
1566        }
1567    }
1568}
1569
1570void spapr_check_mmu_mode(bool guest_radix)
1571{
1572    if (guest_radix) {
1573        if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) {
1574            error_report("Guest requested unavailable MMU mode (radix).");
1575            exit(EXIT_FAILURE);
1576        }
1577    } else {
1578        if (kvm_enabled() && kvmppc_has_cap_mmu_radix()
1579            && !kvmppc_has_cap_mmu_hash_v3()) {
1580            error_report("Guest requested unavailable MMU mode (hash).");
1581            exit(EXIT_FAILURE);
1582        }
1583    }
1584}
1585
1586static void spapr_machine_reset(MachineState *machine)
1587{
1588    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
1589    PowerPCCPU *first_ppc_cpu;
1590    hwaddr fdt_addr;
1591    void *fdt;
1592    int rc;
1593
1594    pef_kvm_reset(machine->cgs, &error_fatal);
1595    spapr_caps_apply(spapr);
1596
1597    first_ppc_cpu = POWERPC_CPU(first_cpu);
1598    if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
1599        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
1600                              spapr->max_compat_pvr)) {
1601        /*
1602         * If using KVM with radix mode available, VCPUs can be started
1603         * without a HPT because KVM will start them in radix mode.
1604         * Set the GR bit in PATE so that we know there is no HPT.
1605         */
1606        spapr->patb_entry = PATE1_GR;
1607        spapr_set_all_lpcrs(LPCR_HR | LPCR_UPRT, LPCR_HR | LPCR_UPRT);
1608    } else {
1609        spapr_setup_hpt(spapr);
1610    }
1611
1612    qemu_devices_reset();
1613
1614    spapr_ovec_cleanup(spapr->ov5_cas);
1615    spapr->ov5_cas = spapr_ovec_new();
1616
1617    ppc_set_compat_all(spapr->max_compat_pvr, &error_fatal);
1618
1619    /*
1620     * This fixes some of the default configuration of the XIVE
1621     * devices. It must be called after the machine devices have been reset.
1622     */
1623    spapr_irq_reset(spapr, &error_fatal);
1624
1625    /*
1626     * There is no CAS under qtest. Simulate one to please the code that
1627     * depends on spapr->ov5_cas. This is especially needed to test device
1628     * unplug, so we do that before resetting the DRCs.
1629     */
1630    if (qtest_enabled()) {
1631        spapr_ovec_cleanup(spapr->ov5_cas);
1632        spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
1633    }
1634
1635    /* DRC reset may cause a device to be unplugged. This will cause trouble
1636     * if the device is used by another device (e.g., a running vhost backend
1637     * will crash QEMU if the DIMM holding the vring goes away). To avoid such
1638     * situations, we reset DRCs after all devices have been reset.
1639     */
1640    spapr_drc_reset_all(spapr);
1641
1642    spapr_clear_pending_events(spapr);
1643
1644    /*
1645     * We place the device tree just below either the top of the RMA,
1646     * or just below 2GB, whichever is lower, so that it can be
1647     * processed with 32-bit real mode code if necessary
1648     */
1649    fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE;
1650
1651    fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE);
1652    if (spapr->vof) {
1653        spapr_vof_reset(spapr, fdt, &error_fatal);
1654        /*
1655         * Do not pack the FDT as the client may change properties.
1656         * VOF client does not expect the FDT so we do not load it to the VM.
1657         */
1658    } else {
1659        rc = fdt_pack(fdt);
1660        /* Should only fail if we've built a corrupted tree */
1661        assert(rc == 0);
1662
1663        spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT,
1664                                  0, fdt_addr, 0);
1665        cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
1666    }
1667    qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
1668
1669    g_free(spapr->fdt_blob);
1670    spapr->fdt_size = fdt_totalsize(fdt);
1671    spapr->fdt_initial_size = spapr->fdt_size;
1672    spapr->fdt_blob = fdt;
1673
1674    /* Set up the entry state */
1675    first_ppc_cpu->env.gpr[5] = 0;
1676
1677    spapr->fwnmi_system_reset_addr = -1;
1678    spapr->fwnmi_machine_check_addr = -1;
1679    spapr->fwnmi_machine_check_interlock = -1;
1680
1681    /* Signal all vCPUs waiting on this condition */
1682    qemu_cond_broadcast(&spapr->fwnmi_machine_check_interlock_cond);
1683
1684    migrate_del_blocker(spapr->fwnmi_migration_blocker);
1685}
1686
1687static void spapr_create_nvram(SpaprMachineState *spapr)
1688{
1689    DeviceState *dev = qdev_new("spapr-nvram");
1690    DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0);
1691
1692    if (dinfo) {
1693        qdev_prop_set_drive_err(dev, "drive", blk_by_legacy_dinfo(dinfo),
1694                                &error_fatal);
1695    }
1696
1697    qdev_realize_and_unref(dev, &spapr->vio_bus->bus, &error_fatal);
1698
1699    spapr->nvram = (struct SpaprNvram *)dev;
1700}
1701
1702static void spapr_rtc_create(SpaprMachineState *spapr)
1703{
1704    object_initialize_child_with_props(OBJECT(spapr), "rtc", &spapr->rtc,
1705                                       sizeof(spapr->rtc), TYPE_SPAPR_RTC,
1706                                       &error_fatal, NULL);
1707    qdev_realize(DEVICE(&spapr->rtc), NULL, &error_fatal);
1708    object_property_add_alias(OBJECT(spapr), "rtc-time", OBJECT(&spapr->rtc),
1709                              "date");
1710}
1711
1712/* Returns whether we want to use VGA or not */
1713static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
1714{
1715    switch (vga_interface_type) {
1716    case VGA_NONE:
1717        return false;
1718    case VGA_DEVICE:
1719        return true;
1720    case VGA_STD:
1721    case VGA_VIRTIO:
1722    case VGA_CIRRUS:
1723        return pci_vga_init(pci_bus) != NULL;
1724    default:
1725        error_setg(errp,
1726                   "Unsupported VGA mode, only -vga std, -vga virtio or -vga cirrus are supported");
1727        return false;
1728    }
1729}
1730
1731static int spapr_pre_load(void *opaque)
1732{
1733    int rc;
1734
1735    rc = spapr_caps_pre_load(opaque);
1736    if (rc) {
1737        return rc;
1738    }
1739
1740    return 0;
1741}
1742
1743static int spapr_post_load(void *opaque, int version_id)
1744{
1745    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1746    int err = 0;
1747
1748    err = spapr_caps_post_migration(spapr);
1749    if (err) {
1750        return err;
1751    }
1752
1753    /*
1754     * In earlier versions, there was no separate qdev for the PAPR
1755     * RTC, so the RTC offset was stored directly in sPAPREnvironment.
1756     * So when migrating from those versions, poke the incoming offset
1757     * value into the RTC device
1758     */
1759    if (version_id < 3) {
1760        err = spapr_rtc_import_offset(&spapr->rtc, spapr->rtc_offset);
1761        if (err) {
1762            return err;
1763        }
1764    }
1765
1766    if (kvm_enabled() && spapr->patb_entry) {
1767        PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
1768        bool radix = !!(spapr->patb_entry & PATE1_GR);
1769        bool gtse = !!(cpu->env.spr[SPR_LPCR] & LPCR_GTSE);
1770
1771        /*
1772         * Update LPCR:HR and UPRT as they may not be set properly in
1773         * the stream
1774         */
1775        spapr_set_all_lpcrs(radix ? (LPCR_HR | LPCR_UPRT) : 0,
1776                            LPCR_HR | LPCR_UPRT);
1777
1778        err = kvmppc_configure_v3_mmu(cpu, radix, gtse, spapr->patb_entry);
1779        if (err) {
1780            error_report("Process table config unsupported by the host");
1781            return -EINVAL;
1782        }
1783    }
1784
1785    err = spapr_irq_post_load(spapr, version_id);
1786    if (err) {
1787        return err;
1788    }
1789
1790    return err;
1791}
1792
1793static int spapr_pre_save(void *opaque)
1794{
1795    int rc;
1796
1797    rc = spapr_caps_pre_save(opaque);
1798    if (rc) {
1799        return rc;
1800    }
1801
1802    return 0;
1803}
1804
1805static bool version_before_3(void *opaque, int version_id)
1806{
1807    return version_id < 3;
1808}
1809
1810static bool spapr_pending_events_needed(void *opaque)
1811{
1812    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1813    return !QTAILQ_EMPTY(&spapr->pending_events);
1814}
1815
1816static const VMStateDescription vmstate_spapr_event_entry = {
1817    .name = "spapr_event_log_entry",
1818    .version_id = 1,
1819    .minimum_version_id = 1,
1820    .fields = (VMStateField[]) {
1821        VMSTATE_UINT32(summary, SpaprEventLogEntry),
1822        VMSTATE_UINT32(extended_length, SpaprEventLogEntry),
1823        VMSTATE_VBUFFER_ALLOC_UINT32(extended_log, SpaprEventLogEntry, 0,
1824                                     NULL, extended_length),
1825        VMSTATE_END_OF_LIST()
1826    },
1827};
1828
1829static const VMStateDescription vmstate_spapr_pending_events = {
1830    .name = "spapr_pending_events",
1831    .version_id = 1,
1832    .minimum_version_id = 1,
1833    .needed = spapr_pending_events_needed,
1834    .fields = (VMStateField[]) {
1835        VMSTATE_QTAILQ_V(pending_events, SpaprMachineState, 1,
1836                         vmstate_spapr_event_entry, SpaprEventLogEntry, next),
1837        VMSTATE_END_OF_LIST()
1838    },
1839};
1840
1841static bool spapr_ov5_cas_needed(void *opaque)
1842{
1843    SpaprMachineState *spapr = opaque;
1844    SpaprOptionVector *ov5_mask = spapr_ovec_new();
1845    bool cas_needed;
1846
1847    /* Prior to the introduction of SpaprOptionVector, we had two option
1848     * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
1849     * Both of these options encode machine topology into the device-tree
1850     * in such a way that the now-booted OS should still be able to interact
1851     * appropriately with QEMU regardless of what options were actually
1852     * negotiated on the source side.
1853     *
1854     * As such, we can avoid migrating the CAS-negotiated options if these
1855     * are the only options available on the current machine/platform.
1856     * Since these are the only options available for pseries-2.7 and
1857     * earlier, this allows us to maintain old->new/new->old migration
1858     * compatibility.
1859     *
1860     * For QEMU 2.8+, there are additional CAS-negotiable options available
1861     * via default pseries-2.8 machines and explicit command-line parameters.
1862     * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
1863     * of the actual CAS-negotiated values to continue working properly. For
1864     * example, availability of memory unplug depends on knowing whether
1865     * OV5_HP_EVT was negotiated via CAS.
1866     *
1867     * Thus, for any cases where the set of available CAS-negotiable
1868     * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
1869     * include the CAS-negotiated options in the migration stream, unless
1870     * they only affect boot-time behaviour.
1871     */
1872    spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
1873    spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
1874    spapr_ovec_set(ov5_mask, OV5_DRMEM_V2);
1875
1876    /* We need extra information if we have any bits outside the mask
1877     * defined above */
1878    cas_needed = !spapr_ovec_subset(spapr->ov5, ov5_mask);
1879
1880    spapr_ovec_cleanup(ov5_mask);
1881
1882    return cas_needed;
1883}
1884
1885static const VMStateDescription vmstate_spapr_ov5_cas = {
1886    .name = "spapr_option_vector_ov5_cas",
1887    .version_id = 1,
1888    .minimum_version_id = 1,
1889    .needed = spapr_ov5_cas_needed,
1890    .fields = (VMStateField[]) {
1891        VMSTATE_STRUCT_POINTER_V(ov5_cas, SpaprMachineState, 1,
1892                                 vmstate_spapr_ovec, SpaprOptionVector),
1893        VMSTATE_END_OF_LIST()
1894    },
1895};
1896
1897static bool spapr_patb_entry_needed(void *opaque)
1898{
1899    SpaprMachineState *spapr = opaque;
1900
1901    return !!spapr->patb_entry;
1902}
1903
1904static const VMStateDescription vmstate_spapr_patb_entry = {
1905    .name = "spapr_patb_entry",
1906    .version_id = 1,
1907    .minimum_version_id = 1,
1908    .needed = spapr_patb_entry_needed,
1909    .fields = (VMStateField[]) {
1910        VMSTATE_UINT64(patb_entry, SpaprMachineState),
1911        VMSTATE_END_OF_LIST()
1912    },
1913};
1914
1915static bool spapr_irq_map_needed(void *opaque)
1916{
1917    SpaprMachineState *spapr = opaque;
1918
1919    return spapr->irq_map && !bitmap_empty(spapr->irq_map, spapr->irq_map_nr);
1920}
1921
1922static const VMStateDescription vmstate_spapr_irq_map = {
1923    .name = "spapr_irq_map",
1924    .version_id = 1,
1925    .minimum_version_id = 1,
1926    .needed = spapr_irq_map_needed,
1927    .fields = (VMStateField[]) {
1928        VMSTATE_BITMAP(irq_map, SpaprMachineState, 0, irq_map_nr),
1929        VMSTATE_END_OF_LIST()
1930    },
1931};
1932
1933static bool spapr_dtb_needed(void *opaque)
1934{
1935    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(opaque);
1936
1937    return smc->update_dt_enabled;
1938}
1939
1940static int spapr_dtb_pre_load(void *opaque)
1941{
1942    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1943
1944    g_free(spapr->fdt_blob);
1945    spapr->fdt_blob = NULL;
1946    spapr->fdt_size = 0;
1947
1948    return 0;
1949}
1950
1951static const VMStateDescription vmstate_spapr_dtb = {
1952    .name = "spapr_dtb",
1953    .version_id = 1,
1954    .minimum_version_id = 1,
1955    .needed = spapr_dtb_needed,
1956    .pre_load = spapr_dtb_pre_load,
1957    .fields = (VMStateField[]) {
1958        VMSTATE_UINT32(fdt_initial_size, SpaprMachineState),
1959        VMSTATE_UINT32(fdt_size, SpaprMachineState),
1960        VMSTATE_VBUFFER_ALLOC_UINT32(fdt_blob, SpaprMachineState, 0, NULL,
1961                                     fdt_size),
1962        VMSTATE_END_OF_LIST()
1963    },
1964};
1965
1966static bool spapr_fwnmi_needed(void *opaque)
1967{
1968    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1969
1970    return spapr->fwnmi_machine_check_addr != -1;
1971}
1972
1973static int spapr_fwnmi_pre_save(void *opaque)
1974{
1975    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
1976
1977    /*
1978     * Check if machine check handling is in progress and print a
1979     * warning message.
1980     */
1981    if (spapr->fwnmi_machine_check_interlock != -1) {
1982        warn_report("A machine check is being handled during migration. The "
1983                "handler may run and log a hardware error on the destination");
1984    }
1985
1986    return 0;
1987}
1988
1989static const VMStateDescription vmstate_spapr_fwnmi = {
1990    .name = "spapr_fwnmi",
1991    .version_id = 1,
1992    .minimum_version_id = 1,
1993    .needed = spapr_fwnmi_needed,
1994    .pre_save = spapr_fwnmi_pre_save,
1995    .fields = (VMStateField[]) {
1996        VMSTATE_UINT64(fwnmi_system_reset_addr, SpaprMachineState),
1997        VMSTATE_UINT64(fwnmi_machine_check_addr, SpaprMachineState),
1998        VMSTATE_INT32(fwnmi_machine_check_interlock, SpaprMachineState),
1999        VMSTATE_END_OF_LIST()
2000    },
2001};
2002
2003static const VMStateDescription vmstate_spapr = {
2004    .name = "spapr",
2005    .version_id = 3,
2006    .minimum_version_id = 1,
2007    .pre_load = spapr_pre_load,
2008    .post_load = spapr_post_load,
2009    .pre_save = spapr_pre_save,
2010    .fields = (VMStateField[]) {
2011        /* used to be @next_irq */
2012        VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),
2013
2014        /* RTC offset */
2015        VMSTATE_UINT64_TEST(rtc_offset, SpaprMachineState, version_before_3),
2016
2017        VMSTATE_PPC_TIMEBASE_V(tb, SpaprMachineState, 2),
2018        VMSTATE_END_OF_LIST()
2019    },
2020    .subsections = (const VMStateDescription*[]) {
2021        &vmstate_spapr_ov5_cas,
2022        &vmstate_spapr_patb_entry,
2023        &vmstate_spapr_pending_events,
2024        &vmstate_spapr_cap_htm,
2025        &vmstate_spapr_cap_vsx,
2026        &vmstate_spapr_cap_dfp,
2027        &vmstate_spapr_cap_cfpc,
2028        &vmstate_spapr_cap_sbbc,
2029        &vmstate_spapr_cap_ibs,
2030        &vmstate_spapr_cap_hpt_maxpagesize,
2031        &vmstate_spapr_irq_map,
2032        &vmstate_spapr_cap_nested_kvm_hv,
2033        &vmstate_spapr_dtb,
2034        &vmstate_spapr_cap_large_decr,
2035        &vmstate_spapr_cap_ccf_assist,
2036        &vmstate_spapr_cap_fwnmi,
2037        &vmstate_spapr_fwnmi,
2038        &vmstate_spapr_cap_rpt_invalidate,
2039        NULL
2040    }
2041};
2042
2043static int htab_save_setup(QEMUFile *f, void *opaque)
2044{
2045    SpaprMachineState *spapr = opaque;
2046
2047    /* "Iteration" header */
2048    if (!spapr->htab_shift) {
2049        qemu_put_be32(f, -1);
2050    } else {
2051        qemu_put_be32(f, spapr->htab_shift);
2052    }
2053
2054    if (spapr->htab) {
2055        spapr->htab_save_index = 0;
2056        spapr->htab_first_pass = true;
2057    } else {
2058        if (spapr->htab_shift) {
2059            assert(kvm_enabled());
2060        }
2061    }
2062
2064    return 0;
2065}
2066
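    /*
     * Stream layout produced below: each chunk is a be32 start index,
     * a be16 count of valid HPTEs, a be16 count of invalid HPTEs, then
     * the raw bytes of the valid HPTEs (HASH_PTE_SIZE_64 each).  A
     * 0/0/0 chunk marks the end of a section.
     */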
2067static void htab_save_chunk(QEMUFile *f, SpaprMachineState *spapr,
2068                            int chunkstart, int n_valid, int n_invalid)
2069{
2070    qemu_put_be32(f, chunkstart);
2071    qemu_put_be16(f, n_valid);
2072    qemu_put_be16(f, n_invalid);
2073    qemu_put_buffer(f, HPTE(spapr->htab, chunkstart),
2074                    HASH_PTE_SIZE_64 * n_valid);
2075}
2076
2077static void htab_save_end_marker(QEMUFile *f)
2078{
2079    qemu_put_be32(f, 0);
2080    qemu_put_be16(f, 0);
2081    qemu_put_be16(f, 0);
2082}
2083
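    /*
     * First pass: walk the whole table and send every valid HPTE,
     * clearing dirty bits as we go; htab_save_later_pass() then only
     * resends entries dirtied since.
     */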
2084static void htab_save_first_pass(QEMUFile *f, SpaprMachineState *spapr,
2085                                 int64_t max_ns)
2086{
2087    bool has_timeout = max_ns != -1;
2088    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2089    int index = spapr->htab_save_index;
2090    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2091
2092    assert(spapr->htab_first_pass);
2093
2094    do {
2095        int chunkstart;
2096
2097        /* Consume invalid HPTEs */
2098        while ((index < htabslots)
2099               && !HPTE_VALID(HPTE(spapr->htab, index))) {
2100            CLEAN_HPTE(HPTE(spapr->htab, index));
2101            index++;
2102        }
2103
2104        /* Consume valid HPTEs */
2105        chunkstart = index;
2106        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2107               && HPTE_VALID(HPTE(spapr->htab, index))) {
2108            CLEAN_HPTE(HPTE(spapr->htab, index));
2109            index++;
2110        }
2111
2112        if (index > chunkstart) {
2113            int n_valid = index - chunkstart;
2114
2115            htab_save_chunk(f, spapr, chunkstart, n_valid, 0);
2116
2117            if (has_timeout &&
2118                (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2119                break;
2120            }
2121        }
2122    } while ((index < htabslots) && !qemu_file_rate_limit(f));
2123
2124    if (index >= htabslots) {
2125        assert(index == htabslots);
2126        index = 0;
2127        spapr->htab_first_pass = false;
2128    }
2129    spapr->htab_save_index = index;
2130}
2131
2132static int htab_save_later_pass(QEMUFile *f, SpaprMachineState *spapr,
2133                                int64_t max_ns)
2134{
2135    bool final = max_ns < 0;
2136    int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64;
2137    int examined = 0, sent = 0;
2138    int index = spapr->htab_save_index;
2139    int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2140
2141    assert(!spapr->htab_first_pass);
2142
2143    do {
2144        int chunkstart, invalidstart;
2145
2146        /* Consume non-dirty HPTEs */
2147        while ((index < htabslots)
2148               && !HPTE_DIRTY(HPTE(spapr->htab, index))) {
2149            index++;
2150            examined++;
2151        }
2152
2153        chunkstart = index;
2154        /* Consume valid dirty HPTEs */
2155        while ((index < htabslots) && (index - chunkstart < USHRT_MAX)
2156               && HPTE_DIRTY(HPTE(spapr->htab, index))
2157               && HPTE_VALID(HPTE(spapr->htab, index))) {
2158            CLEAN_HPTE(HPTE(spapr->htab, index));
2159            index++;
2160            examined++;
2161        }
2162
2163        invalidstart = index;
2164        /* Consume invalid dirty HPTEs */
2165        while ((index < htabslots) && (index - invalidstart < USHRT_MAX)
2166               && HPTE_DIRTY(HPTE(spapr->htab, index))
2167               && !HPTE_VALID(HPTE(spapr->htab, index))) {
2168            CLEAN_HPTE(HPTE(spapr->htab, index));
2169            index++;
2170            examined++;
2171        }
2172
2173        if (index > chunkstart) {
2174            int n_valid = invalidstart - chunkstart;
2175            int n_invalid = index - invalidstart;
2176
2177            htab_save_chunk(f, spapr, chunkstart, n_valid, n_invalid);
2178            sent += index - chunkstart;
2179
2180            if (!final && (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) > max_ns) {
2181                break;
2182            }
2183        }
2184
2185        if (examined >= htabslots) {
2186            break;
2187        }
2188
2189        if (index >= htabslots) {
2190            assert(index == htabslots);
2191            index = 0;
2192        }
2193    } while ((examined < htabslots) && (!qemu_file_rate_limit(f) || final));
2194
2195    if (index >= htabslots) {
2196        assert(index == htabslots);
2197        index = 0;
2198    }
2199
2200    spapr->htab_save_index = index;
2201
2202    return (examined >= htabslots) && (sent == 0) ? 1 : 0;
2203}
2204
2205#define MAX_ITERATION_NS    5000000 /* 5 ms */
2206#define MAX_KVM_BUF_SIZE    2048
2207
2208static int htab_save_iterate(QEMUFile *f, void *opaque)
2209{
2210    SpaprMachineState *spapr = opaque;
2211    int fd;
2212    int rc = 0;
2213
2214    /* Iteration header */
2215    if (!spapr->htab_shift) {
2216        qemu_put_be32(f, -1);
2217        return 1;
2218    } else {
2219        qemu_put_be32(f, 0);
2220    }
2221
2222    if (!spapr->htab) {
2223        assert(kvm_enabled());
2224
2225        fd = get_htab_fd(spapr);
2226        if (fd < 0) {
2227            return fd;
2228        }
2229
2230        rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, MAX_ITERATION_NS);
2231        if (rc < 0) {
2232            return rc;
2233        }
2234    } else if (spapr->htab_first_pass) {
2235        htab_save_first_pass(f, spapr, MAX_ITERATION_NS);
2236    } else {
2237        rc = htab_save_later_pass(f, spapr, MAX_ITERATION_NS);
2238    }
2239
2240    htab_save_end_marker(f);
2241
2242    return rc;
2243}
2244
2245static int htab_save_complete(QEMUFile *f, void *opaque)
2246{
2247    SpaprMachineState *spapr = opaque;
2248    int fd;
2249
2250    /* Iteration header */
2251    if (!spapr->htab_shift) {
2252        qemu_put_be32(f, -1);
2253        return 0;
2254    } else {
2255        qemu_put_be32(f, 0);
2256    }
2257
2258    if (!spapr->htab) {
2259        int rc;
2260
2261        assert(kvm_enabled());
2262
2263        fd = get_htab_fd(spapr);
2264        if (fd < 0) {
2265            return fd;
2266        }
2267
2268        rc = kvmppc_save_htab(f, fd, MAX_KVM_BUF_SIZE, -1);
2269        if (rc < 0) {
2270            return rc;
2271        }
2272    } else {
2273        if (spapr->htab_first_pass) {
2274            htab_save_first_pass(f, spapr, -1);
2275        }
2276        htab_save_later_pass(f, spapr, -1);
2277    }
2278
2279    /* End marker */
2280    htab_save_end_marker(f);
2281
2282    return 0;
2283}
2284
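    /*
     * Incoming stream layout, mirroring the save side: a be32 header
     * that is -1 (no HPT), a non-zero htab_shift (first section, which
     * allocates the table), or 0 (chunked HPTE data follows, ended by a
     * 0/0/0 marker).
     */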
2285static int htab_load(QEMUFile *f, void *opaque, int version_id)
2286{
2287    SpaprMachineState *spapr = opaque;
2288    uint32_t section_hdr;
2289    int fd = -1;
2290    Error *local_err = NULL;
2291
2292    if (version_id != 1) {
2293        error_report("htab_load() bad version");
2294        return -EINVAL;
2295    }
2296
2297    section_hdr = qemu_get_be32(f);
2298
2299    if (section_hdr == -1) {
2300        spapr_free_hpt(spapr);
2301        return 0;
2302    }
2303
2304    if (section_hdr) {
2305        int ret;
2306
2307        /* First section gives the htab size */
2308        ret = spapr_reallocate_hpt(spapr, section_hdr, &local_err);
2309        if (ret < 0) {
2310            error_report_err(local_err);
2311            return ret;
2312        }
2313        return 0;
2314    }
2315
2316    if (!spapr->htab) {
2317        assert(kvm_enabled());
2318
2319        fd = kvmppc_get_htab_fd(true, 0, &local_err);
2320        if (fd < 0) {
2321            error_report_err(local_err);
2322            return fd;
2323        }
2324    }
2325
2326    while (true) {
2327        uint32_t index;
2328        uint16_t n_valid, n_invalid;
2329
2330        index = qemu_get_be32(f);
2331        n_valid = qemu_get_be16(f);
2332        n_invalid = qemu_get_be16(f);
2333
2334        if ((index == 0) && (n_valid == 0) && (n_invalid == 0)) {
2335            /* End of Stream */
2336            break;
2337        }
2338
2339        if ((index + n_valid + n_invalid) >
2340            (HTAB_SIZE(spapr) / HASH_PTE_SIZE_64)) {
2341            /* Bad index in stream */
2342            error_report(
2343                "htab_load() bad index %d (%hd+%hd entries) in htab stream (htab_shift=%d)",
2344                index, n_valid, n_invalid, spapr->htab_shift);
2345            return -EINVAL;
2346        }
2347
2348        if (spapr->htab) {
2349            if (n_valid) {
2350                qemu_get_buffer(f, HPTE(spapr->htab, index),
2351                                HASH_PTE_SIZE_64 * n_valid);
2352            }
2353            if (n_invalid) {
2354                memset(HPTE(spapr->htab, index + n_valid), 0,
2355                       HASH_PTE_SIZE_64 * n_invalid);
2356            }
2357        } else {
2358            int rc;
2359
2360            assert(fd >= 0);
2361
2362            rc = kvmppc_load_htab_chunk(f, fd, index, n_valid, n_invalid,
2363                                        &local_err);
2364            if (rc < 0) {
2365                error_report_err(local_err);
2366                return rc;
2367            }
2368        }
2369    }
2370
2371    if (!spapr->htab) {
2372        assert(fd >= 0);
2373        close(fd);
2374    }
2375
2376    return 0;
2377}
2378
2379static void htab_save_cleanup(void *opaque)
2380{
2381    SpaprMachineState *spapr = opaque;
2382
2383    close_htab_fd(spapr);
2384}
2385
2386static SaveVMHandlers savevm_htab_handlers = {
2387    .save_setup = htab_save_setup,
2388    .save_live_iterate = htab_save_iterate,
2389    .save_live_complete_precopy = htab_save_complete,
2390    .save_cleanup = htab_save_cleanup,
2391    .load_state = htab_load,
2392};
2393
2394static void spapr_boot_set(void *opaque, const char *boot_device,
2395                           Error **errp)
2396{
2397    SpaprMachineState *spapr = SPAPR_MACHINE(opaque);
2398
2399    g_free(spapr->boot_device);
2400    spapr->boot_device = g_strdup(boot_device);
2401}
2402
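    /*
     * One DR connector is created per hotpluggable LMB (a
     * SPAPR_MEMORY_BLOCK_SIZE, i.e. 256 MiB, block of device memory),
     * indexed by the LMB's address divided by the LMB size.
     */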
2403static void spapr_create_lmb_dr_connectors(SpaprMachineState *spapr)
2404{
2405    MachineState *machine = MACHINE(spapr);
2406    uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
2407    uint32_t nr_lmbs = (machine->maxram_size - machine->ram_size) / lmb_size;
2408    int i;
2409
2410    for (i = 0; i < nr_lmbs; i++) {
2411        uint64_t addr;
2412
2413        addr = i * lmb_size + machine->device_memory->base;
2414        spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_LMB,
2415                               addr / lmb_size);
2416    }
2417}
2418
2419/*
2420 * If the RAM size, maxmem size and individual node mem sizes aren't aligned
2421 * to SPAPR_MEMORY_BLOCK_SIZE (256 MiB), then refuse to start the guest
2422 * since we can't support such unaligned sizes with DRCONF_MEMORY.
2423 */
2424static void spapr_validate_node_memory(MachineState *machine, Error **errp)
2425{
2426    int i;
2427
2428    if (machine->ram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2429        error_setg(errp, "Memory size 0x" RAM_ADDR_FMT
2430                   " is not aligned to %" PRIu64 " MiB",
2431                   machine->ram_size,
2432                   SPAPR_MEMORY_BLOCK_SIZE / MiB);
2433        return;
2434    }
2435
2436    if (machine->maxram_size % SPAPR_MEMORY_BLOCK_SIZE) {
2437        error_setg(errp, "Maximum memory size 0x" RAM_ADDR_FMT
2438                   " is not aligned to %" PRIu64 " MiB",
2439                   machine->ram_size,
2440                   SPAPR_MEMORY_BLOCK_SIZE / MiB);
2441        return;
2442    }
2443
2444    for (i = 0; i < machine->numa_state->num_nodes; i++) {
2445        if (machine->numa_state->nodes[i].node_mem % SPAPR_MEMORY_BLOCK_SIZE) {
2446            error_setg(errp,
2447                       "Node %d memory size 0x%" PRIx64
2448                       " is not aligned to %" PRIu64 " MiB",
2449                       i, machine->numa_state->nodes[i].node_mem,
2450                       SPAPR_MEMORY_BLOCK_SIZE / MiB);
2451            return;
2452        }
2453    }
2454}
2455
2456/* find cpu slot in machine->possible_cpus by core_id */
2457static CPUArchId *spapr_find_cpu_slot(MachineState *ms, uint32_t id, int *idx)
2458{
2459    int index = id / ms->smp.threads;
2460
2461    if (index >= ms->possible_cpus->len) {
2462        return NULL;
2463    }
2464    if (idx) {
2465        *idx = index;
2466    }
2467    return &ms->possible_cpus->cpus[index];
2468}
2469
2470static void spapr_set_vsmt_mode(SpaprMachineState *spapr, Error **errp)
2471{
2472    MachineState *ms = MACHINE(spapr);
2473    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2474    Error *local_err = NULL;
2475    bool vsmt_user = !!spapr->vsmt;
2476    int kvm_smt = kvmppc_smt_threads();
2477    int ret;
2478    unsigned int smp_threads = ms->smp.threads;
2479
2480    if (!kvm_enabled() && (smp_threads > 1)) {
2481        error_setg(errp, "TCG cannot support more than 1 thread/core "
2482                   "on a pseries machine");
2483        return;
2484    }
2485    if (!is_power_of_2(smp_threads)) {
2486        error_setg(errp, "Cannot support %d threads/core on a pseries "
2487                   "machine because it must be a power of 2", smp_threads);
2488        return;
2489    }
2490
2491    /* Determine the VSMT mode to use: */
2492    if (vsmt_user) {
2493        if (spapr->vsmt < smp_threads) {
2494            error_setg(errp, "Cannot support VSMT mode %d"
2495                       " because it must be >= threads/core (%d)",
2496                       spapr->vsmt, smp_threads);
2497            return;
2498        }
2499        /* In this case, spapr->vsmt has been set by the command line */
2500    } else if (!smc->smp_threads_vsmt) {
2501        /*
2502         * Default VSMT value is tricky, because we need it to be as
2503         * consistent as possible (for migration), but this requires
2504         * changing it for at least some existing cases.  We pick 8 as
2505         * the value that we'd get with KVM on POWER8, the
2506         * overwhelmingly common case in production systems.
2507         */
2508        spapr->vsmt = MAX(8, smp_threads);
2509    } else {
2510        spapr->vsmt = smp_threads;
2511    }
2512
2513    /* KVM: If necessary, set the SMT mode: */
2514    if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
2515        ret = kvmppc_set_smt_threads(spapr->vsmt);
2516        if (ret) {
2517            /* Looks like KVM isn't able to change VSMT mode */
2518            error_setg(&local_err,
2519                       "Failed to set KVM's VSMT mode to %d (errno %d)",
2520                       spapr->vsmt, ret);
2521            /* We can live with that if the default one is big enough
2522             * for the number of threads, and a submultiple of the one
2523             * we want.  In this case we'll waste some vcpu ids, but
2524             * behaviour will be correct */
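                /* e.g. with smp_threads = 2, a requested vsmt = 8 on a host
                 * stuck at SMT-4 is tolerated: 4 >= 2 and 8 % 4 == 0. */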
2525            if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
2526                warn_report_err(local_err);
2527            } else {
2528                if (!vsmt_user) {
2529                    error_append_hint(&local_err,
2530                                      "On PPC, a VM with %d threads/core"
2531                                      " on a host with %d threads/core"
2532                                      " requires the use of VSMT mode %d.\n",
2533                                      smp_threads, kvm_smt, spapr->vsmt);
2534                }
2535                kvmppc_error_append_smt_possible_hint(&local_err);
2536                error_propagate(errp, local_err);
2537            }
2538        }
2539    }
2540    /* else TCG: nothing to do currently */
2541}
2542
2543static void spapr_init_cpus(SpaprMachineState *spapr)
2544{
2545    MachineState *machine = MACHINE(spapr);
2546    MachineClass *mc = MACHINE_GET_CLASS(machine);
2547    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2548    const char *type = spapr_get_cpu_core_type(machine->cpu_type);
2549    const CPUArchIdList *possible_cpus;
2550    unsigned int smp_cpus = machine->smp.cpus;
2551    unsigned int smp_threads = machine->smp.threads;
2552    unsigned int max_cpus = machine->smp.max_cpus;
2553    int boot_cores_nr = smp_cpus / smp_threads;
2554    int i;
2555
2556    possible_cpus = mc->possible_cpu_arch_ids(machine);
2557    if (mc->has_hotpluggable_cpus) {
2558        if (smp_cpus % smp_threads) {
2559            error_report("smp_cpus (%u) must be multiple of threads (%u)",
2560                         smp_cpus, smp_threads);
2561            exit(1);
2562        }
2563        if (max_cpus % smp_threads) {
2564            error_report("max_cpus (%u) must be multiple of threads (%u)",
2565                         max_cpus, smp_threads);
2566            exit(1);
2567        }
2568    } else {
2569        if (max_cpus != smp_cpus) {
2570            error_report("This machine version does not support CPU hotplug");
2571            exit(1);
2572        }
2573        boot_cores_nr = possible_cpus->len;
2574    }
2575
2576    if (smc->pre_2_10_has_unused_icps) {
2577        int i;
2578
2579        for (i = 0; i < spapr_max_server_number(spapr); i++) {
2580            /* Dummy entries get deregistered when real ICPState objects
2581             * are registered during CPU core hotplug.
2582             */
2583            pre_2_10_vmstate_register_dummy_icp(i);
2584        }
2585    }
2586
2587    for (i = 0; i < possible_cpus->len; i++) {
2588        int core_id = i * smp_threads;
2589
2590        if (mc->has_hotpluggable_cpus) {
2591            spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
2592                                   spapr_vcpu_id(spapr, core_id));
2593        }
2594
2595        if (i < boot_cores_nr) {
2596            Object *core = object_new(type);
2597            int nr_threads = smp_threads;
2598
2599            /* Handle the partially filled core for older machine types */
2600            if ((i + 1) * smp_threads >= smp_cpus) {
2601                nr_threads = smp_cpus - i * smp_threads;
2602            }
2603
2604            object_property_set_int(core, "nr-threads", nr_threads,
2605                                    &error_fatal);
2606            object_property_set_int(core, CPU_CORE_PROP_CORE_ID, core_id,
2607                                    &error_fatal);
2608            qdev_realize(DEVICE(core), NULL, &error_fatal);
2609
2610            object_unref(core);
2611        }
2612    }
2613}
2614
2615static PCIHostState *spapr_create_default_phb(void)
2616{
2617    DeviceState *dev;
2618
2619    dev = qdev_new(TYPE_SPAPR_PCI_HOST_BRIDGE);
2620    qdev_prop_set_uint32(dev, "index", 0);
2621    sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
2622
2623    return PCI_HOST_BRIDGE(dev);
2624}
2625
2626static hwaddr spapr_rma_size(SpaprMachineState *spapr, Error **errp)
2627{
2628    MachineState *machine = MACHINE(spapr);
2629    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
2630    hwaddr rma_size = machine->ram_size;
2631    hwaddr node0_size = spapr_node0_size(machine);
2632
2633    /* RMA has to fit in the first NUMA node */
2634    rma_size = MIN(rma_size, node0_size);
2635
2636    /*
2637     * VRMA access is via a special 1TiB SLB mapping, so the RMA can
2638     * never exceed that
2639     */
2640    rma_size = MIN(rma_size, 1 * TiB);
2641
2642    /*
2643     * Clamp the RMA size based on machine type.  This is for
2644     * migration compatibility with older qemu versions, which limited
2645     * the RMA size for complicated and mostly bad reasons.
2646     */
2647    if (smc->rma_limit) {
2648        rma_size = MIN(rma_size, smc->rma_limit);
2649    }
2650
2651    if (rma_size < MIN_RMA_SLOF) {
2652        error_setg(errp,
2653                   "pSeries SLOF firmware requires >= %" HWADDR_PRIu
2654                   "MiB guest RMA (Real Mode Area memory)",
2655                   MIN_RMA_SLOF / MiB);
2656        return 0;
2657    }
2658
2659    return rma_size;
2660}
2661
2662static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
2663{
2664    MachineState *machine = MACHINE(spapr);
2665    int i;
2666
2667    for (i = 0; i < machine->ram_slots; i++) {
2668        spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);
2669    }
2670}
2671
2672/* pSeries LPAR / sPAPR hardware init */
2673static void spapr_machine_init(MachineState *machine)
2674{
2675    SpaprMachineState *spapr = SPAPR_MACHINE(machine);
2676    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
2677    MachineClass *mc = MACHINE_GET_CLASS(machine);
2678    const char *bios_default = spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME;
2679    const char *bios_name = machine->firmware ?: bios_default;
2680    const char *kernel_filename = machine->kernel_filename;
2681    const char *initrd_filename = machine->initrd_filename;
2682    PCIHostState *phb;
2683    int i;
2684    MemoryRegion *sysmem = get_system_memory();
2685    long load_limit, fw_size;
2686    char *filename;
2687    Error *resize_hpt_err = NULL;
2688
2689    /*
2690     * if Secure VM (PEF) support is configured, then initialize it
2691     */
2692    pef_kvm_init(machine->cgs, &error_fatal);
2693
2694    msi_nonbroken = true;
2695
2696    QLIST_INIT(&spapr->phbs);
2697    QTAILQ_INIT(&spapr->pending_dimm_unplugs);
2698
2699    /* Determine capabilities to run with */
2700    spapr_caps_init(spapr);
2701
2702    kvmppc_check_papr_resize_hpt(&resize_hpt_err);
2703    if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DEFAULT) {
2704        /*
2705         * If the user explicitly requested a mode we should either
2706         * supply it, or fail completely (which we do below).  But if
2707         * it's not set explicitly, we reset our mode to something
2708         * that works
2709         */
2710        if (resize_hpt_err) {
2711            spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
2712            error_free(resize_hpt_err);
2713            resize_hpt_err = NULL;
2714        } else {
2715            spapr->resize_hpt = smc->resize_hpt_default;
2716        }
2717    }
2718
2719    assert(spapr->resize_hpt != SPAPR_RESIZE_HPT_DEFAULT);
2720
2721    if ((spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) && resize_hpt_err) {
2722        /*
2723         * User requested HPT resize, but this host can't supply it.  Bail out
2724         */
2725        error_report_err(resize_hpt_err);
2726        exit(1);
2727    }
2728    error_free(resize_hpt_err);
2729
2730    spapr->rma_size = spapr_rma_size(spapr, &error_fatal);
2731
2732    /* Setup a load limit for the ramdisk leaving room for SLOF and FDT */
2733    load_limit = MIN(spapr->rma_size, FDT_MAX_ADDR) - FW_OVERHEAD;
2734
2735    /*
2736     * VSMT must be set in order to be able to compute VCPU ids, ie to
2737     * call spapr_max_server_number() or spapr_vcpu_id().
2738     */
2739    spapr_set_vsmt_mode(spapr, &error_fatal);
2740
2741    /* Set up Interrupt Controller before we create the VCPUs */
2742    spapr_irq_init(spapr, &error_fatal);
2743
2744    /* Set up containers for ibm,client-architecture-support negotiated options
2745     */
2746    spapr->ov5 = spapr_ovec_new();
2747    spapr->ov5_cas = spapr_ovec_new();
2748
2749    if (smc->dr_lmb_enabled) {
2750        spapr_ovec_set(spapr->ov5, OV5_DRCONF_MEMORY);
2751        spapr_validate_node_memory(machine, &error_fatal);
2752    }
2753
2754    spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
2755
2756    /* Do not advertise FORM2 NUMA support for pseries-6.1 and older */
2757    if (!smc->pre_6_2_numa_affinity) {
2758        spapr_ovec_set(spapr->ov5, OV5_FORM2_AFFINITY);
2759    }
2760
2761    /* advertise support for dedicated HP event source to guests */
2762    if (spapr->use_hotplug_event_source) {
2763        spapr_ovec_set(spapr->ov5, OV5_HP_EVT);
2764    }
2765
2766    /* advertise support for HPT resizing */
2767    if (spapr->resize_hpt != SPAPR_RESIZE_HPT_DISABLED) {
2768        spapr_ovec_set(spapr->ov5, OV5_HPT_RESIZE);
2769    }
2770
2771    /* advertise support for ibm,dynamic-memory-v2 */
2772    spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
2773
2774    /* advertise XIVE on POWER9 machines */
2775    if (spapr->irq->xive) {
2776        spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
2777    }
2778
2779    /* init CPUs */
2780    spapr_init_cpus(spapr);
2781
2782    spapr->gpu_numa_id = spapr_numa_initial_nvgpu_numa_id(machine);
2783
2784    /* Init numa_assoc_array */
2785    spapr_numa_associativity_init(spapr, machine);
2786
2787    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
2788        ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
2789                              spapr->max_compat_pvr)) {
2790        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
2791        /* KVM and TCG always allow GTSE with radix... */
2792        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
2793    }
2794    /* ... but not with hash (currently). */
2795
2796    if (kvm_enabled()) {
2797        /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
2798        kvmppc_enable_logical_ci_hcalls();
2799        kvmppc_enable_set_mode_hcall();
2800
2801        /* H_CLEAR_MOD/_REF are mandatory in PAPR, but off by default */
2802        kvmppc_enable_clear_ref_mod_hcalls();
2803
2804        /* Enable H_PAGE_INIT */
2805        kvmppc_enable_h_page_init();
2806    }
2807
2808    /* map RAM */
2809    memory_region_add_subregion(sysmem, 0, machine->ram);
2810
2811    /* always allocate the device memory information */
2812    machine->device_memory = g_malloc0(sizeof(*machine->device_memory));
2813
2814    /* initialize hotplug memory address space */
2815    if (machine->ram_size < machine->maxram_size) {
2816        ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
2817        /*
2818         * Limit the number of hotpluggable memory slots to half the number
2819         * of slots that KVM supports, leaving the other half for PCI and
2820         * other devices. However, ensure that the number of slots doesn't drop below 32.
2821         */
2822        int max_memslots = kvm_enabled() ? kvm_get_max_memslots() / 2 :
2823                           SPAPR_MAX_RAM_SLOTS;
2824
2825        if (max_memslots < SPAPR_MAX_RAM_SLOTS) {
2826            max_memslots = SPAPR_MAX_RAM_SLOTS;
2827        }
2828        if (machine->ram_slots > max_memslots) {
2829            error_report("Specified number of memory slots %"
2830                         PRIu64" exceeds max supported %d",
2831                         machine->ram_slots, max_memslots);
2832            exit(1);
2833        }
2834
2835        machine->device_memory->base = ROUND_UP(machine->ram_size,
2836                                                SPAPR_DEVICE_MEM_ALIGN);
2837        memory_region_init(&machine->device_memory->mr, OBJECT(spapr),
2838                           "device-memory", device_mem_size);
2839        memory_region_add_subregion(sysmem, machine->device_memory->base,
2840                                    &machine->device_memory->mr);
2841    }
2842
2843    if (smc->dr_lmb_enabled) {
2844        spapr_create_lmb_dr_connectors(spapr);
2845    }
2846
2847    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_ON) {
2848        /* Create the error string for live migration blocker */
2849        error_setg(&spapr->fwnmi_migration_blocker,
2850            "A machine check is being handled during migration. The handler"
2851            "may run and log hardware error on the destination");
2852    }
2853
2854    if (mc->nvdimm_supported) {
2855        spapr_create_nvdimm_dr_connectors(spapr);
2856    }
2857
2858    /* Set up RTAS event infrastructure */
2859    spapr_events_init(spapr);
2860
2861    /* Set up the RTC RTAS interfaces */
2862    spapr_rtc_create(spapr);
2863
2864    /* Set up VIO bus */
2865    spapr->vio_bus = spapr_vio_bus_init();
2866
2867    for (i = 0; serial_hd(i); i++) {
2868        spapr_vty_create(spapr->vio_bus, serial_hd(i));
2869    }
2870
2871    /* We always have at least the nvram device on VIO */
2872    spapr_create_nvram(spapr);
2873
2874    /*
2875     * Set up hotplug / dynamic-reconfiguration connectors. Top-level
2876     * connectors (described in the root DT node's "ibm,drc-types"
2877     * property) are pre-initialized here. Additional child connectors
2878     * (such as the connectors for a PHB's PCI slots) are added as
2879     * needed during their parent's realization.
2880     */
2881    if (smc->dr_phb_enabled) {
2882        for (i = 0; i < SPAPR_MAX_PHBS; i++) {
2883            spapr_dr_connector_new(OBJECT(machine), TYPE_SPAPR_DRC_PHB, i);
2884        }
2885    }
2886
2887    /* Set up PCI */
2888    spapr_pci_rtas_init();
2889
2890    phb = spapr_create_default_phb();
2891
2892    for (i = 0; i < nb_nics; i++) {
2893        NICInfo *nd = &nd_table[i];
2894
2895        if (!nd->model) {
2896            nd->model = g_strdup("spapr-vlan");
2897        }
2898
2899        if (g_str_equal(nd->model, "spapr-vlan") ||
2900            g_str_equal(nd->model, "ibmveth")) {
2901            spapr_vlan_create(spapr->vio_bus, nd);
2902        } else {
2903            pci_nic_init_nofail(&nd_table[i], phb->bus, nd->model, NULL);
2904        }
2905    }
2906
2907    for (i = 0; i <= drive_get_max_bus(IF_SCSI); i++) {
2908        spapr_vscsi_create(spapr->vio_bus);
2909    }
2910
2911    /* Graphics */
2912    if (spapr_vga_init(phb->bus, &error_fatal)) {
2913        spapr->has_graphics = true;
2914        machine->usb |= defaults_enabled() && !machine->usb_disabled;
2915    }
2916
2917    if (machine->usb) {
2918        if (smc->use_ohci_by_default) {
2919            pci_create_simple(phb->bus, -1, "pci-ohci");
2920        } else {
2921            pci_create_simple(phb->bus, -1, "nec-usb-xhci");
2922        }
2923
2924        if (spapr->has_graphics) {
2925            USBBus *usb_bus = usb_bus_find(-1);
2926
2927            usb_create_simple(usb_bus, "usb-kbd");
2928            usb_create_simple(usb_bus, "usb-mouse");
2929        }
2930    }
2931
2932    if (kernel_filename) {
2933        spapr->kernel_size = load_elf(kernel_filename, NULL,
2934                                      translate_kernel_address, spapr,
2935                                      NULL, NULL, NULL, NULL, 1,
2936                                      PPC_ELF_MACHINE, 0, 0);
2937        if (spapr->kernel_size == ELF_LOAD_WRONG_ENDIAN) {
2938            spapr->kernel_size = load_elf(kernel_filename, NULL,
2939                                          translate_kernel_address, spapr,
2940                                          NULL, NULL, NULL, NULL, 0,
2941                                          PPC_ELF_MACHINE, 0, 0);
2942            spapr->kernel_le = spapr->kernel_size > 0;
2943        }
2944        if (spapr->kernel_size < 0) {
2945            error_report("error loading %s: %s", kernel_filename,
2946                         load_elf_strerror(spapr->kernel_size));
2947            exit(1);
2948        }
2949
2950        /* load initrd */
2951        if (initrd_filename) {
2952            /* Try to locate the initrd in the gap between the kernel
2953             * and the firmware. Add a bit of space just in case
2954             */
2955            spapr->initrd_base = (spapr->kernel_addr + spapr->kernel_size
2956                                  + 0x1ffff) & ~0xffff;
2957            spapr->initrd_size = load_image_targphys(initrd_filename,
2958                                                     spapr->initrd_base,
2959                                                     load_limit
2960                                                     - spapr->initrd_base);
2961            if (spapr->initrd_size < 0) {
2962                error_report("could not load initial ram disk '%s'",
2963                             initrd_filename);
2964                exit(1);
2965            }
2966        }
2967    }
2968
2969    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
2970    if (!filename) {
2971        error_report("Could not find LPAR firmware '%s'", bios_name);
2972        exit(1);
2973    }
2974    fw_size = load_image_targphys(filename, 0, FW_MAX_SIZE);
2975    if (fw_size <= 0) {
2976        error_report("Could not load LPAR firmware '%s'", filename);
2977        exit(1);
2978    }
2979    g_free(filename);
2980
2981    /* FIXME: Should register things through the MachineState's qdev
2982     * interface, this is a legacy from the sPAPREnvironment structure
2983     * which predated MachineState but had a similar function */
2984    vmstate_register(NULL, 0, &vmstate_spapr, spapr);
2985    register_savevm_live("spapr/htab", VMSTATE_INSTANCE_ID_ANY, 1,
2986                         &savevm_htab_handlers, spapr);
2987
2988    qbus_set_hotplug_handler(sysbus_get_default(), OBJECT(machine));
2989
2990    qemu_register_boot_set(spapr_boot_set, spapr);
2991
2992    /*
2993     * Nothing needs to be done to resume a suspended guest because
2994     * suspending does not change the machine state, so no need for
2995     * a ->wakeup method.
2996     */
2997    qemu_register_wakeup_support();
2998
2999    if (kvm_enabled()) {
3000        /* to stop and start vmclock */
3001        qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
3002                                         &spapr->tb);
3003
3004        kvmppc_spapr_enable_inkernel_multitce();
3005    }
3006
3007    qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond);
3008    if (spapr->vof) {
3009        spapr->vof->fw_size = fw_size; /* for claim() on itself */
3010        spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client);
3011    }
3012}
3013
3014#define DEFAULT_KVM_TYPE "auto"
3015static int spapr_kvm_type(MachineState *machine, const char *vm_type)
3016{
3017    /*
3018     * The use of g_ascii_strcasecmp() for 'hv' and 'pr' is to
3019     * accommodate the 'HV' and 'PR' formats that exist in the
3020     * wild. The 'auto' mode is being introduced already as
3021     * lower-case, thus we don't need to bother checking for
3022     * "AUTO".
3023     */
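        /*
         * The returned value is handed to KVM when the VM is created;
         * 1 and 2 are expected to match the kernel's KVM_VM_PPC_HV and
         * KVM_VM_PPC_PR types, while 0 lets the kernel pick a default.
         */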
3024    if (!vm_type || !strcmp(vm_type, DEFAULT_KVM_TYPE)) {
3025        return 0;
3026    }
3027
3028    if (!g_ascii_strcasecmp(vm_type, "hv")) {
3029        return 1;
3030    }
3031
3032    if (!g_ascii_strcasecmp(vm_type, "pr")) {
3033        return 2;
3034    }
3035
3036    error_report("Unknown kvm-type specified '%s'", vm_type);
3037    exit(1);
3038}
3039
3040/*
3041 * Implementation of an interface to adjust firmware path
3042 * for the bootindex property handling.
3043 */
3044static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus,
3045                                   DeviceState *dev)
3046{
3047#define CAST(type, obj, name) \
3048    ((type *)object_dynamic_cast(OBJECT(obj), (name)))
3049    SCSIDevice *d = CAST(SCSIDevice,  dev, TYPE_SCSI_DEVICE);
3050    SpaprPhbState *phb = CAST(SpaprPhbState, dev, TYPE_SPAPR_PCI_HOST_BRIDGE);
3051    VHostSCSICommon *vsc = CAST(VHostSCSICommon, dev, TYPE_VHOST_SCSI_COMMON);
3052    PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3053
3054    if (d) {
3055        void *spapr = CAST(void, bus->parent, "spapr-vscsi");
3056        VirtIOSCSI *virtio = CAST(VirtIOSCSI, bus->parent, TYPE_VIRTIO_SCSI);
3057        USBDevice *usb = CAST(USBDevice, bus->parent, TYPE_USB_DEVICE);
3058
3059        if (spapr) {
3060            /*
3061             * Replace "channel@0/disk@0,0" with "disk@8000000000000000":
3062             * In the top 16 bits of the 64-bit LUN, we use SRP luns of the form
3063             * 0x8000 | (target << 8) | (bus << 5) | lun
3064             * (see the "Logical unit addressing format" table in SAM5)
3065             */
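                /*
                 * e.g. target 0, bus 0, lun 0 encodes as 0x8000, giving
                 * the "disk@8000000000000000" path quoted above.
                 */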
3066            unsigned id = 0x8000 | (d->id << 8) | (d->channel << 5) | d->lun;
3067            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3068                                   (uint64_t)id << 48);
3069        } else if (virtio) {
3070            /*
3071             * We use SRP luns of the form 01000000 | (target << 8) | lun
3072             * in the top 32 bits of the 64-bit LUN
3073             * Note: the quote above is from SLOF and it is wrong,
3074             * the actual binding is:
3075             * swap 0100 or 10 << or 20 << ( target lun-id -- srplun )
3076             */
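                /*
                 * e.g. target 1, lun 0 encodes as 0x1010000, so the path
                 * becomes "disk@101000000000000" after the 32-bit shift.
                 */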
3077            unsigned id = 0x1000000 | (d->id << 16) | d->lun;
3078            if (d->lun >= 256) {
3079                /* Use the LUN "flat space addressing method" */
3080                id |= 0x4000;
3081            }
3082            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3083                                   (uint64_t)id << 32);
3084        } else if (usb) {
3085            /*
3086             * We use SRP luns of the form 01000000 | (usb-port << 16) | lun
3087             * in the top 32 bits of the 64-bit LUN
3088             */
3089            unsigned usb_port = atoi(usb->port->path);
3090            unsigned id = 0x1000000 | (usb_port << 16) | d->lun;
3091            return g_strdup_printf("%s@%"PRIX64, qdev_fw_name(dev),
3092                                   (uint64_t)id << 32);
3093        }
3094    }
3095
3096    /*
3097     * SLOF probes the USB devices, and if it recognizes that the device is a
3098     * storage device, it changes its name to "storage" instead of "usb-host",
3099     * and additionally adds a child node for the SCSI LUN, so the correct
3100     * boot path in SLOF is something like ".../storage@1/disk@xxx" instead.
3101     */
3102    if (strcmp("usb-host", qdev_fw_name(dev)) == 0) {
3103        USBDevice *usbdev = CAST(USBDevice, dev, TYPE_USB_DEVICE);
3104        if (usb_device_is_scsi_storage(usbdev)) {
3105            return g_strdup_printf("storage@%s/disk", usbdev->port->path);
3106        }
3107    }
3108
3109    if (phb) {
3110        /* Replace "pci" with "pci@800000020000000" */
3111        return g_strdup_printf("pci@%"PRIX64, phb->buid);
3112    }
3113
3114    if (vsc) {
3115        /* Same logic as virtio above */
3116        unsigned id = 0x1000000 | (vsc->target << 16) | vsc->lun;
3117        return g_strdup_printf("disk@%"PRIX64, (uint64_t)id << 32);
3118    }
3119
3120    if (g_str_equal("pci-bridge", qdev_fw_name(dev))) {
3121        /* SLOF uses "pci" instead of "pci-bridge" for PCI bridges */
3122        PCIDevice *pcidev = CAST(PCIDevice, dev, TYPE_PCI_DEVICE);
3123        return g_strdup_printf("pci@%x", PCI_SLOT(pcidev->devfn));
3124    }
3125
3126    if (pcidev) {
3127        return spapr_pci_fw_dev_name(pcidev);
3128    }
3129
3130    return NULL;
3131}
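
/*
 * Minimal sketch (not part of the file) of the SRP LUN encodings used
 * by spapr_get_fw_dev_path() above; the helper names and example
 * values are illustrative assumptions.
 */
static inline uint64_t example_vscsi_srplun(unsigned target, unsigned bus,
                                            unsigned lun)
{
    /* top 16 bits of the 64-bit LUN: 0x8000 | (target << 8) | (bus << 5) | lun */
    return (uint64_t)(0x8000 | (target << 8) | (bus << 5) | lun) << 48;
}

static inline uint64_t example_virtio_srplun(unsigned target, unsigned lun)
{
    /* top 32 bits: 0x1000000 | (target << 16) | lun; 0x4000 flags LUNs >= 256 */
    unsigned id = 0x1000000 | (target << 16) | lun;
    return (uint64_t)(lun >= 256 ? id | 0x4000 : id) << 32;
}
/* e.g. example_vscsi_srplun(1, 0, 0) yields the path "disk@8100000000000000" */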
3132
3133static char *spapr_get_kvm_type(Object *obj, Error **errp)
3134{
3135    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3136
3137    return g_strdup(spapr->kvm_type);
3138}
3139
3140static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp)
3141{
3142    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3143
3144    g_free(spapr->kvm_type);
3145    spapr->kvm_type = g_strdup(value);
3146}
3147
3148static bool spapr_get_modern_hotplug_events(Object *obj, Error **errp)
3149{
3150    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3151
3152    return spapr->use_hotplug_event_source;
3153}
3154
3155static void spapr_set_modern_hotplug_events(Object *obj, bool value,
3156                                            Error **errp)
3157{
3158    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3159
3160    spapr->use_hotplug_event_source = value;
3161}
3162
3163static bool spapr_get_msix_emulation(Object *obj, Error **errp)
3164{
3165    return true;
3166}
3167
3168static char *spapr_get_resize_hpt(Object *obj, Error **errp)
3169{
3170    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3171
3172    switch (spapr->resize_hpt) {
3173    case SPAPR_RESIZE_HPT_DEFAULT:
3174        return g_strdup("default");
3175    case SPAPR_RESIZE_HPT_DISABLED:
3176        return g_strdup("disabled");
3177    case SPAPR_RESIZE_HPT_ENABLED:
3178        return g_strdup("enabled");
3179    case SPAPR_RESIZE_HPT_REQUIRED:
3180        return g_strdup("required");
3181    }
3182    g_assert_not_reached();
3183}
3184
3185static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp)
3186{
3187    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3188
3189    if (strcmp(value, "default") == 0) {
3190        spapr->resize_hpt = SPAPR_RESIZE_HPT_DEFAULT;
3191    } else if (strcmp(value, "disabled") == 0) {
3192        spapr->resize_hpt = SPAPR_RESIZE_HPT_DISABLED;
3193    } else if (strcmp(value, "enabled") == 0) {
3194        spapr->resize_hpt = SPAPR_RESIZE_HPT_ENABLED;
3195    } else if (strcmp(value, "required") == 0) {
3196        spapr->resize_hpt = SPAPR_RESIZE_HPT_REQUIRED;
3197    } else {
3198        error_setg(errp, "Bad value for \"resize-hpt\" property");
3199    }
3200}
3201
3202static bool spapr_get_vof(Object *obj, Error **errp)
3203{
3204    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3205
3206    return spapr->vof != NULL;
3207}
3208
3209static void spapr_set_vof(Object *obj, bool value, Error **errp)
3210{
3211    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3212
3213    if (spapr->vof) {
3214        vof_cleanup(spapr->vof);
3215        g_free(spapr->vof);
3216        spapr->vof = NULL;
3217    }
3218    if (!value) {
3219        return;
3220    }
3221    spapr->vof = g_malloc0(sizeof(*spapr->vof));
3222}
3223
3224static char *spapr_get_ic_mode(Object *obj, Error **errp)
3225{
3226    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3227
3228    if (spapr->irq == &spapr_irq_xics_legacy) {
3229        return g_strdup("legacy");
3230    } else if (spapr->irq == &spapr_irq_xics) {
3231        return g_strdup("xics");
3232    } else if (spapr->irq == &spapr_irq_xive) {
3233        return g_strdup("xive");
3234    } else if (spapr->irq == &spapr_irq_dual) {
3235        return g_strdup("dual");
3236    }
3237    g_assert_not_reached();
3238}
3239
3240static void spapr_set_ic_mode(Object *obj, const char *value, Error **errp)
3241{
3242    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3243
3244    if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
3245        error_setg(errp, "This machine only uses the legacy XICS backend, don't pass ic-mode");
3246        return;
3247    }
3248
3249    /* The legacy IRQ backend cannot be set */
3250    if (strcmp(value, "xics") == 0) {
3251        spapr->irq = &spapr_irq_xics;
3252    } else if (strcmp(value, "xive") == 0) {
3253        spapr->irq = &spapr_irq_xive;
3254    } else if (strcmp(value, "dual") == 0) {
3255        spapr->irq = &spapr_irq_dual;
3256    } else {
3257        error_setg(errp, "Bad value for \"ic-mode\" property");
3258    }
3259}
3260
3261static char *spapr_get_host_model(Object *obj, Error **errp)
3262{
3263    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3264
3265    return g_strdup(spapr->host_model);
3266}
3267
3268static void spapr_set_host_model(Object *obj, const char *value, Error **errp)
3269{
3270    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3271
3272    g_free(spapr->host_model);
3273    spapr->host_model = g_strdup(value);
3274}
3275
3276static char *spapr_get_host_serial(Object *obj, Error **errp)
3277{
3278    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3279
3280    return g_strdup(spapr->host_serial);
3281}
3282
3283static void spapr_set_host_serial(Object *obj, const char *value, Error **errp)
3284{
3285    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3286
3287    g_free(spapr->host_serial);
3288    spapr->host_serial = g_strdup(value);
3289}
3290
3291static void spapr_instance_init(Object *obj)
3292{
3293    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3294    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
3295    MachineState *ms = MACHINE(spapr);
3296    MachineClass *mc = MACHINE_GET_CLASS(ms);
3297
3298    /*
3299     * NVDIMM support went live in 5.1 without considering that, in
3300     * other archs, the user needs to enable NVDIMM support with the
3301     * 'nvdimm' machine option and the default behavior is NVDIMM
3302     * support disabled. It is too late to roll back to the standard
3303     * behavior without breaking 5.1 guests.
3304     */
3305    if (mc->nvdimm_supported) {
3306        ms->nvdimms_state->is_enabled = true;
3307    }
3308
3309    spapr->htab_fd = -1;
3310    spapr->use_hotplug_event_source = true;
3311    spapr->kvm_type = g_strdup(DEFAULT_KVM_TYPE);
3312    object_property_add_str(obj, "kvm-type",
3313                            spapr_get_kvm_type, spapr_set_kvm_type);
3314    object_property_set_description(obj, "kvm-type",
3315                                    "Specifies the KVM virtualization mode (auto,"
3316                                    " hv, pr). Defaults to 'auto'. This mode will use"
3317                                    " any available KVM module loaded in the host,"
3318                                    " where kvm_hv takes precedence if both kvm_hv and"
3319                                    " kvm_pr are loaded.");
3320    object_property_add_bool(obj, "modern-hotplug-events",
3321                            spapr_get_modern_hotplug_events,
3322                            spapr_set_modern_hotplug_events);
3323    object_property_set_description(obj, "modern-hotplug-events",
3324                                    "Use dedicated hotplug event mechanism in"
3325                                    " place of standard EPOW events when possible"
3326                                    " (required for memory hot-unplug support)");
3327    ppc_compat_add_property(obj, "max-cpu-compat", &spapr->max_compat_pvr,
3328                            "Maximum permitted CPU compatibility mode");
3329
3330    object_property_add_str(obj, "resize-hpt",
3331                            spapr_get_resize_hpt, spapr_set_resize_hpt);
3332    object_property_set_description(obj, "resize-hpt",
3333                                    "Resizing of the Hash Page Table (enabled, disabled, required)");
3334    object_property_add_uint32_ptr(obj, "vsmt",
3335                                   &spapr->vsmt, OBJ_PROP_FLAG_READWRITE);
3336    object_property_set_description(obj, "vsmt",
3337                                    "Virtual SMT: KVM behaves as if this were"
3338                                    " the host's SMT mode");
3339
3340    object_property_add_bool(obj, "vfio-no-msix-emulation",
3341                             spapr_get_msix_emulation, NULL);
3342
3343    object_property_add_uint64_ptr(obj, "kernel-addr",
3344                                   &spapr->kernel_addr, OBJ_PROP_FLAG_READWRITE);
3345    object_property_set_description(obj, "kernel-addr",
3346                                    stringify(KERNEL_LOAD_ADDR)
3347                                    " for -kernel is the default");
3348    spapr->kernel_addr = KERNEL_LOAD_ADDR;
3349
3350    object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof);
3351    object_property_set_description(obj, "x-vof",
3352                                    "Enable Virtual Open Firmware (experimental)");
3353
3354    /* The machine class defines the default interrupt controller mode */
3355    spapr->irq = smc->irq;
3356    object_property_add_str(obj, "ic-mode", spapr_get_ic_mode,
3357                            spapr_set_ic_mode);
3358    object_property_set_description(obj, "ic-mode",
3359                 "Specifies the interrupt controller mode (xics, xive, dual)");
3360
3361    object_property_add_str(obj, "host-model",
3362        spapr_get_host_model, spapr_set_host_model);
3363    object_property_set_description(obj, "host-model",
3364        "Host model to advertise in guest device tree");
3365    object_property_add_str(obj, "host-serial",
3366        spapr_get_host_serial, spapr_set_host_serial);
3367    object_property_set_description(obj, "host-serial",
3368        "Host serial number to advertise in guest device tree");
3369}
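
/*
 * Hedged usage example (option values assumed) exercising several of
 * the machine properties registered above:
 *
 *   qemu-system-ppc64 -machine pseries,ic-mode=xive,resize-hpt=required,\
 *       modern-hotplug-events=on,vsmt=8,x-vof=on
 */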
3370
3371static void spapr_machine_finalizefn(Object *obj)
3372{
3373    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
3374
3375    g_free(spapr->kvm_type);
3376}
3377
3378void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
3379{
3380    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
3381    PowerPCCPU *cpu = POWERPC_CPU(cs);
3382    CPUPPCState *env = &cpu->env;
3383
3384    cpu_synchronize_state(cs);
3385    /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */
3386    if (spapr->fwnmi_system_reset_addr != -1) {
3387        uint64_t rtas_addr, addr;
3388
3389        /* get rtas addr from fdt */
3390        rtas_addr = spapr_get_rtas_addr();
3391        if (!rtas_addr) {
3392            qemu_system_guest_panicked(NULL);
3393            return;
3394        }
3395
3396        addr = rtas_addr + RTAS_ERROR_LOG_MAX + cs->cpu_index * sizeof(uint64_t)*2;
3397        stq_be_phys(&address_space_memory, addr, env->gpr[3]);
3398        stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0);
3399        env->gpr[3] = addr;
3400    }
3401    ppc_cpu_do_system_reset(cs);
3402    if (spapr->fwnmi_system_reset_addr != -1) {
3403        env->nip = spapr->fwnmi_system_reset_addr;
3404    }
3405}
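
/*
 * Sketch of the per-CPU FWNMI save-area addressing used above (the
 * helper name is hypothetical; the layout is read off the code: two
 * u64 slots per CPU, placed right after the RTAS error log buffer):
 */
static inline uint64_t example_fwnmi_save_area(uint64_t rtas_addr,
                                               int cpu_index)
{
    return rtas_addr + RTAS_ERROR_LOG_MAX + cpu_index * sizeof(uint64_t) * 2;
}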
3406
3407static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
3408{
3409    CPUState *cs;
3410
3411    CPU_FOREACH(cs) {
3412        async_run_on_cpu(cs, spapr_do_system_reset_on_cpu, RUN_ON_CPU_NULL);
3413    }
3414}
3415
3416int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3417                          void *fdt, int *fdt_start_offset, Error **errp)
3418{
3419    uint64_t addr;
3420    uint32_t node;
3421
3422    addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE;
3423    node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP,
3424                                    &error_abort);
3425    *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr,
3426                                             SPAPR_MEMORY_BLOCK_SIZE);
3427    return 0;
3428}
3429
3430static void spapr_add_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t size,
3431                           bool dedicated_hp_event_source)
3432{
3433    SpaprDrc *drc;
3434    uint32_t nr_lmbs = size/SPAPR_MEMORY_BLOCK_SIZE;
3435    int i;
3436    uint64_t addr = addr_start;
3437    bool hotplugged = spapr_drc_hotplugged(dev);
3438
3439    for (i = 0; i < nr_lmbs; i++) {
3440        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3441                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3442        g_assert(drc);
3443
3444        /*
3445         * memory_device_get_free_addr() provided a range of free addresses
3446         * that doesn't overlap with any existing mapping at pre-plug. The
3447         * corresponding LMB DRCs are thus assumed to be all attachable.
3448         */
3449        spapr_drc_attach(drc, dev);
3450        if (!hotplugged) {
3451            spapr_drc_reset(drc);
3452        }
3453        addr += SPAPR_MEMORY_BLOCK_SIZE;
3454    }
3455    /* Send the hotplug notification to the guest only in case of
3456     * hotplugged memory.
3457     */
3458    if (hotplugged) {
3459        if (dedicated_hp_event_source) {
3460            drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3461                                  addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3462            g_assert(drc);
3463            spapr_hotplug_req_add_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3464                                                   nr_lmbs,
3465                                                   spapr_drc_index(drc));
3466        } else {
3467            spapr_hotplug_req_add_by_count(SPAPR_DR_CONNECTOR_TYPE_LMB,
3468                                           nr_lmbs);
3469        }
3470    }
3471}
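
/*
 * Worked example (values assumed): a 1 GiB DIMM plugged at addr_start
 * 0x100000000 is carved into 1 GiB / 256 MiB = 4 LMBs; each LMB
 * attaches to the DRC whose id is addr / SPAPR_MEMORY_BLOCK_SIZE, and
 * the hotplug event then reports the count (4) together with the DRC
 * index of the first LMB.
 */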
3472
3473static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3474{
3475    SpaprMachineState *ms = SPAPR_MACHINE(hotplug_dev);
3476    PCDIMMDevice *dimm = PC_DIMM(dev);
3477    uint64_t size, addr;
3478    int64_t slot;
3479    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3480
3481    size = memory_device_get_region_size(MEMORY_DEVICE(dev), &error_abort);
3482
3483    pc_dimm_plug(dimm, MACHINE(ms));
3484
3485    if (!is_nvdimm) {
3486        addr = object_property_get_uint(OBJECT(dimm),
3487                                        PC_DIMM_ADDR_PROP, &error_abort);
3488        spapr_add_lmbs(dev, addr, size,
3489                       spapr_ovec_test(ms->ov5_cas, OV5_HP_EVT));
3490    } else {
3491        slot = object_property_get_int(OBJECT(dimm),
3492                                       PC_DIMM_SLOT_PROP, &error_abort);
3493        /* We should have a valid slot number at this point */
3494        g_assert(slot >= 0);
3495        spapr_add_nvdimm(dev, slot);
3496    }
3497}
3498
3499static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3500                                  Error **errp)
3501{
3502    const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev);
3503    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3504    bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM);
3505    PCDIMMDevice *dimm = PC_DIMM(dev);
3506    Error *local_err = NULL;
3507    uint64_t size;
3508    Object *memdev;
3509    hwaddr pagesize;
3510
3511    if (!smc->dr_lmb_enabled) {
3512        error_setg(errp, "Memory hotplug not supported for this machine");
3513        return;
3514    }
3515
3516    size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err);
3517    if (local_err) {
3518        error_propagate(errp, local_err);
3519        return;
3520    }
3521
3522    if (is_nvdimm) {
3523        if (!spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, errp)) {
3524            return;
3525        }
3526    } else if (size % SPAPR_MEMORY_BLOCK_SIZE) {
3527        error_setg(errp, "Hotplugged memory size must be a multiple of "
3528                   "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB);
3529        return;
3530    }
3531
3532    memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP,
3533                                      &error_abort);
3534    pagesize = host_memory_backend_pagesize(MEMORY_BACKEND(memdev));
3535    if (!spapr_check_pagesize(spapr, pagesize, errp)) {
3536        return;
3537    }
3538
3539    pc_dimm_pre_plug(dimm, MACHINE(hotplug_dev), NULL, errp);
3540}
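
/*
 * Usage sketch (ids assumed): the pre-plug checks above accept
 *
 *   -object memory-backend-ram,id=mem1,size=1G
 *   -device pc-dimm,id=dimm1,memdev=mem1
 *
 * since 1 GiB is a multiple of SPAPR_MEMORY_BLOCK_SIZE, while e.g.
 * size=1000M would be rejected by the multiple-of-256MiB check.
 */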
3541
3542struct SpaprDimmState {
3543    PCDIMMDevice *dimm;
3544    uint32_t nr_lmbs;
3545    QTAILQ_ENTRY(SpaprDimmState) next;
3546};
3547
3548static SpaprDimmState *spapr_pending_dimm_unplugs_find(SpaprMachineState *s,
3549                                                       PCDIMMDevice *dimm)
3550{
3551    SpaprDimmState *dimm_state = NULL;
3552
3553    QTAILQ_FOREACH(dimm_state, &s->pending_dimm_unplugs, next) {
3554        if (dimm_state->dimm == dimm) {
3555            break;
3556        }
3557    }
3558    return dimm_state;
3559}
3560
3561static SpaprDimmState *spapr_pending_dimm_unplugs_add(SpaprMachineState *spapr,
3562                                                      uint32_t nr_lmbs,
3563                                                      PCDIMMDevice *dimm)
3564{
3565    SpaprDimmState *ds = NULL;
3566
3567    /*
3568     * If this request is for a DIMM whose removal had failed earlier
3569     * (due to the guest's refusal to remove the LMBs), we would have
3570     * this DIMM already in the pending_dimm_unplugs list. In that
3571     * case, don't add it again.
3572     */
3573    ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3574    if (!ds) {
3575        ds = g_malloc0(sizeof(SpaprDimmState));
3576        ds->nr_lmbs = nr_lmbs;
3577        ds->dimm = dimm;
3578        QTAILQ_INSERT_HEAD(&spapr->pending_dimm_unplugs, ds, next);
3579    }
3580    return ds;
3581}
3582
3583static void spapr_pending_dimm_unplugs_remove(SpaprMachineState *spapr,
3584                                              SpaprDimmState *dimm_state)
3585{
3586    QTAILQ_REMOVE(&spapr->pending_dimm_unplugs, dimm_state, next);
3587    g_free(dimm_state);
3588}
3589
3590static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms,
3591                                                        PCDIMMDevice *dimm)
3592{
3593    SpaprDrc *drc;
3594    uint64_t size = memory_device_get_region_size(MEMORY_DEVICE(dimm),
3595                                                  &error_abort);
3596    uint32_t nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3597    uint32_t avail_lmbs = 0;
3598    uint64_t addr_start, addr;
3599    int i;
3600
3601    addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3602                                          &error_abort);
3603
3604    addr = addr_start;
3605    for (i = 0; i < nr_lmbs; i++) {
3606        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3607                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3608        g_assert(drc);
3609        if (drc->dev) {
3610            avail_lmbs++;
3611        }
3612        addr += SPAPR_MEMORY_BLOCK_SIZE;
3613    }
3614
3615    return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
3616}
3617
3618void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev)
3619{
3620    SpaprDimmState *ds;
3621    PCDIMMDevice *dimm;
3622    SpaprDrc *drc;
3623    uint32_t nr_lmbs;
3624    uint64_t size, addr_start, addr;
3625    g_autofree char *qapi_error = NULL;
3626    int i;
3627
3628    if (!dev) {
3629        return;
3630    }
3631
3632    dimm = PC_DIMM(dev);
3633    ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
3634
3635    /*
3636     * 'ds == NULL' would mean that the DIMM doesn't have a pending
3637     * unplug state, but one of its DRCs is marked as unplug_requested.
3638     * This is bad and weird enough to g_assert() out.
3639     */
3640    g_assert(ds);
3641
3642    spapr_pending_dimm_unplugs_remove(spapr, ds);
3643
3644    size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3645    nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3646
3647    addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3648                                          &error_abort);
3649
3650    addr = addr_start;
3651    for (i = 0; i < nr_lmbs; i++) {
3652        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3653                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3654        g_assert(drc);
3655
3656        drc->unplug_requested = false;
3657        addr += SPAPR_MEMORY_BLOCK_SIZE;
3658    }
3659
3660    /*
3661     * Tell QAPI that something happened and the memory
3662     * hotunplug wasn't successful. Keep sending
3663     * MEM_UNPLUG_ERROR even while sending
3664     * DEVICE_UNPLUG_GUEST_ERROR until the deprecation of
3665     * MEM_UNPLUG_ERROR is due.
3666     */
3667    qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest "
3668                                 "for device %s", dev->id);
3669
3670    qapi_event_send_mem_unplug_error(dev->id ? : "", qapi_error);
3671
3672    qapi_event_send_device_unplug_guest_error(!!dev->id, dev->id,
3673                                              dev->canonical_path);
3674}
3675
3676/* Callback to be called during DRC release. */
3677void spapr_lmb_release(DeviceState *dev)
3678{
3679    HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3680    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_ctrl);
3681    SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3682
3683    /* This information will get lost if a migration occurs
3684     * during the unplug process. In this case recover it. */
3685    if (ds == NULL) {
3686        ds = spapr_recover_pending_dimm_state(spapr, PC_DIMM(dev));
3687        g_assert(ds);
3688        /* The DRC being examined by the caller at least must be counted */
3689        g_assert(ds->nr_lmbs);
3690    }
3691
3692    if (--ds->nr_lmbs) {
3693        return;
3694    }
3695
3696    /*
3697     * Now that all the LMBs have been removed by the guest, call the
3698     * unplug handler chain. This can never fail.
3699     */
3700    hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3701    object_unparent(OBJECT(dev));
3702}
3703
3704static void spapr_memory_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3705{
3706    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3707    SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, PC_DIMM(dev));
3708
3709    /* We really shouldn't get this far without anything to unplug */
3710    g_assert(ds);
3711
3712    pc_dimm_unplug(PC_DIMM(dev), MACHINE(hotplug_dev));
3713    qdev_unrealize(dev);
3714    spapr_pending_dimm_unplugs_remove(spapr, ds);
3715}
3716
3717static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev,
3718                                        DeviceState *dev, Error **errp)
3719{
3720    SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev);
3721    PCDIMMDevice *dimm = PC_DIMM(dev);
3722    uint32_t nr_lmbs;
3723    uint64_t size, addr_start, addr;
3724    int i;
3725    SpaprDrc *drc;
3726
3727    if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) {
3728        error_setg(errp, "nvdimm device hot unplug is not supported yet.");
3729        return;
3730    }
3731
3732    size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
3733    nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
3734
3735    addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
3736                                          &error_abort);
3737
3738    /*
3739     * An existing pending DIMM state for this DIMM means that there is an
3740     * unplug operation in progress, waiting for the spapr_lmb_release
3741     * callback to complete the job (BQL can't cover that far). In this case,
3742     * bail out to avoid detaching DRCs that were already released.
3743     */
3744    if (spapr_pending_dimm_unplugs_find(spapr, dimm)) {
3745        error_setg(errp, "Memory unplug already in progress for device %s",
3746                   dev->id);
3747        return;
3748    }
3749
3750    spapr_pending_dimm_unplugs_add(spapr, nr_lmbs, dimm);
3751
3752    addr = addr_start;
3753    for (i = 0; i < nr_lmbs; i++) {
3754        drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3755                              addr / SPAPR_MEMORY_BLOCK_SIZE);
3756        g_assert(drc);
3757
3758        spapr_drc_unplug_request(drc);
3759        addr += SPAPR_MEMORY_BLOCK_SIZE;
3760    }
3761
3762    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
3763                          addr_start / SPAPR_MEMORY_BLOCK_SIZE);
3764    spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB,
3765                                              nr_lmbs, spapr_drc_index(drc));
3766}
3767
3768/* Callback to be called during DRC release. */
3769void spapr_core_release(DeviceState *dev)
3770{
3771    HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
3772
3773    /* Call the unplug handler chain. This can never fail. */
3774    hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
3775    object_unparent(OBJECT(dev));
3776}
3777
3778static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
3779{
3780    MachineState *ms = MACHINE(hotplug_dev);
3781    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms);
3782    CPUCore *cc = CPU_CORE(dev);
3783    CPUArchId *core_slot = spapr_find_cpu_slot(ms, cc->core_id, NULL);
3784
3785    if (smc->pre_2_10_has_unused_icps) {
3786        SpaprCpuCore *sc = SPAPR_CPU_CORE(OBJECT(dev));
3787        int i;
3788
3789        for (i = 0; i < cc->nr_threads; i++) {
3790            CPUState *cs = CPU(sc->threads[i]);
3791
3792            pre_2_10_vmstate_register_dummy_icp(cs->cpu_index);
3793        }
3794    }
3795
3796    assert(core_slot);
3797    core_slot->cpu = NULL;
3798    qdev_unrealize(dev);
3799}
3800
3801static
3802void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
3803                               Error **errp)
3804{
3805    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3806    int index;
3807    SpaprDrc *drc;
3808    CPUCore *cc = CPU_CORE(dev);
3809
3810    if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
3811        error_setg(errp, "Unable to find CPU core with core-id: %d",
3812                   cc->core_id);
3813        return;
3814    }
3815    if (index == 0) {
3816        error_setg(errp, "Boot CPU core may not be unplugged");
3817        return;
3818    }
3819
3820    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3821                          spapr_vcpu_id(spapr, cc->core_id));
3822    g_assert(drc);
3823
3824    if (!spapr_drc_unplug_requested(drc)) {
3825        spapr_drc_unplug_request(drc);
3826    }
3827
3828    /*
3829     * spapr_hotplug_req_remove_by_index is left unguarded, outside the
3830     * "!spapr_drc_unplug_requested" check, to allow for multiple IRQ
3831     * pulses removing the same CPU. Otherwise, after a failed hotunplug
3832     * attempt (e.g. the kernel refusing to remove the last online
3833     * CPU), we would never attempt it again because unplug_requested
3834     * would still be 'true' in that case.
3835     */
3836    spapr_hotplug_req_remove_by_index(drc);
3837}
3838
3839int spapr_core_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3840                           void *fdt, int *fdt_start_offset, Error **errp)
3841{
3842    SpaprCpuCore *core = SPAPR_CPU_CORE(drc->dev);
3843    CPUState *cs = CPU(core->threads[0]);
3844    PowerPCCPU *cpu = POWERPC_CPU(cs);
3845    DeviceClass *dc = DEVICE_GET_CLASS(cs);
3846    int id = spapr_get_vcpu_id(cpu);
3847    g_autofree char *nodename = NULL;
3848    int offset;
3849
3850    nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
3851    offset = fdt_add_subnode(fdt, 0, nodename);
3852
3853    spapr_dt_cpu(cs, fdt, offset, spapr);
3854
3855    /*
3856     * spapr_dt_cpu() does not fill the 'name' property in the
3857     * CPU node. The function is called during the boot process, before
3858     * and after CAS, and overwriting the 'name' property written
3859     * by SLOF is not allowed.
3860     *
3861     * Write it manually after spapr_dt_cpu(). This makes the hotplugged
3862     * CPUs more compatible with the coldplugged ones, which have
3863     * the 'name' property. The Linux kernel also relies on this
3864     * property to identify CPU nodes.
3865     */
3866    _FDT((fdt_setprop_string(fdt, offset, "name", nodename)));
3867
3868    *fdt_start_offset = offset;
3869    return 0;
3870}
3871
3872static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
3873{
3874    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
3875    MachineClass *mc = MACHINE_GET_CLASS(spapr);
3876    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
3877    SpaprCpuCore *core = SPAPR_CPU_CORE(OBJECT(dev));
3878    CPUCore *cc = CPU_CORE(dev);
3879    CPUState *cs;
3880    SpaprDrc *drc;
3881    CPUArchId *core_slot;
3882    int index;
3883    bool hotplugged = spapr_drc_hotplugged(dev);
3884    int i;
3885
3886    core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3887    g_assert(core_slot); /* Already checked in spapr_core_pre_plug() */
3888
3889    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
3890                          spapr_vcpu_id(spapr, cc->core_id));
3891
3892    g_assert(drc || !mc->has_hotpluggable_cpus);
3893
3894    if (drc) {
3895        /*
3896         * spapr_core_pre_plug() already guarantees that this is a
3897         * brand new core being plugged into a free slot. Nothing
3898         * should already be attached to the corresponding DRC.
3899         */
3900        spapr_drc_attach(drc, dev);
3901
3902        if (hotplugged) {
3903            /*
3904             * Send hotplug notification interrupt to the guest only
3905             * in case of hotplugged CPUs.
3906             */
3907            spapr_hotplug_req_add_by_index(drc);
3908        } else {
3909            spapr_drc_reset(drc);
3910        }
3911    }
3912
3913    core_slot->cpu = OBJECT(dev);
3914
3915    /*
3916     * Set compatibility mode to match the boot CPU, which was either set
3917     * by the machine reset code or by CAS. This really shouldn't fail at
3918     * this point.
3919     */
3920    if (hotplugged) {
3921        for (i = 0; i < cc->nr_threads; i++) {
3922            ppc_set_compat(core->threads[i], POWERPC_CPU(first_cpu)->compat_pvr,
3923                           &error_abort);
3924        }
3925    }
3926
3927    if (smc->pre_2_10_has_unused_icps) {
3928        for (i = 0; i < cc->nr_threads; i++) {
3929            cs = CPU(core->threads[i]);
3930            pre_2_10_vmstate_unregister_dummy_icp(cs->cpu_index);
3931        }
3932    }
3933}
3934
3935static void spapr_core_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
3936                                Error **errp)
3937{
3938    MachineState *machine = MACHINE(OBJECT(hotplug_dev));
3939    MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
3940    CPUCore *cc = CPU_CORE(dev);
3941    const char *base_core_type = spapr_get_cpu_core_type(machine->cpu_type);
3942    const char *type = object_get_typename(OBJECT(dev));
3943    CPUArchId *core_slot;
3944    int index;
3945    unsigned int smp_threads = machine->smp.threads;
3946
3947    if (dev->hotplugged && !mc->has_hotpluggable_cpus) {
3948        error_setg(errp, "CPU hotplug not supported for this machine");
3949        return;
3950    }
3951
3952    if (strcmp(base_core_type, type)) {
3953        error_setg(errp, "CPU core type should be %s", base_core_type);
3954        return;
3955    }
3956
3957    if (cc->core_id % smp_threads) {
3958        error_setg(errp, "invalid core id %d", cc->core_id);
3959        return;
3960    }
3961
3962    /*
3963     * In general we should have homogeneous threads-per-core, but old
3964     * (pre hotplug support) machine types allow the last core to have
3965     * reduced threads as a compatibility hack for when we allowed a
3966     * total vcpu count that is not a multiple of threads-per-core.
3967     */
3968    if (mc->has_hotpluggable_cpus && (cc->nr_threads != smp_threads)) {
3969        error_setg(errp, "invalid nr-threads %d, must be %d", cc->nr_threads,
3970                   smp_threads);
3971        return;
3972    }
3973
3974    core_slot = spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index);
3975    if (!core_slot) {
3976        error_setg(errp, "core id %d out of range", cc->core_id);
3977        return;
3978    }
3979
3980    if (core_slot->cpu) {
3981        error_setg(errp, "core %d already populated", cc->core_id);
3982        return;
3983    }
3984
3985    numa_cpu_pre_plug(core_slot, dev, errp);
3986}
3987
3988int spapr_phb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
3989                          void *fdt, int *fdt_start_offset, Error **errp)
3990{
3991    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(drc->dev);
3992    int intc_phandle;
3993
3994    intc_phandle = spapr_irq_get_phandle(spapr, spapr->fdt_blob, errp);
3995    if (intc_phandle <= 0) {
3996        return -1;
3997    }
3998
3999    if (spapr_dt_phb(spapr, sphb, intc_phandle, fdt, fdt_start_offset)) {
4000        error_setg(errp, "unable to create FDT node for PHB %d", sphb->index);
4001        return -1;
4002    }
4003
4004    /* generally SLOF creates these; for hotplug it's up to QEMU */
4005    _FDT(fdt_setprop_string(fdt, *fdt_start_offset, "name", "pci"));
4006
4007    return 0;
4008}
4009
4010static bool spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4011                               Error **errp)
4012{
4013    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4014    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4015    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4016    const unsigned windows_supported = spapr_phb_windows_supported(sphb);
4017    SpaprDrc *drc;
4018
4019    if (dev->hotplugged && !smc->dr_phb_enabled) {
4020        error_setg(errp, "PHB hotplug not supported for this machine");
4021        return false;
4022    }
4023
4024    if (sphb->index == (uint32_t)-1) {
4025        error_setg(errp, "\"index\" for PAPR PHB is mandatory");
4026        return false;
4027    }
4028
4029    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4030    if (drc && drc->dev) {
4031        error_setg(errp, "PHB %d already attached", sphb->index);
4032        return false;
4033    }
4034
4035    /*
4036     * This will check that sphb->index doesn't exceed the maximum number of
4037     * PHBs for the current machine type.
4038     */
4039    return
4040        smc->phb_placement(spapr, sphb->index,
4041                           &sphb->buid, &sphb->io_win_addr,
4042                           &sphb->mem_win_addr, &sphb->mem64_win_addr,
4043                           windows_supported, sphb->dma_liobn,
4044                           &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
4045                           errp);
4046}
4047
4048static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4049{
4050    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4051    SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
4052    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4053    SpaprDrc *drc;
4054    bool hotplugged = spapr_drc_hotplugged(dev);
4055
4056    if (!smc->dr_phb_enabled) {
4057        return;
4058    }
4059
4060    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4061    /* hotplug hooks should check it's enabled before getting this far */
4062    assert(drc);
4063
4064    /* spapr_phb_pre_plug() already checked the DRC is attachable */
4065    spapr_drc_attach(drc, dev);
4066
4067    if (hotplugged) {
4068        spapr_hotplug_req_add_by_index(drc);
4069    } else {
4070        spapr_drc_reset(drc);
4071    }
4072}
4073
4074void spapr_phb_release(DeviceState *dev)
4075{
4076    HotplugHandler *hotplug_ctrl = qdev_get_hotplug_handler(dev);
4077
4078    hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
4079    object_unparent(OBJECT(dev));
4080}
4081
4082static void spapr_phb_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4083{
4084    qdev_unrealize(dev);
4085}
4086
4087static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev,
4088                                     DeviceState *dev, Error **errp)
4089{
4090    SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(dev);
4091    SpaprDrc *drc;
4092
4093    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PHB, sphb->index);
4094    assert(drc);
4095
4096    if (!spapr_drc_unplug_requested(drc)) {
4097        spapr_drc_unplug_request(drc);
4098        spapr_hotplug_req_remove_by_index(drc);
4099    } else {
4100        error_setg(errp,
4101                   "PCI Host Bridge unplug already in progress for device %s",
4102                   dev->id);
4103    }
4104}
4105
4106static
4107bool spapr_tpm_proxy_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
4108                              Error **errp)
4109{
4110    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4111
4112    if (spapr->tpm_proxy != NULL) {
4113        error_setg(errp, "Only one TPM proxy can be specified for this machine");
4114        return false;
4115    }
4116
4117    return true;
4118}
4119
4120static void spapr_tpm_proxy_plug(HotplugHandler *hotplug_dev, DeviceState *dev)
4121{
4122    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4123    SpaprTpmProxy *tpm_proxy = SPAPR_TPM_PROXY(dev);
4124
4125    /* Already checked in spapr_tpm_proxy_pre_plug() */
4126    g_assert(spapr->tpm_proxy == NULL);
4127
4128    spapr->tpm_proxy = tpm_proxy;
4129}
4130
4131static void spapr_tpm_proxy_unplug(HotplugHandler *hotplug_dev, DeviceState *dev)
4132{
4133    SpaprMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
4134
4135    qdev_unrealize(dev);
4136    object_unparent(OBJECT(dev));
4137    spapr->tpm_proxy = NULL;
4138}
4139
4140static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
4141                                      DeviceState *dev, Error **errp)
4142{
4143    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4144        spapr_memory_plug(hotplug_dev, dev);
4145    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4146        spapr_core_plug(hotplug_dev, dev);
4147    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4148        spapr_phb_plug(hotplug_dev, dev);
4149    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4150        spapr_tpm_proxy_plug(hotplug_dev, dev);
4151    }
4152}
4153
4154static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
4155                                        DeviceState *dev, Error **errp)
4156{
4157    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4158        spapr_memory_unplug(hotplug_dev, dev);
4159    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4160        spapr_core_unplug(hotplug_dev, dev);
4161    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4162        spapr_phb_unplug(hotplug_dev, dev);
4163    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4164        spapr_tpm_proxy_unplug(hotplug_dev, dev);
4165    }
4166}
4167
4168bool spapr_memory_hot_unplug_supported(SpaprMachineState *spapr)
4169{
4170    return spapr_ovec_test(spapr->ov5_cas, OV5_HP_EVT) ||
4171        /*
4172         * CAS will process all pending unplug requests.
4173         *
4174         * HACK: a guest could theoretically have cleared all bits in OV5,
4175         * but none of the guests we care for do.
4176         */
4177        spapr_ovec_empty(spapr->ov5_cas);
4178}
4179
4180static void spapr_machine_device_unplug_request(HotplugHandler *hotplug_dev,
4181                                                DeviceState *dev, Error **errp)
4182{
4183    SpaprMachineState *sms = SPAPR_MACHINE(OBJECT(hotplug_dev));
4184    MachineClass *mc = MACHINE_GET_CLASS(sms);
4185    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
4186
4187    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4188        if (spapr_memory_hot_unplug_supported(sms)) {
4189            spapr_memory_unplug_request(hotplug_dev, dev, errp);
4190        } else {
4191            error_setg(errp, "Memory hot unplug not supported for this guest");
4192        }
4193    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4194        if (!mc->has_hotpluggable_cpus) {
4195            error_setg(errp, "CPU hot unplug not supported on this machine");
4196            return;
4197        }
4198        spapr_core_unplug_request(hotplug_dev, dev, errp);
4199    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4200        if (!smc->dr_phb_enabled) {
4201            error_setg(errp, "PHB hot unplug not supported on this machine");
4202            return;
4203        }
4204        spapr_phb_unplug_request(hotplug_dev, dev, errp);
4205    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4206        spapr_tpm_proxy_unplug(hotplug_dev, dev);
4207    }
4208}
4209
4210static void spapr_machine_device_pre_plug(HotplugHandler *hotplug_dev,
4211                                          DeviceState *dev, Error **errp)
4212{
4213    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
4214        spapr_memory_pre_plug(hotplug_dev, dev, errp);
4215    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE)) {
4216        spapr_core_pre_plug(hotplug_dev, dev, errp);
4217    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE)) {
4218        spapr_phb_pre_plug(hotplug_dev, dev, errp);
4219    } else if (object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4220        spapr_tpm_proxy_pre_plug(hotplug_dev, dev, errp);
4221    }
4222}
4223
4224static HotplugHandler *spapr_get_hotplug_handler(MachineState *machine,
4225                                                 DeviceState *dev)
4226{
4227    if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
4228        object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_CPU_CORE) ||
4229        object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_PCI_HOST_BRIDGE) ||
4230        object_dynamic_cast(OBJECT(dev), TYPE_SPAPR_TPM_PROXY)) {
4231        return HOTPLUG_HANDLER(machine);
4232    }
4233    if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
4234        PCIDevice *pcidev = PCI_DEVICE(dev);
4235        PCIBus *root = pci_device_root_bus(pcidev);
4236        SpaprPhbState *phb =
4237            (SpaprPhbState *)object_dynamic_cast(OBJECT(BUS(root)->parent),
4238                                                 TYPE_SPAPR_PCI_HOST_BRIDGE);
4239
4240        if (phb) {
4241            return HOTPLUG_HANDLER(phb);
4242        }
4243    }
4244    return NULL;
4245}
4246
4247static CpuInstanceProperties
4248spapr_cpu_index_to_props(MachineState *machine, unsigned cpu_index)
4249{
4250    CPUArchId *core_slot;
4251    MachineClass *mc = MACHINE_GET_CLASS(machine);
4252
4253    /* make sure possible_cpus is initialized */
4254    mc->possible_cpu_arch_ids(machine);
4255    /* get CPU core slot containing thread that matches cpu_index */
4256    core_slot = spapr_find_cpu_slot(machine, cpu_index, NULL);
4257    assert(core_slot);
4258    return core_slot->props;
4259}
4260
4261static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx)
4262{
4263    return idx / ms->smp.cores % ms->numa_state->num_nodes;
4264}
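
/*
 * Example (topology assumed): with ms->smp.cores = 4 and 2 NUMA nodes,
 * core index 5 is assigned node 5 / 4 % 2 = 1 by the rule above.
 */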
4265
4266static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
4267{
4268    int i;
4269    unsigned int smp_threads = machine->smp.threads;
4270    unsigned int smp_cpus = machine->smp.cpus;
4271    const char *core_type;
4272    int spapr_max_cores = machine->smp.max_cpus / smp_threads;
4273    MachineClass *mc = MACHINE_GET_CLASS(machine);
4274
4275    if (!mc->has_hotpluggable_cpus) {
4276        spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
4277    }
4278    if (machine->possible_cpus) {
4279        assert(machine->possible_cpus->len == spapr_max_cores);
4280        return machine->possible_cpus;
4281    }
4282
4283    core_type = spapr_get_cpu_core_type(machine->cpu_type);
4284    if (!core_type) {
4285        error_report("Unable to find sPAPR CPU Core definition");
4286        exit(1);
4287    }
4288
4289    machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) +
4290                             sizeof(CPUArchId) * spapr_max_cores);
4291    machine->possible_cpus->len = spapr_max_cores;
4292    for (i = 0; i < machine->possible_cpus->len; i++) {
4293        int core_id = i * smp_threads;
4294
4295        machine->possible_cpus->cpus[i].type = core_type;
4296        machine->possible_cpus->cpus[i].vcpus_count = smp_threads;
4297        machine->possible_cpus->cpus[i].arch_id = core_id;
4298        machine->possible_cpus->cpus[i].props.has_core_id = true;
4299        machine->possible_cpus->cpus[i].props.core_id = core_id;
4300    }
4301    return machine->possible_cpus;
4302}
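
/*
 * Example (topology assumed): with -smp threads=8,maxcpus=32 and
 * hotpluggable CPUs, the code above creates 32 / 8 = 4 possible core
 * slots whose arch_id / core_id values are 0, 8, 16 and 24, each
 * accounting for 8 vcpus.
 */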
4303
4304static bool spapr_phb_placement(SpaprMachineState *spapr, uint32_t index,
4305                                uint64_t *buid, hwaddr *pio,
4306                                hwaddr *mmio32, hwaddr *mmio64,
4307                                unsigned n_dma, uint32_t *liobns,
4308                                hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
4309{
4310    /*
4311     * New-style PHB window placement.
4312     *
4313     * Goals: Give each PHB a large (1TiB), naturally aligned 64-bit
4314     * MMIO window, in addition to 2GiB 32-bit MMIO and 64kiB PIO
4315     * windows.
4316     *
4317     * Some guest kernels can't work with MMIO windows above 1<<46
4318     * (64TiB), so we place up to 31 PHBs in the area 32TiB..64TiB
4319     *
4320     * 32TiB..(33TiB+1984kiB) contains the 64kiB PIO windows for each
4321     * PHB stacked together.  (32TiB+2GiB)..(32TiB+64GiB) contains the
4322     * 2GiB 32-bit MMIO windows for each PHB.  Then 33..64TiB has the
4323     * 1TiB 64-bit MMIO windows for each PHB.
4324     */
4325    const uint64_t base_buid = 0x800000020000000ULL;
4326    int i;
4327
4328    /* Sanity check natural alignments */
4329    QEMU_BUILD_BUG_ON((SPAPR_PCI_BASE % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4330    QEMU_BUILD_BUG_ON((SPAPR_PCI_LIMIT % SPAPR_PCI_MEM64_WIN_SIZE) != 0);
4331    QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM64_WIN_SIZE % SPAPR_PCI_MEM32_WIN_SIZE) != 0);
4332    QEMU_BUILD_BUG_ON((SPAPR_PCI_MEM32_WIN_SIZE % SPAPR_PCI_IO_WIN_SIZE) != 0);
4333    /* Sanity check bounds */
4334    QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_IO_WIN_SIZE) >
4335                      SPAPR_PCI_MEM32_WIN_SIZE);
4336    QEMU_BUILD_BUG_ON((SPAPR_MAX_PHBS * SPAPR_PCI_MEM32_WIN_SIZE) >
4337                      SPAPR_PCI_MEM64_WIN_SIZE);
4338
4339    if (index >= SPAPR_MAX_PHBS) {
4340        error_setg(errp, "\"index\" for PAPR PHB is too large (max %llu)",
4341                   SPAPR_MAX_PHBS - 1);
4342        return false;
4343    }
4344
4345    *buid = base_buid + index;
4346    for (i = 0; i < n_dma; ++i) {
4347        liobns[i] = SPAPR_PCI_LIOBN(index, i);
4348    }
4349
4350    *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
4351    *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
4352    *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
4353
4354    *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
4355    *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
4356    return true;
4357}
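
/*
 * Worked example for index 0 (window sizes taken from the comment
 * above; the exact macro values are assumptions): with a 32 TiB base,
 * 64 kiB PIO, 2 GiB MMIO32 and 1 TiB MMIO64 windows:
 *
 *   buid   = 0x800000020000000
 *   pio    = 32 TiB + 0 * 64 kiB = 0x200000000000
 *   mmio32 = 32 TiB + 1 * 2 GiB  = 0x200080000000
 *   mmio64 = 32 TiB + 1 * 1 TiB  = 0x210000000000
 */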
4358
4359static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
4360{
4361    SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4362
4363    return ics_valid_irq(spapr->ics, irq) ? spapr->ics : NULL;
4364}
4365
4366static void spapr_ics_resend(XICSFabric *dev)
4367{
4368    SpaprMachineState *spapr = SPAPR_MACHINE(dev);
4369
4370    ics_resend(spapr->ics);
4371}
4372
4373static ICPState *spapr_icp_get(XICSFabric *xi, int vcpu_id)
4374{
4375    PowerPCCPU *cpu = spapr_find_cpu(vcpu_id);
4376
4377    return cpu ? spapr_cpu_state(cpu)->icp : NULL;
4378}
4379
4380static void spapr_pic_print_info(InterruptStatsProvider *obj,
4381                                 Monitor *mon)
4382{
4383    SpaprMachineState *spapr = SPAPR_MACHINE(obj);
4384
4385    spapr_irq_print_info(spapr, mon);
4386    monitor_printf(mon, "irqchip: %s\n",
4387                   kvm_irqchip_in_kernel() ? "in-kernel" : "emulated");
4388}
4389
4390/*
4391 * This is a XIVE only operation
4392 */
4393static int spapr_match_nvt(XiveFabric *xfb, uint8_t format,
4394                           uint8_t nvt_blk, uint32_t nvt_idx,
4395                           bool cam_ignore, uint8_t priority,
4396                           uint32_t logic_serv, XiveTCTXMatch *match)
4397{
4398    SpaprMachineState *spapr = SPAPR_MACHINE(xfb);
4399    XivePresenter *xptr = XIVE_PRESENTER(spapr->active_intc);
4400    XivePresenterClass *xpc = XIVE_PRESENTER_GET_CLASS(xptr);
4401    int count;
4402
4403    count = xpc->match_nvt(xptr, format, nvt_blk, nvt_idx, cam_ignore,
4404                           priority, logic_serv, match);
4405    if (count < 0) {
4406        return count;
4407    }
4408
4409    /*
4410     * When we implement the save and restore of the thread interrupt
4411     * contexts in the enter/exit CPU handlers of the machine and the
4412     * escalations in QEMU, we should be able to handle non dispatched
4413     * vCPUs.
4414     *
4415     * Until this is done, the sPAPR machine should always find at
4416     * least one matching context.
4417     */
4418    if (count == 0) {
4419        qemu_log_mask(LOG_GUEST_ERROR, "XIVE: NVT %x/%x is not dispatched\n",
4420                      nvt_blk, nvt_idx);
4421    }
4422
4423    return count;
4424}
4425
4426int spapr_get_vcpu_id(PowerPCCPU *cpu)
4427{
4428    return cpu->vcpu_id;
4429}
4430
4431bool spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
4432{
4433    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
4434    MachineState *ms = MACHINE(spapr);
4435    int vcpu_id;
4436
4437    vcpu_id = spapr_vcpu_id(spapr, cpu_index);
4438
4439    if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
4440        error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
4441        error_append_hint(errp, "Adjust the number of cpus to %d "
4442                          "or try to raise the number of threads per core\n",
4443                          vcpu_id * ms->smp.threads / spapr->vsmt);
4444        return false;
4445    }
4446
4447    cpu->vcpu_id = vcpu_id;
4448    return true;
4449}
4450
4451PowerPCCPU *spapr_find_cpu(int vcpu_id)
4452{
4453    CPUState *cs;
4454
4455    CPU_FOREACH(cs) {
4456        PowerPCCPU *cpu = POWERPC_CPU(cs);
4457
4458        if (spapr_get_vcpu_id(cpu) == vcpu_id) {
4459            return cpu;
4460        }
4461    }
4462
4463    return NULL;
4464}
4465
4466static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4467{
4468    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4469
4470    /* These are only called by TCG; KVM maintains the dispatch state */
4471
4472    spapr_cpu->prod = false;
4473    if (spapr_cpu->vpa_addr) {
4474        CPUState *cs = CPU(cpu);
4475        uint32_t dispatch;
4476
4477        dispatch = ldl_be_phys(cs->as,
4478                               spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4479        dispatch++;
4480        if ((dispatch & 1) != 0) {
4481            qemu_log_mask(LOG_GUEST_ERROR,
4482                          "VPA: incorrect dispatch counter value for "
4483                          "dispatched partition %u, correcting.\n", dispatch);
4484            dispatch++;
4485        }
4486        stl_be_phys(cs->as,
4487                    spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4488    }
4489}
4490
4491static void spapr_cpu_exec_exit(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
4492{
4493    SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
4494
4495    if (spapr_cpu->vpa_addr) {
4496        CPUState *cs = CPU(cpu);
4497        uint32_t dispatch;
4498
4499        dispatch = ldl_be_phys(cs->as,
4500                               spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER);
4501        dispatch++;
4502        if ((dispatch & 1) != 1) {
4503            qemu_log_mask(LOG_GUEST_ERROR,
4504                          "VPA: incorrect dispatch counter value for "
4505                          "preempted partition %u, correcting.\n", dispatch);
4506            dispatch++;
4507        }
4508        stl_be_phys(cs->as,
4509                    spapr_cpu->vpa_addr + VPA_DISPATCH_COUNTER, dispatch);
4510    }
4511}
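
/*
 * The VPA counter is bumped on both dispatch and preemption, so its
 * parity encodes the vCPU state. A minimal model of the corrections
 * applied above (hypothetical helper, for illustration only):
 */
static inline bool example_vpa_dispatched(uint32_t dispatch_count)
{
    /* even after exec_enter, odd after exec_exit, per the checks above */
    return (dispatch_count & 1) == 0;
}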
4512
4513static void spapr_machine_class_init(ObjectClass *oc, void *data)
4514{
4515    MachineClass *mc = MACHINE_CLASS(oc);
4516    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(oc);
4517    FWPathProviderClass *fwc = FW_PATH_PROVIDER_CLASS(oc);
4518    NMIClass *nc = NMI_CLASS(oc);
4519    HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
4520    PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc);
4521    XICSFabricClass *xic = XICS_FABRIC_CLASS(oc);
4522    InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
4523    XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc);
4524    VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc);
4525
4526    mc->desc = "pSeries Logical Partition (PAPR compliant)";
4527    mc->ignore_boot_device_suffixes = true;
4528
4529    /*
4530     * We set up the default / latest behaviour here.  The class_init
4531     * functions for the specific versioned machine types can override
4532     * these details for backwards compatibility
4533     */
4534    mc->init = spapr_machine_init;
4535    mc->reset = spapr_machine_reset;
4536    mc->block_default_type = IF_SCSI;
4537
4538    /*
4539     * Setting max_cpus to INT32_MAX. Both KVM and TCG max_cpus values
4540     * should be limited by the host capability instead of hardcoded.
4541     * max_cpus for KVM guests will be checked in kvm_init(), and TCG
4542     * guests are welcome to have as many CPUs as the host is capable
4543     * of emulating.
4544     */
4545    mc->max_cpus = INT32_MAX;
4546
4547    mc->no_parallel = 1;
4548    mc->default_boot_order = "";
4549    mc->default_ram_size = 512 * MiB;
4550    mc->default_ram_id = "ppc_spapr.ram";
4551    mc->default_display = "std";
4552    mc->kvm_type = spapr_kvm_type;
4553    machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE);
4554    mc->pci_allow_0_address = true;
4555    assert(!mc->get_hotplug_handler);
4556    mc->get_hotplug_handler = spapr_get_hotplug_handler;
4557    hc->pre_plug = spapr_machine_device_pre_plug;
4558    hc->plug = spapr_machine_device_plug;
4559    mc->cpu_index_to_instance_props = spapr_cpu_index_to_props;
4560    mc->get_default_cpu_node_id = spapr_get_default_cpu_node_id;
4561    mc->possible_cpu_arch_ids = spapr_possible_cpu_arch_ids;
4562    hc->unplug_request = spapr_machine_device_unplug_request;
4563    hc->unplug = spapr_machine_device_unplug;
4564
4565    smc->dr_lmb_enabled = true;
4566    smc->update_dt_enabled = true;
4567    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power9_v2.0");
4568    mc->has_hotpluggable_cpus = true;
4569    mc->nvdimm_supported = true;
4570    smc->resize_hpt_default = SPAPR_RESIZE_HPT_ENABLED;
4571    fwc->get_dev_path = spapr_get_fw_dev_path;
4572    nc->nmi_monitor_handler = spapr_nmi;
4573    smc->phb_placement = spapr_phb_placement;
4574    vhc->hypercall = emulate_spapr_hypercall;
4575    vhc->hpt_mask = spapr_hpt_mask;
4576    vhc->map_hptes = spapr_map_hptes;
4577    vhc->unmap_hptes = spapr_unmap_hptes;
4578    vhc->hpte_set_c = spapr_hpte_set_c;
4579    vhc->hpte_set_r = spapr_hpte_set_r;
4580    vhc->get_pate = spapr_get_pate;
4581    vhc->encode_hpt_for_kvm_pr = spapr_encode_hpt_for_kvm_pr;
4582    vhc->cpu_exec_enter = spapr_cpu_exec_enter;
4583    vhc->cpu_exec_exit = spapr_cpu_exec_exit;
4584    xic->ics_get = spapr_ics_get;
4585    xic->ics_resend = spapr_ics_resend;
4586    xic->icp_get = spapr_icp_get;
4587    ispc->print_info = spapr_pic_print_info;
4588    /* Force NUMA node memory size to be a multiple of
4589     * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
4590     * at which LMBs are represented and hot-added.
4591     */
4592    mc->numa_mem_align_shift = 28;
    mc->auto_enable_numa = true;

    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 16; /* 64 KiB */
    smc->default_caps.caps[SPAPR_CAP_NESTED_KVM_HV] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
    smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
    spapr_caps_add_properties(smc);
    smc->irq = &spapr_irq_dual;
    smc->dr_phb_enabled = true;
    smc->linux_pci_probe = true;
    smc->smp_threads_vsmt = true;
    smc->nr_xirqs = SPAPR_NR_XIRQS;
    xfc->match_nvt = spapr_match_nvt;
    vmc->client_architecture_support = spapr_vof_client_architecture_support;
    vmc->quiesce = spapr_vof_quiesce;
    vmc->setprop = spapr_vof_setprop;
}

static const TypeInfo spapr_machine_info = {
    .name          = TYPE_SPAPR_MACHINE,
    .parent        = TYPE_MACHINE,
    .abstract      = true,
    .instance_size = sizeof(SpaprMachineState),
    .instance_init = spapr_instance_init,
    .instance_finalize = spapr_machine_finalizefn,
    .class_size    = sizeof(SpaprMachineClass),
    .class_init    = spapr_machine_class_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_FW_PATH_PROVIDER },
        { TYPE_NMI },
        { TYPE_HOTPLUG_HANDLER },
        { TYPE_PPC_VIRTUAL_HYPERVISOR },
        { TYPE_XICS_FABRIC },
        { TYPE_INTERRUPT_STATS_PROVIDER },
        { TYPE_XIVE_FABRIC },
        { TYPE_VOF_MACHINE_IF },
        { }
    },
};

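/*
 * Only the newest versioned machine type gets the "pseries" alias and
 * becomes the default machine.
 */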
static void spapr_machine_latest_class_options(MachineClass *mc)
{
    mc->alias = "pseries";
    mc->is_default = true;
}

#define DEFINE_SPAPR_MACHINE(suffix, verstr, latest)                 \
    static void spapr_machine_##suffix##_class_init(ObjectClass *oc, \
                                                    void *data)      \
    {                                                                \
        MachineClass *mc = MACHINE_CLASS(oc);                        \
        spapr_machine_##suffix##_class_options(mc);                  \
        if (latest) {                                                \
            spapr_machine_latest_class_options(mc);                  \
        }                                                            \
    }                                                                \
    static const TypeInfo spapr_machine_##suffix##_info = {          \
        .name = MACHINE_TYPE_NAME("pseries-" verstr),                \
        .parent = TYPE_SPAPR_MACHINE,                                \
        .class_init = spapr_machine_##suffix##_class_init,           \
    };                                                               \
    static void spapr_machine_register_##suffix(void)                \
    {                                                                \
        type_register(&spapr_machine_##suffix##_info);               \
    }                                                                \
    type_init(spapr_machine_register_##suffix)
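
/*
 * For example, DEFINE_SPAPR_MACHINE(6_2, "6.2", true) generates
 * spapr_machine_6_2_class_init(), which applies
 * spapr_machine_6_2_class_options() plus, since "latest" is true, the
 * latest-machine options; a TypeInfo registering the
 * MACHINE_TYPE_NAME("pseries-6.2") type as a subclass of
 * TYPE_SPAPR_MACHINE; and a type_init() hook that registers that type.
 */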

/*
 * pseries-6.2
 */
static void spapr_machine_6_2_class_options(MachineClass *mc)
{
    /* Defaults for the latest behaviour inherited from the base class */
}

DEFINE_SPAPR_MACHINE(6_2, "6.2", true);

/*
 * pseries-6.1
 */
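/*
 * Note the pattern below: each versioned class_options function first
 * calls the next-newer version's options, then layers its own
 * compatibility overrides on top, so older machine types accumulate
 * every override between themselves and the latest version.
 */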
static void spapr_machine_6_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_6_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
    smc->pre_6_2_numa_affinity = true;
    mc->smp_props.prefer_sockets = true;
}

DEFINE_SPAPR_MACHINE(6_1, "6.1", false);

/*
 * pseries-6.0
 */
static void spapr_machine_6_0_class_options(MachineClass *mc)
{
    spapr_machine_6_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_6_0, hw_compat_6_0_len);
}

DEFINE_SPAPR_MACHINE(6_0, "6.0", false);

/*
 * pseries-5.2
 */
static void spapr_machine_5_2_class_options(MachineClass *mc)
{
    spapr_machine_6_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_2, hw_compat_5_2_len);
}

DEFINE_SPAPR_MACHINE(5_2, "5.2", false);

/*
 * pseries-5.1
 */
static void spapr_machine_5_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_5_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_1, hw_compat_5_1_len);
    smc->pre_5_2_numa_associativity = true;
}

DEFINE_SPAPR_MACHINE(5_1, "5.1", false);

/*
 * pseries-5.0
 */
static void spapr_machine_5_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
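    /*
     * Global compat properties like the one below force back-level
     * defaults on matching devices whenever this or any older machine
     * type is selected.
     */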
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-5.1-associativity", "on" },
    };

    spapr_machine_5_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->numa_mem_supported = true;
    smc->pre_5_1_assoc_refpoints = true;
}

DEFINE_SPAPR_MACHINE(5_0, "5.0", false);

/*
 * pseries-4.2
 */
static void spapr_machine_4_2_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_5_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_4_2, hw_compat_4_2_len);
    smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF;
    smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_OFF;
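    /* pseries-4.2 and older cap the real mode area (RMA) at 16 GiB */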
    smc->rma_limit = 16 * GiB;
    mc->nvdimm_supported = false;
}

DEFINE_SPAPR_MACHINE(4_2, "4.2", false);

/*
 * pseries-4.1
 */
static void spapr_machine_4_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        /* Only allow 4 KiB and 64 KiB IOMMU page sizes (bits 12 and 16) */
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pgsz", "0x11000" },
    };

    spapr_machine_4_2_class_options(mc);
    smc->linux_pci_probe = false;
    smc->smp_threads_vsmt = false;
    compat_props_add(mc->compat_props, hw_compat_4_1, hw_compat_4_1_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(4_1, "4.1", false);

/*
 * pseries-4.0
 */
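/*
 * pseries-4.0 and older reserve no windows for the NVLink2 GPU RAM
 * and ATSD MMIO, so this wrapper clears those outputs after doing the
 * default placement.
 */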
static bool phb_placement_4_0(SpaprMachineState *spapr, uint32_t index,
                              uint64_t *buid, hwaddr *pio,
                              hwaddr *mmio32, hwaddr *mmio64,
                              unsigned n_dma, uint32_t *liobns,
                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
    if (!spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma,
                             liobns, nv2gpa, nv2atsd, errp)) {
        return false;
    }

    *nv2gpa = 0;
    *nv2atsd = 0;
    return true;
}

static void spapr_machine_4_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_4_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len);
    smc->phb_placement = phb_placement_4_0;
    smc->irq = &spapr_irq_xics;
    smc->pre_4_1_migration = true;
}

DEFINE_SPAPR_MACHINE(4_0, "4.0", false);

/*
 * pseries-3.1
 */
static void spapr_machine_3_1_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_4_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_3_1, hw_compat_3_1_len);

    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
    smc->update_dt_enabled = false;
    smc->dr_phb_enabled = false;
    smc->broken_host_serial_model = true;
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
    smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF;
}

DEFINE_SPAPR_MACHINE(3_1, "3.1", false);

/*
 * pseries-3.0
 */

static void spapr_machine_3_0_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_3_1_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_3_0, hw_compat_3_0_len);

    smc->legacy_irq_allocation = true;
    smc->nr_xirqs = 0x400;
    smc->irq = &spapr_irq_xics_legacy;
}

DEFINE_SPAPR_MACHINE(3_0, "3.0", false);

/*
 * pseries-2.12
 */
static void spapr_machine_2_12_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_POWERPC_CPU, "pre-3.0-migration", "on" },
        { TYPE_SPAPR_CPU_CORE, "pre-3.0-migration", "on" },
    };

    spapr_machine_3_0_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_12, hw_compat_2_12_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));

    /*
     * We depend on kvm_enabled() to choose a default value for the
     * hpt-max-page-size capability.  We can't do it here because this
     * runs too early: the HW accelerator isn't initialized yet.
     * Postpone it to machine init (see default_caps_with_cpu()).
     */
    smc->default_caps.caps[SPAPR_CAP_HPT_MAXPAGESIZE] = 0;
}

DEFINE_SPAPR_MACHINE(2_12, "2.12", false);

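/*
 * pseries-2.12-sxxm: the 2.12 machine with the "speculative execution
 * exploit mitigations" (Spectre/Meltdown) enabled by default.
 */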
static void spapr_machine_2_12_sxxm_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_12_class_options(mc);
    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_WORKAROUND;
    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_FIXED_CCD;
}

DEFINE_SPAPR_MACHINE(2_12_sxxm, "2.12-sxxm", false);

/*
 * pseries-2.11
 */

static void spapr_machine_2_11_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_12_class_options(mc);
    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON;
    compat_props_add(mc->compat_props, hw_compat_2_11, hw_compat_2_11_len);
}

DEFINE_SPAPR_MACHINE(2_11, "2.11", false);

/*
 * pseries-2.10
 */

static void spapr_machine_2_10_class_options(MachineClass *mc)
{
    spapr_machine_2_11_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_10, hw_compat_2_10_len);
}

DEFINE_SPAPR_MACHINE(2_10, "2.10", false);

/*
 * pseries-2.9
 */

static void spapr_machine_2_9_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { TYPE_POWERPC_CPU, "pre-2.10-migration", "on" },
    };

    spapr_machine_2_10_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_9, hw_compat_2_9_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    smc->pre_2_10_has_unused_icps = true;
    smc->resize_hpt_default = SPAPR_RESIZE_HPT_DISABLED;
}

DEFINE_SPAPR_MACHINE(2_9, "2.9", false);

/*
 * pseries-2.8
 */

static void spapr_machine_2_8_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pcie-extended-configuration-space", "off" },
    };

    spapr_machine_2_9_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_8, hw_compat_2_8_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
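    /* pseries-2.8 and older only required 8 MiB (1 << 23) NUMA alignment */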
    mc->numa_mem_align_shift = 23;
}

DEFINE_SPAPR_MACHINE(2_8, "2.8", false);

/*
 * pseries-2.7
 */

static bool phb_placement_2_7(SpaprMachineState *spapr, uint32_t index,
                              uint64_t *buid, hwaddr *pio,
                              hwaddr *mmio32, hwaddr *mmio64,
                              unsigned n_dma, uint32_t *liobns,
                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
{
    /* Legacy PHB placement for pseries-2.7 and earlier machine types */
    const uint64_t base_buid = 0x800000020000000ULL;
    const hwaddr phb_spacing = 0x1000000000ULL; /* 64 GiB */
    const hwaddr mmio_offset = 0xa0000000; /* 2 GiB + 512 MiB */
    const hwaddr pio_offset = 0x80000000; /* 2 GiB */
    const uint32_t max_index = 255;
    const hwaddr phb0_alignment = 0x10000000000ULL; /* 1 TiB */

    uint64_t ram_top = MACHINE(spapr)->ram_size;
    hwaddr phb0_base, phb_base;
    int i;

    /* Do we have device memory? */
    if (MACHINE(spapr)->maxram_size > ram_top) {
        /*
         * Can't just use maxram_size, because there may be an
         * alignment gap between normal and device memory regions
         */
        ram_top = MACHINE(spapr)->device_memory->base +
            memory_region_size(&MACHINE(spapr)->device_memory->mr);
    }

    phb0_base = QEMU_ALIGN_UP(ram_top, phb0_alignment);

    if (index > max_index) {
        error_setg(errp, "\"index\" for PAPR PHB is too large (max %u)",
                   max_index);
        return false;
    }

    *buid = base_buid + index;
    for (i = 0; i < n_dma; ++i) {
        liobns[i] = SPAPR_PCI_LIOBN(index, i);
    }

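    /*
     * Each PHB gets its own 64 GiB window; windows are laid out
     * consecutively from phb0_base, which is aligned up to 1 TiB above
     * the top of RAM (or of device memory, when present).
     */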
    phb_base = phb0_base + index * phb_spacing;
    *pio = phb_base + pio_offset;
    *mmio32 = phb_base + mmio_offset;
    /*
     * We don't set the 64-bit MMIO window, relying on the PHB's
     * fallback behaviour of automatically splitting a large "32-bit"
     * window into contiguous 32-bit and 64-bit windows
     */

    *nv2gpa = 0;
    *nv2atsd = 0;
    return true;
}

static void spapr_machine_2_7_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
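    /*
     * pseries-2.7 and older used a single 62 GiB "32-bit" MMIO window
     * (mem_win_size 0xf80000000) and no separate 64-bit window.
     */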
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0xf80000000", },
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem64_win_size", "0", },
        { TYPE_POWERPC_CPU, "pre-2.8-migration", "on", },
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "pre-2.8-migration", "on", },
    };

    spapr_machine_2_8_class_options(mc);
    mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power7_v2.3");
    mc->default_machine_opts = "modern-hotplug-events=off";
    compat_props_add(mc->compat_props, hw_compat_2_7, hw_compat_2_7_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    smc->phb_placement = phb_placement_2_7;
}

DEFINE_SPAPR_MACHINE(2_7, "2.7", false);

/*
 * pseries-2.6
 */

static void spapr_machine_2_6_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "ddw", "off" },
    };

    spapr_machine_2_7_class_options(mc);
    mc->has_hotpluggable_cpus = false;
    compat_props_add(mc->compat_props, hw_compat_2_6, hw_compat_2_6_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_6, "2.6", false);

/*
 * pseries-2.5
 */

static void spapr_machine_2_5_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
    static GlobalProperty compat[] = {
        { "spapr-vlan", "use-rx-buffer-pools", "off" },
    };

    spapr_machine_2_6_class_options(mc);
    smc->use_ohci_by_default = true;
    compat_props_add(mc->compat_props, hw_compat_2_5, hw_compat_2_5_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_5, "2.5", false);

/*
 * pseries-2.4
 */

static void spapr_machine_2_4_class_options(MachineClass *mc)
{
    SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc);

    spapr_machine_2_5_class_options(mc);
    smc->dr_lmb_enabled = false;
    compat_props_add(mc->compat_props, hw_compat_2_4, hw_compat_2_4_len);
}

DEFINE_SPAPR_MACHINE(2_4, "2.4", false);

/*
 * pseries-2.3
 */

static void spapr_machine_2_3_class_options(MachineClass *mc)
{
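    /* PHBs on pseries-2.3 and older lack dynamic reconfiguration (hotplug) */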
    static GlobalProperty compat[] = {
        { "spapr-pci-host-bridge", "dynamic-reconfiguration", "off" },
    };

    spapr_machine_2_4_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_3, hw_compat_2_3_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
}

DEFINE_SPAPR_MACHINE(2_3, "2.3", false);

/*
 * pseries-2.2
 */

static void spapr_machine_2_2_class_options(MachineClass *mc)
{
    static GlobalProperty compat[] = {
        { TYPE_SPAPR_PCI_HOST_BRIDGE, "mem_win_size", "0x20000000" },
    };

    spapr_machine_2_3_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_2, hw_compat_2_2_len);
    compat_props_add(mc->compat_props, compat, G_N_ELEMENTS(compat));
    mc->default_machine_opts = "modern-hotplug-events=off,suppress-vmdesc=on";
}

DEFINE_SPAPR_MACHINE(2_2, "2.2", false);

/*
 * pseries-2.1
 */

static void spapr_machine_2_1_class_options(MachineClass *mc)
{
    spapr_machine_2_2_class_options(mc);
    compat_props_add(mc->compat_props, hw_compat_2_1, hw_compat_2_1_len);
}

DEFINE_SPAPR_MACHINE(2_1, "2.1", false);

static void spapr_machine_register_types(void)
{
    type_register_static(&spapr_machine_info);
}

type_init(spapr_machine_register_types)
