qemu/hw/ppc/spapr_nvdimm.c
/*
 * QEMU PAPR Storage Class Memory Interfaces
 *
 * Copyright (c) 2019-2020, IBM Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "hw/ppc/spapr_drc.h"
#include "hw/ppc/spapr_nvdimm.h"
#include "hw/mem/nvdimm.h"
#include "qemu/nvdimm-utils.h"
#include "hw/ppc/fdt.h"
#include "qemu/range.h"
#include "hw/ppc/spapr_numa.h"
#include "block/thread-pool.h"
#include "migration/vmstate.h"
#include "qemu/pmem.h"
#include "hw/qdev-properties.h"

/* DIMM health bitmap indicators. Taken from the kernel's papr_scm.c */
/* SCM device is unable to persist memory contents */
#define PAPR_PMEM_UNARMED PPC_BIT(0)

/*
 * The nvdimm size should be aligned to the SCM block size, and the SCM
 * block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE so that SCM
 * regions do not overlap with DIMM memory regions. SCM devices can have
 * variable block sizes; for now, fix the block size to the minimum value.
 */
#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE

/* Have an explicit check for alignment */
QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);

#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)

struct SPAPRNVDIMMClass {
    /* private */
    NVDIMMClass parent_class;

    /* public */
    void (*realize)(NVDIMMDevice *dimm, Error **errp);
    void (*unrealize)(NVDIMMDevice *dimm, Error **errp);
};

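/*
 * Pre-plug validation for (spapr-)nvdimm devices: checks that the machine
 * supports and has enabled NVDIMMs, that label-size and uuid are set, that
 * the size is a multiple of the SCM block size, and that a spapr-nvdimm
 * device is backed by a memory-backend-file (i.e. has a usable fd).
 */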
bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
                           uint64_t size, Error **errp)
{
    const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
    const MachineState *ms = MACHINE(hotplug_dev);
    PCDIMMDevice *dimm = PC_DIMM(nvdimm);
    MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
    g_autofree char *uuidstr = NULL;
    QemuUUID uuid;
    int ret;

    if (!mc->nvdimm_supported) {
        error_setg(errp, "NVDIMM hotplug not supported for this machine");
        return false;
    }

    if (!ms->nvdimms_state->is_enabled) {
        error_setg(errp, "nvdimm device found but 'nvdimm=off' was set");
        return false;
    }

    if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP,
                                &error_abort) == 0) {
        error_setg(errp, "PAPR requires NVDIMM devices to have label-size set");
        return false;
    }

    if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
        error_setg(errp, "PAPR requires NVDIMM memory size (excluding label)"
                   " to be a multiple of %" PRIu64 "MB",
                   SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
        return false;
    }

    uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP,
                                      &error_abort);
    ret = qemu_uuid_parse(uuidstr, &uuid);
    g_assert(!ret);

    if (qemu_uuid_is_null(&uuid)) {
        error_setg(errp, "NVDIMM device requires the uuid to be set");
        return false;
    }

    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
        (memory_region_get_fd(mr) < 0)) {
        error_setg(errp, "spapr-nvdimm device requires the "
                   "memdev %s to be of memory-backend-file type",
                   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
        return false;
    }

    return true;
}

void spapr_add_nvdimm(DeviceState *dev, uint64_t slot)
{
    SpaprDrc *drc;
    bool hotplugged = spapr_drc_hotplugged(dev);

    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
    g_assert(drc);

    /*
     * pc_dimm_get_free_slot() provided a free slot at pre-plug. The
     * corresponding DRC is thus assumed to be attachable.
     */
    spapr_drc_attach(drc, dev);

    if (hotplugged) {
        spapr_hotplug_req_add_by_index(drc);
    }
}

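/*
 * Creates the "ibm,pmemory@<drc-index>" device tree node for an NVDIMM
 * under the given parent node, with the properties (unit guid, block size,
 * number of blocks, metadata size, associativity, flush requirements)
 * consumed by the guest's papr_scm driver. Returns the node offset.
 */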
static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
                           int parent_offset, NVDIMMDevice *nvdimm)
{
    int child_offset;
    char *buf;
    SpaprDrc *drc;
    uint32_t drc_idx;
    uint32_t node = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_NODE_PROP,
                                             &error_abort);
    uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP,
                                             &error_abort);
    uint64_t lsize = nvdimm->label_size;
    uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                            NULL);

    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
    g_assert(drc);

    drc_idx = spapr_drc_index(drc);

    buf = g_strdup_printf("ibm,pmemory@%x", drc_idx);
    child_offset = fdt_add_subnode(fdt, parent_offset, buf);
    g_free(buf);

    _FDT(child_offset);

    _FDT((fdt_setprop_cell(fdt, child_offset, "reg", drc_idx)));
    _FDT((fdt_setprop_string(fdt, child_offset, "compatible", "ibm,pmemory")));
    _FDT((fdt_setprop_string(fdt, child_offset, "device_type", "ibm,pmemory")));

    spapr_numa_write_associativity_dt(spapr, fdt, child_offset, node);

    buf = qemu_uuid_unparse_strdup(&nvdimm->uuid);
    _FDT((fdt_setprop_string(fdt, child_offset, "ibm,unit-guid", buf)));
    g_free(buf);

    _FDT((fdt_setprop_cell(fdt, child_offset, "ibm,my-drc-index", drc_idx)));

    _FDT((fdt_setprop_u64(fdt, child_offset, "ibm,block-size",
                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
    _FDT((fdt_setprop_u64(fdt, child_offset, "ibm,number-of-blocks",
                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
    _FDT((fdt_setprop_cell(fdt, child_offset, "ibm,metadata-size", lsize)));

    _FDT((fdt_setprop_string(fdt, child_offset, "ibm,pmem-application",
                             "operating-system")));
    _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));

    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
        bool is_pmem = false, pmem_override = false;
        PCDIMMDevice *dimm = PC_DIMM(nvdimm);
        HostMemoryBackend *hostmem = dimm->hostmem;

        is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
        pmem_override = object_property_get_bool(OBJECT(nvdimm),
                                                 "pmem-override", NULL);
        if (!is_pmem || pmem_override) {
            _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
                             NULL, 0));
        }
    }

    return child_offset;
}

int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
                           void *fdt, int *fdt_start_offset, Error **errp)
{
    NVDIMMDevice *nvdimm = NVDIMM(drc->dev);

    *fdt_start_offset = spapr_dt_nvdimm(spapr, fdt, 0, nvdimm);

    return 0;
}

void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt)
{
    int offset = fdt_subnode_offset(fdt, 0, "ibm,persistent-memory");
    GSList *iter, *nvdimms = nvdimm_get_device_list();

    if (offset < 0) {
        offset = fdt_add_subnode(fdt, 0, "ibm,persistent-memory");
        _FDT(offset);
        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x1)));
        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
        _FDT((fdt_setprop_string(fdt, offset, "device_type",
                                 "ibm,persistent-memory")));
    }

    /* Create DT entries for cold plugged NVDIMM devices */
    for (iter = nvdimms; iter; iter = iter->next) {
        NVDIMMDevice *nvdimm = iter->data;

        spapr_dt_nvdimm(spapr, fdt, offset, nvdimm);
    }
    g_slist_free(nvdimms);
}

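/*
 * H_SCM_READ_METADATA
 * Input: drc_index, offset, len (1, 2, 4 or 8 bytes)
 * Out: the requested metadata bytes, zero-extended
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P3
 *
 * Reads len bytes at the given offset from the NVDIMM label area.
 */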
static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
                                        SpaprMachineState *spapr,
                                        target_ulong opcode,
                                        target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t offset = args[1];
    uint64_t len = args[2];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    NVDIMMClass *ddc;
    uint64_t data = 0;
    uint8_t buf[8] = { 0 };

    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    if (len != 1 && len != 2 &&
        len != 4 && len != 8) {
        return H_P3;
    }

    nvdimm = NVDIMM(drc->dev);
    if ((offset + len < offset) ||
        (nvdimm->label_size < len + offset)) {
        return H_P2;
    }

    ddc = NVDIMM_GET_CLASS(nvdimm);
    ddc->read_label_data(nvdimm, buf, len, offset);

    switch (len) {
    case 1:
        data = ldub_p(buf);
        break;
    case 2:
        data = lduw_be_p(buf);
        break;
    case 4:
        data = ldl_be_p(buf);
        break;
    case 8:
        data = ldq_be_p(buf);
        break;
    default:
        g_assert_not_reached();
    }

    args[0] = data;

    return H_SUCCESS;
}

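/*
 * H_SCM_WRITE_METADATA
 * Input: drc_index, offset, data, len (1, 2, 4 or 8 bytes)
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P4
 *
 * Writes len bytes of data at the given offset into the NVDIMM label area,
 * rejecting values that do not fit in the requested length.
 */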
static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
                                         SpaprMachineState *spapr,
                                         target_ulong opcode,
                                         target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t offset = args[1];
    uint64_t data = args[2];
    uint64_t len = args[3];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    NVDIMMClass *ddc;
    uint8_t buf[8] = { 0 };

    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    if (len != 1 && len != 2 &&
        len != 4 && len != 8) {
        return H_P4;
    }

    nvdimm = NVDIMM(drc->dev);
    if ((offset + len < offset) ||
        (nvdimm->label_size < len + offset)) {
        return H_P2;
    }

    switch (len) {
    case 1:
        if (data & 0xffffffffffffff00) {
            return H_P2;
        }
        stb_p(buf, data);
        break;
    case 2:
        if (data & 0xffffffffffff0000) {
            return H_P2;
        }
        stw_be_p(buf, data);
        break;
    case 4:
        if (data & 0xffffffff00000000) {
            return H_P2;
        }
        stl_be_p(buf, data);
        break;
    case 8:
        stq_be_p(buf, data);
        break;
    default:
        g_assert_not_reached();
    }

    ddc = NVDIMM_GET_CLASS(nvdimm);
    ddc->write_label_data(nvdimm, buf, len, offset);

    return H_SUCCESS;
}

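/*
 * H_SCM_BIND_MEM
 * Input: drc_index, starting_idx, no_of_scm_blocks_to_bind,
 *        target_logical_mem_addr, continue_token
 * Out: bound logical address, number of blocks bound
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P3, H_P5, H_OVERLAP
 *
 * QEMU maps the NVDIMM into the guest address space at device plug time,
 * so this hcall only validates the arguments and reports back the address
 * at which the requested blocks are already bound.
 */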
static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                   target_ulong opcode, target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t starting_idx = args[1];
    uint64_t no_of_scm_blocks_to_bind = args[2];
    uint64_t target_logical_mem_addr = args[3];
    uint64_t continue_token = args[4];
    uint64_t size;
    uint64_t total_no_of_scm_blocks;
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    hwaddr addr;
    NVDIMMDevice *nvdimm;

    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /*
     * The continue token should currently be zero: qemu has already bound
     * everything and this hcall doesn't return H_BUSY.
     */
    if (continue_token > 0) {
        return H_P5;
    }

    /* Currently qemu assigns the address. */
    if (target_logical_mem_addr != 0xffffffffffffffff) {
        return H_OVERLAP;
    }

    nvdimm = NVDIMM(drc->dev);

    size = object_property_get_uint(OBJECT(nvdimm),
                                    PC_DIMM_SIZE_PROP, &error_abort);

    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;

    if (starting_idx > total_no_of_scm_blocks) {
        return H_P2;
    }

    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
        return H_P3;
    }

    addr = object_property_get_uint(OBJECT(nvdimm),
                                    PC_DIMM_ADDR_PROP, &error_abort);

    addr += starting_idx * SPAPR_MINIMUM_SCM_BLOCK_SIZE;

    /* Already bound, return the target logical address in R5 */
    args[1] = addr;
    args[2] = no_of_scm_blocks_to_bind;

    return H_SUCCESS;
}

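/*
 * Tracks an in-flight or completed H_SCM_FLUSH request. The continue_token
 * identifies the request across repeated hcall invocations, and hcall_ret
 * holds the worker's result once the flush completes.
 */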
typedef struct SpaprNVDIMMDeviceFlushState {
    uint64_t continue_token;
    int64_t hcall_ret;
    uint32_t drcidx;

    QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
} SpaprNVDIMMDeviceFlushState;

typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
struct SpaprNVDIMMDevice {
    /* private */
    NVDIMMDevice parent_obj;

    bool hcall_flush_required;
    uint64_t nvdimm_flush_token;
    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;

    /* public */

    /*
     * Setting this property to 'on' forces QEMU to enable the hcall
     * flush for the nvdimm device even if the backend is a pmem
     */
    bool pmem_override;
};

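/*
 * Thread-pool worker for H_SCM_FLUSH: persists the NVDIMM contents either
 * via pmem_persist() for a pmem backend or qemu_fdatasync() on the backing
 * file descriptor otherwise.
 */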
static int flush_worker_cb(void *opaque)
{
    SpaprNVDIMMDeviceFlushState *state = opaque;
    SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
    PCDIMMDevice *dimm;
    HostMemoryBackend *backend;
    int backend_fd;

    g_assert(drc != NULL);

    dimm = PC_DIMM(drc->dev);
    backend = MEMORY_BACKEND(dimm->hostmem);
    backend_fd = memory_region_get_fd(&backend->mr);

    if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
        MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
        void *ptr = memory_region_get_ram_ptr(mr);
        size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
                                               NULL);

        /* flush pmem backend */
        pmem_persist(ptr, size);
    } else {
        /* flush raw backing image */
        if (qemu_fdatasync(backend_fd) < 0) {
            error_report("papr_scm: Could not sync nvdimm to backend file: %s",
                         strerror(errno));
            return H_HARDWARE;
        }
    }

    return H_SUCCESS;
}

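/*
 * Completion callback for the flush worker: records the hcall return value
 * and moves the request from the pending list to the completed list, where
 * spapr_nvdimm_get_flush_status() will find it.
 */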
static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
{
    SpaprNVDIMMDeviceFlushState *state = opaque;
    SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
    SpaprNVDIMMDevice *s_nvdimm;

    g_assert(drc != NULL);

    s_nvdimm = SPAPR_NVDIMM(drc->dev);

    state->hcall_ret = hcall_ret;
    QLIST_REMOVE(state, node);
    QLIST_INSERT_HEAD(&s_nvdimm->completed_nvdimm_flush_states, state, node);
}

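/*
 * Post-load hook for migration: rejects the migration if the destination
 * backend's pmem/pmem-override configuration contradicts what the guest
 * was told at the source, then resubmits any still-pending flush requests
 * to the thread pool.
 */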
static int spapr_nvdimm_flush_post_load(void *opaque, int version_id)
{
    SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque;
    SpaprNVDIMMDeviceFlushState *state;
    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
    HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(s_nvdimm)->hostmem);
    bool is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
    bool pmem_override = object_property_get_bool(OBJECT(s_nvdimm),
                                                  "pmem-override", NULL);
    bool dest_hcall_flush_required = pmem_override || !is_pmem;

    if (!s_nvdimm->hcall_flush_required && dest_hcall_flush_required) {
        error_report("The file backend for the spapr-nvdimm device %s at "
                     "source is a pmem, use pmem=on and pmem-override=off to "
                     "continue.", DEVICE(s_nvdimm)->id);
        return -EINVAL;
    }
    if (s_nvdimm->hcall_flush_required && !dest_hcall_flush_required) {
        error_report("The guest expects hcall-flush support for the "
                     "spapr-nvdimm device %s, use pmem-override=on to "
                     "continue.", DEVICE(s_nvdimm)->id);
        return -EINVAL;
    }

    QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
        thread_pool_submit_aio(pool, flush_worker_cb, state,
                               spapr_nvdimm_flush_completion_cb, state);
    }

    return 0;
}

static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
    .name = "spapr_nvdimm_flush_state",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
        VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
        VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
        VMSTATE_END_OF_LIST()
    },
};

const VMStateDescription vmstate_spapr_nvdimm_states = {
    .name = "spapr_nvdimm_states",
    .version_id = 1,
    .minimum_version_id = 1,
    .post_load = spapr_nvdimm_flush_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(hcall_flush_required, SpaprNVDIMMDevice),
        VMSTATE_UINT64(nvdimm_flush_token, SpaprNVDIMMDevice),
        VMSTATE_QLIST_V(completed_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
                        vmstate_spapr_nvdimm_flush_state,
                        SpaprNVDIMMDeviceFlushState, node),
        VMSTATE_QLIST_V(pending_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
                        vmstate_spapr_nvdimm_flush_state,
                        SpaprNVDIMMDeviceFlushState, node),
        VMSTATE_END_OF_LIST()
    },
};

/*
 * Assign a token and reserve it for the new flush state.
 */
static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
                                                SpaprNVDIMMDevice *spapr_nvdimm)
{
    SpaprNVDIMMDeviceFlushState *state;

    state = g_malloc0(sizeof(*state));

    spapr_nvdimm->nvdimm_flush_token++;
    /* Token zero means no job pending; assert on overflow to zero */
    g_assert(spapr_nvdimm->nvdimm_flush_token != 0);

    state->continue_token = spapr_nvdimm->nvdimm_flush_token;

    QLIST_INSERT_HEAD(&spapr_nvdimm->pending_nvdimm_flush_states, state, node);

    return state;
}

/*
 * spapr_nvdimm_finish_flushes
 *      Waits for all pending flush requests to complete
 *      their execution and frees the states
 */
void spapr_nvdimm_finish_flushes(void)
{
    SpaprNVDIMMDeviceFlushState *state, *next;
    GSList *list, *nvdimms;

    /*
     * Called on the reset path: the main loop thread, which runs the
     * pending BHs, has left the event loop and is executing the reset
     * path, finally reaching here. The other caller is the guest's
     * h_client_architecture_support, during early boot.
     */
    nvdimms = nvdimm_get_device_list();
    for (list = nvdimms; list; list = list->next) {
        NVDIMMDevice *nvdimm = list->data;
        if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
            SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(nvdimm);
            while (!QLIST_EMPTY(&s_nvdimm->pending_nvdimm_flush_states)) {
                aio_poll(qemu_get_aio_context(), true);
            }

            QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
                               node, next) {
                QLIST_REMOVE(state, node);
                g_free(state);
            }
        }
    }
    g_slist_free(nvdimms);
}

/*
 * spapr_nvdimm_get_flush_status
 *      Fetches the status of the hcall worker and returns
 *      H_LONG_BUSY_ORDER_10_MSEC if the worker is still running.
 */
static int spapr_nvdimm_get_flush_status(SpaprNVDIMMDevice *s_nvdimm,
                                         uint64_t token)
{
    SpaprNVDIMMDeviceFlushState *state, *node;

    QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
        if (state->continue_token == token) {
            return H_LONG_BUSY_ORDER_10_MSEC;
        }
    }

    QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
                       node, node) {
        if (state->continue_token == token) {
            int ret = state->hcall_ret;
            QLIST_REMOVE(state, node);
            g_free(state);
            return ret;
        }
    }

    /* Not found in the completed list either, so the token is invalid */
    return H_P2;
}

/*
 * H_SCM_FLUSH
 * Input: drc_index, continue-token
 * Out: continue-token
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_LONG_BUSY_ORDER_10_MSEC,
 *               H_UNSUPPORTED
 *
 * Given a DRC index, flush the data to the backend NVDIMM device. The hcall
 * returns H_LONG_BUSY_ORDER_10_MSEC when the flush takes longer and needs to
 * be issued multiple times in order to be completely serviced. The
 * continue-token from the output is to be passed in the argument list of
 * subsequent hcalls until the request is completely serviced, at which point
 * H_SUCCESS or an error is returned.
 */
static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                target_ulong opcode, target_ulong *args)
{
    int ret;
    uint32_t drc_index = args[0];
    uint64_t continue_token = args[1];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    PCDIMMDevice *dimm;
    HostMemoryBackend *backend = NULL;
    SpaprNVDIMMDeviceFlushState *state;
    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
    int fd;

    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    dimm = PC_DIMM(drc->dev);
    if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
        return H_PARAMETER;
    }

    if (continue_token == 0) {
        bool is_pmem = false, pmem_override = false;

        backend = MEMORY_BACKEND(dimm->hostmem);
        fd = memory_region_get_fd(&backend->mr);

        if (fd < 0) {
            return H_UNSUPPORTED;
        }

        is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
        pmem_override = object_property_get_bool(OBJECT(dimm),
                                                 "pmem-override", NULL);
        if (is_pmem && !pmem_override) {
            return H_UNSUPPORTED;
        }

        state = spapr_nvdimm_init_new_flush_state(SPAPR_NVDIMM(dimm));
        if (!state) {
            return H_HARDWARE;
        }

        state->drcidx = drc_index;

        thread_pool_submit_aio(pool, flush_worker_cb, state,
                               spapr_nvdimm_flush_completion_cb, state);

        continue_token = state->continue_token;
    }

    ret = spapr_nvdimm_get_flush_status(SPAPR_NVDIMM(dimm), continue_token);
    if (H_IS_LONG_BUSY(ret)) {
        args[0] = continue_token;
    }

    return ret;
}

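/*
 * H_SCM_UNBIND_MEM
 * Input: drc_index, starting_scm_logical_addr, no_of_scm_blocks_to_unbind,
 *        continue_token
 * Out: number of blocks unbound
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P3, H_P4
 *
 * Validates that the requested range lies within the bound NVDIMM region;
 * the actual unbind is handled at device unplug.
 */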
static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                     target_ulong opcode, target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t starting_scm_logical_addr = args[1];
    uint64_t no_of_scm_blocks_to_unbind = args[2];
    uint64_t continue_token = args[3];
    uint64_t size_to_unbind;
    Range blockrange = range_empty;
    Range nvdimmrange = range_empty;
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    uint64_t size, addr;

    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
    if (continue_token > 0) {
        return H_P4;
    }

    /* Check if starting_scm_logical_addr is block aligned */
    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
        return H_P2;
    }

    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
    if (no_of_scm_blocks_to_unbind == 0 || no_of_scm_blocks_to_unbind !=
                               size_to_unbind / SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
        return H_P3;
    }

    nvdimm = NVDIMM(drc->dev);
    size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                   &error_abort);
    addr = object_property_get_int(OBJECT(nvdimm), PC_DIMM_ADDR_PROP,
                                   &error_abort);

    range_init_nofail(&nvdimmrange, addr, size);
    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);

    if (!range_contains_range(&nvdimmrange, &blockrange)) {
        return H_P3;
    }

    args[1] = no_of_scm_blocks_to_unbind;

    /* let unplug take care of actual unbind */
    return H_SUCCESS;
}

#define H_UNBIND_SCOPE_ALL 0x1
#define H_UNBIND_SCOPE_DRC 0x2

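/*
 * H_SCM_UNBIND_ALL
 * Input: target_scope (H_UNBIND_SCOPE_ALL or H_UNBIND_SCOPE_DRC), drc_index,
 *        continue_token
 * Out: number of blocks unbound
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P4
 *
 * Reports the number of SCM blocks that would be unbound for a single DRC
 * or for all NVDIMMs; the actual unbind is handled at device unplug.
 */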
static target_ulong h_scm_unbind_all(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                     target_ulong opcode, target_ulong *args)
{
    uint64_t target_scope = args[0];
    uint32_t drc_index = args[1];
    uint64_t continue_token = args[2];
    NVDIMMDevice *nvdimm;
    uint64_t size;
    uint64_t no_of_scm_blocks_unbound = 0;

    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
    if (continue_token > 0) {
        return H_P4;
    }

    if (target_scope == H_UNBIND_SCOPE_DRC) {
        SpaprDrc *drc = spapr_drc_by_index(drc_index);

        if (!drc || !drc->dev ||
            spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
            return H_P2;
        }

        nvdimm = NVDIMM(drc->dev);
        size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                       &error_abort);

        no_of_scm_blocks_unbound = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
    } else if (target_scope == H_UNBIND_SCOPE_ALL) {
        GSList *list, *nvdimms;

        nvdimms = nvdimm_get_device_list();
        for (list = nvdimms; list; list = list->next) {
            nvdimm = list->data;
            size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                           &error_abort);

            no_of_scm_blocks_unbound += size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
        }
        g_slist_free(nvdimms);
    } else {
        return H_PARAMETER;
    }

    args[1] = no_of_scm_blocks_unbound;

    /* let unplug take care of actual unbind */
    return H_SUCCESS;
}

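/*
 * H_SCM_HEALTH
 * Input: drc_index
 * Out: health bitmap, health bitmap mask
 * Return Value: H_SUCCESS, H_PARAMETER
 *
 * Reports the device health; currently only the PAPR_PMEM_UNARMED bit is
 * supported, mirroring the device's "unarmed" property.
 */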
static target_ulong h_scm_health(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                 target_ulong opcode, target_ulong *args)
{
    NVDIMMDevice *nvdimm;
    uint64_t hbitmap = 0;
    uint32_t drc_index = args[0];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    const uint64_t hbitmap_mask = PAPR_PMEM_UNARMED;

    /* Ensure the DRC is valid, is a PMEM DIMM, and is plugged in */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    nvdimm = NVDIMM(drc->dev);

    /* Check if the nvdimm is unarmed and report it via the health bitmap */
    if (object_property_get_bool(OBJECT(nvdimm), NVDIMM_UNARMED_PROP, NULL)) {
        hbitmap |= PAPR_PMEM_UNARMED;
    }

    /* Update the out args with health bitmap/mask */
    args[0] = hbitmap;
    args[1] = hbitmap_mask;

    return H_SUCCESS;
}

static void spapr_scm_register_types(void)
{
    /* qemu/scm specific hcalls */
    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
    spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
    spapr_register_hypercall(H_SCM_HEALTH, h_scm_health);
    spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush);
}

type_init(spapr_scm_register_types)

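/*
 * Realize hook for the spapr-nvdimm device: records whether the guest
 * must use H_SCM_FLUSH (no pmem backend, or pmem-override set) and
 * registers the flush state for migration.
 */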
static void spapr_nvdimm_realize(NVDIMMDevice *dimm, Error **errp)
{
    SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(dimm);
    HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(dimm)->hostmem);
    bool is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
    bool pmem_override = object_property_get_bool(OBJECT(dimm),
                                                  "pmem-override", NULL);

    if (!is_pmem || pmem_override) {
        s_nvdimm->hcall_flush_required = true;
    }

    vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY,
                     &vmstate_spapr_nvdimm_states, dimm);
}

static void spapr_nvdimm_unrealize(NVDIMMDevice *dimm)
{
    vmstate_unregister(NULL, &vmstate_spapr_nvdimm_states, dimm);
}

static Property spapr_nvdimm_properties[] = {
#ifdef CONFIG_LIBPMEM
    DEFINE_PROP_BOOL("pmem-override", SpaprNVDIMMDevice, pmem_override, false),
#endif
    DEFINE_PROP_END_OF_LIST(),
};

static void spapr_nvdimm_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    NVDIMMClass *nvc = NVDIMM_CLASS(oc);

    nvc->realize = spapr_nvdimm_realize;
    nvc->unrealize = spapr_nvdimm_unrealize;

    device_class_set_props(dc, spapr_nvdimm_properties);
}

static void spapr_nvdimm_init(Object *obj)
{
    SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(obj);

    s_nvdimm->hcall_flush_required = false;
    QLIST_INIT(&s_nvdimm->pending_nvdimm_flush_states);
    QLIST_INIT(&s_nvdimm->completed_nvdimm_flush_states);
}

static const TypeInfo spapr_nvdimm_info = {
    .name          = TYPE_SPAPR_NVDIMM,
    .parent        = TYPE_NVDIMM,
    .class_init    = spapr_nvdimm_class_init,
    .class_size    = sizeof(SPAPRNVDIMMClass),
    .instance_size = sizeof(SpaprNVDIMMDevice),
    .instance_init = spapr_nvdimm_init,
};

static void spapr_nvdimm_register_types(void)
{
    type_register_static(&spapr_nvdimm_info);
}

type_init(spapr_nvdimm_register_types)