qemu/hw/remote/vfio-user-obj.c
/**
 * QEMU vfio-user-server object
 *
 * Copyright © 2022 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 *
 * See the COPYING file in the top-level directory.
 *
 */

/**
 * Usage: add options:
 *     -machine x-remote,vfio-user=on,auto-shutdown=on
 *     -device <PCI-device>,id=<pci-dev-id>
 *     -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
 *             device=<pci-dev-id>
 *
 * Note that the x-vfio-user-server object must be used with the x-remote
 * machine only. For now, this server supports PCI devices only.
 *
 * type - SocketAddress type - presently only "unix" is supported. A required
 *        option
 *
 * path - named unix socket, it will be created by the server. A required
 *        option
 *
 * device - id of a device on the server. Only PCI devices are presently
 *          supported. A required option
 *
 * notes - x-vfio-user-server may block IO and the monitor during the
 *         initialization phase.
 *
 *         When the x-remote machine has the auto-shutdown property
 *         enabled (default), the x-vfio-user-server terminates after the
 *         last client disconnects. Otherwise, it will continue running
 *         until explicitly killed.
 */
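
/*
 * For example (the LSI SCSI controller and the socket path below are
 * purely illustrative):
 *     -machine x-remote,vfio-user=on,auto-shutdown=on
 *     -device lsi53c895a,id=lsi1
 *     -object x-vfio-user-server,id=vfu1,type=unix,path=/tmp/vfu1.sock,
 *             device=lsi1
 */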

#include "qemu/osdep.h"

#include "qom/object.h"
#include "qom/object_interfaces.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "sysemu/runstate.h"
#include "hw/boards.h"
#include "hw/remote/machine.h"
#include "qapi/error.h"
#include "qapi/qapi-visit-sockets.h"
#include "qapi/qapi-events-misc.h"
#include "qemu/notify.h"
#include "qemu/thread.h"
#include "qemu/main-loop.h"
#include "sysemu/sysemu.h"
#include "libvfio-user.h"
#include "hw/qdev-core.h"
#include "hw/pci/pci.h"
#include "qemu/timer.h"
#include "exec/memory.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/remote/vfio-user-obj.h"

#define TYPE_VFU_OBJECT "x-vfio-user-server"
OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)

/**
 * VFU_OBJECT_ERROR - reports an error message.
 *
 * If auto_shutdown is set, it aborts the machine on error. Otherwise,
 * it logs an error message without aborting. auto_shutdown is disabled
 * when the server serves clients from multiple VMs; as such, an error
 * from one VM shouldn't be able to disrupt the services of the other VMs.
 */
#define VFU_OBJECT_ERROR(o, fmt, ...)                                     \
    {                                                                     \
        if (vfu_object_auto_shutdown()) {                                 \
            error_setg(&error_abort, (fmt), ## __VA_ARGS__);              \
        } else {                                                          \
            error_report((fmt), ## __VA_ARGS__);                          \
        }                                                                 \
    }                                                                     \

struct VfuObjectClass {
    ObjectClass parent_class;

    unsigned int nr_devs;
};

struct VfuObject {
    /* private */
    Object parent;

    SocketAddress *socket;

    char *device;

    Error *err;

    Notifier machine_done;

    vfu_ctx_t *vfu_ctx;

    PCIDevice *pci_dev;

    Error *unplug_blocker;

    int vfu_poll_fd;

    MSITriggerFunc *default_msi_trigger;
    MSIPrepareMessageFunc *default_msi_prepare_message;
    MSIxPrepareMessageFunc *default_msix_prepare_message;
};

static void vfu_object_init_ctx(VfuObject *o, Error **errp);

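/*
 * Returns the value of the x-remote machine's "auto-shutdown" property.
 * Defaults to true if there is no machine yet or the property is absent.
 */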
static bool vfu_object_auto_shutdown(void)
{
    bool auto_shutdown = true;
    Error *local_err = NULL;

    if (!current_machine) {
        return auto_shutdown;
    }

    auto_shutdown = object_property_get_bool(OBJECT(current_machine),
                                             "auto-shutdown",
                                             &local_err);

    /*
     * local_err would be set if no such property exists - safe to ignore.
     * Unlikely scenario as auto-shutdown is always defined for
     * TYPE_REMOTE_MACHINE, and TYPE_VFU_OBJECT only works with
     * TYPE_REMOTE_MACHINE.
     */
    if (local_err) {
        auto_shutdown = true;
        error_free(local_err);
    }

    return auto_shutdown;
}

static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
                                  void *opaque, Error **errp)
{
    VfuObject *o = VFU_OBJECT(obj);

    if (o->vfu_ctx) {
        error_setg(errp, "vfu: Unable to set socket property - server busy");
        return;
    }

    qapi_free_SocketAddress(o->socket);

    o->socket = NULL;

    visit_type_SocketAddress(v, name, &o->socket, errp);

    if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
        error_setg(errp, "vfu: Unsupported socket type - %s",
                   SocketAddressType_str(o->socket->type));
        qapi_free_SocketAddress(o->socket);
        o->socket = NULL;
        return;
    }

    trace_vfu_prop("socket", o->socket->u.q_unix.path);

    vfu_object_init_ctx(o, errp);
}

static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
{
    VfuObject *o = VFU_OBJECT(obj);

    if (o->vfu_ctx) {
        error_setg(errp, "vfu: Unable to set device property - server busy");
        return;
    }

    g_free(o->device);

    o->device = g_strdup(str);

    trace_vfu_prop("device", str);

    vfu_object_init_ctx(o, errp);
}

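/*
 * Poll-fd handler installed once a client is attached. Processes pending
 * vfio-user messages via vfu_run_ctx(). When the client hangs up (ENOTCONN),
 * a VFU_CLIENT_HANGUP QAPI event is emitted and the object is unparented;
 * other errors are reported through VFU_OBJECT_ERROR.
 */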
static void vfu_object_ctx_run(void *opaque)
{
    VfuObject *o = opaque;
    const char *vfu_id;
    char *vfu_path, *pci_dev_path;
    int ret = -1;

    while (ret != 0) {
        ret = vfu_run_ctx(o->vfu_ctx);
        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            } else if (errno == ENOTCONN) {
                vfu_id = object_get_canonical_path_component(OBJECT(o));
                vfu_path = object_get_canonical_path(OBJECT(o));
                g_assert(o->pci_dev);
                pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
                /* o->device is a required property and is non-NULL here */
                g_assert(o->device);
                qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
                                                  o->device, pci_dev_path);
                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
                o->vfu_poll_fd = -1;
                object_unparent(OBJECT(o));
                g_free(vfu_path);
                g_free(pci_dev_path);
                break;
            } else {
                VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
                                 o->device, strerror(errno));
                break;
            }
        }
    }
}

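/*
 * Poll-fd handler used while waiting for a client. vfu_attach_ctx() is
 * retried every 500ms as long as it returns EAGAIN/EWOULDBLOCK; once a
 * client attaches, the poll fd handler is switched to vfu_object_ctx_run().
 */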
static void vfu_object_attach_ctx(void *opaque)
{
    VfuObject *o = opaque;
    GPollFD pfds[1];
    int ret;

    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);

    pfds[0].fd = o->vfu_poll_fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

retry_attach:
    ret = vfu_attach_ctx(o->vfu_ctx);
    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        /**
         * vfu_object_attach_ctx can block QEMU's main loop
         * during attach - the monitor and other IO
         * could be unresponsive during this time.
         */
        (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
        goto retry_attach;
    } else if (ret < 0) {
        VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s",
                         o->device, strerror(errno));
        return;
    }

    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
    if (o->vfu_poll_fd < 0) {
        VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
        return;
    }

    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
}

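/*
 * Config space access handler registered with libvfio-user. The access is
 * split into chunks of at most 4 bytes, which are forwarded to
 * pci_host_config_write_common()/pci_host_config_read_common().
 */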
static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
                                     size_t count, loff_t offset,
                                     const bool is_write)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    uint32_t pci_access_width = sizeof(uint32_t);
    size_t bytes = count;
    uint32_t val = 0;
    char *ptr = buf;
    int len;

    /*
     * Writes to the BAR registers would trigger an update to the
     * global Memory and IO AddressSpaces. But the remote device
     * never uses the global AddressSpaces, therefore overlapping
     * memory regions are not a problem.
     */
    while (bytes > 0) {
        len = (bytes > pci_access_width) ? pci_access_width : bytes;
        if (is_write) {
            memcpy(&val, ptr, len);
            pci_host_config_write_common(o->pci_dev, offset,
                                         pci_config_size(o->pci_dev),
                                         val, len);
            trace_vfu_cfg_write(offset, val);
        } else {
            val = pci_host_config_read_common(o->pci_dev, offset,
                                              pci_config_size(o->pci_dev), len);
            memcpy(ptr, &val, len);
            trace_vfu_cfg_read(offset, val);
        }
        offset += len;
        ptr += len;
        bytes -= len;
    }

    return count;
}

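/*
 * Called by libvfio-user when the client registers a DMA region. The
 * client-provided mapping (info->vaddr) is wrapped in a RAM MemoryRegion
 * and added to the device's DMA address space at the region's IOVA.
 */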
static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    AddressSpace *dma_as = NULL;
    MemoryRegion *subregion = NULL;
    g_autofree char *name = NULL;
    struct iovec *iov = &info->iova;

    if (!info->vaddr) {
        return;
    }

    name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
                           (uint64_t)info->vaddr);

    subregion = g_new0(MemoryRegion, 1);

    memory_region_init_ram_ptr(subregion, NULL, name,
                               iov->iov_len, info->vaddr);

    dma_as = pci_device_iommu_address_space(o->pci_dev);

    memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion);

    trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
}

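/*
 * Called by libvfio-user when the client unregisters a DMA region; removes
 * and destroys the MemoryRegion that dma_register() created for it.
 */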
static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    AddressSpace *dma_as = NULL;
    MemoryRegion *mr = NULL;
    ram_addr_t offset;

    mr = memory_region_from_host(info->vaddr, &offset);
    if (!mr) {
        return;
    }

    dma_as = pci_device_iommu_address_space(o->pci_dev);

    memory_region_del_subregion(dma_as->root, mr);

    object_unparent(OBJECT(mr));

    trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
}

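/*
 * Read or write 'size' bytes at 'offset' within a MemoryRegion. RAM-backed
 * regions are accessed directly with memcpy(); MMIO regions are dispatched
 * in device-supported access sizes, much like flatview_read/write_continue().
 */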
static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
                            hwaddr size, const bool is_write)
{
    uint8_t *ptr = buf;
    bool release_lock = false;
    uint8_t *ram_ptr = NULL;
    MemTxResult result;
    int access_size;
    uint64_t val;

    if (memory_access_is_direct(mr, is_write)) {
        /**
         * Some devices expose a PCI expansion ROM, which could be buffer
         * based as compared to other regions which are primarily based on
         * MemoryRegionOps. memory_region_find() would already check
         * for buffer overflow, we don't need to repeat it here.
         */
        ram_ptr = memory_region_get_ram_ptr(mr);

        if (is_write) {
            memcpy((ram_ptr + offset), buf, size);
        } else {
            memcpy(buf, (ram_ptr + offset), size);
        }

        return 0;
    }

    while (size) {
        /**
         * The read/write logic used below is similar to that in
         * flatview_read/write_continue()
         */
        release_lock = prepare_mmio_access(mr);

        access_size = memory_access_size(mr, size, offset);

        if (is_write) {
            val = ldn_he_p(ptr, access_size);

            result = memory_region_dispatch_write(mr, offset, val,
                                                  size_memop(access_size),
                                                  MEMTXATTRS_UNSPECIFIED);
        } else {
            result = memory_region_dispatch_read(mr, offset, &val,
                                                 size_memop(access_size),
                                                 MEMTXATTRS_UNSPECIFIED);

            stn_he_p(ptr, access_size, val);
        }

        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        if (result != MEMTX_OK) {
            return -1;
        }

        size -= access_size;
        ptr += access_size;
        offset += access_size;
    }

    return 0;
}

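/*
 * Perform a BAR access on behalf of the client. The BAR's memory is walked
 * section by section with memory_region_find() and each section is accessed
 * through vfu_object_mr_rw(); the access stops early on unmapped addresses
 * or on writes to read-only sections, returning the bytes completed so far.
 */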
static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
                                hwaddr bar_offset, char * const buf,
                                hwaddr len, const bool is_write)
{
    MemoryRegionSection section = { 0 };
    uint8_t *ptr = (uint8_t *)buf;
    MemoryRegion *section_mr = NULL;
    uint64_t section_size;
    hwaddr section_offset;
    hwaddr size = 0;

    while (len) {
        section = memory_region_find(pci_dev->io_regions[pci_bar].memory,
                                     bar_offset, len);

        if (!section.mr) {
            warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
            return size;
        }

        section_mr = section.mr;
        section_offset = section.offset_within_region;
        section_size = int128_get64(section.size);

        if (is_write && section_mr->readonly) {
            warn_report("vfu: attempting to write to readonly region in "
                        "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
                        pci_bar, bar_offset,
                        (bar_offset + section_size));
            memory_region_unref(section_mr);
            return size;
        }

        if (vfu_object_mr_rw(section_mr, ptr, section_offset,
                             section_size, is_write)) {
            warn_report("vfu: failed to %s "
                        "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
                        is_write ? "write to" : "read from", bar_offset,
                        (bar_offset + section_size), pci_bar);
            memory_region_unref(section_mr);
            return size;
        }

        size += section_size;
        bar_offset += section_size;
        ptr += section_size;
        len -= section_size;

        memory_region_unref(section_mr);
    }

    return size;
}

/**
 * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs.
 *
 * To create a handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would
 * define vfu_object_bar2_handler
 */
#define VFU_OBJECT_BAR_HANDLER(BAR_NO)                                         \
    static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx,        \
                                        char * const buf, size_t count,        \
                                        loff_t offset, const bool is_write)    \
    {                                                                          \
        VfuObject *o = vfu_get_private(vfu_ctx);                               \
        PCIDevice *pci_dev = o->pci_dev;                                       \
                                                                               \
        return vfu_object_bar_rw(pci_dev, BAR_NO, offset,                      \
                                 buf, count, is_write);                        \
    }                                                                          \

VFU_OBJECT_BAR_HANDLER(0)
VFU_OBJECT_BAR_HANDLER(1)
VFU_OBJECT_BAR_HANDLER(2)
VFU_OBJECT_BAR_HANDLER(3)
VFU_OBJECT_BAR_HANDLER(4)
VFU_OBJECT_BAR_HANDLER(5)
VFU_OBJECT_BAR_HANDLER(6)

static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
    &vfu_object_bar0_handler,
    &vfu_object_bar1_handler,
    &vfu_object_bar2_handler,
    &vfu_object_bar3_handler,
    &vfu_object_bar4_handler,
    &vfu_object_bar5_handler,
    &vfu_object_bar6_handler,
};

/**
 * vfu_object_register_bars - Identify active BAR regions of pdev and set up
 *                            callbacks to handle read/write accesses
 */
static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
{
    int flags = VFU_REGION_FLAG_RW;
    int i;

    for (i = 0; i < PCI_NUM_REGIONS; i++) {
        if (!pdev->io_regions[i].size) {
            continue;
        }

        if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
            pdev->io_regions[i].memory->readonly) {
            flags &= ~VFU_REGION_FLAG_WRITE;
        }

        vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
                         (size_t)pdev->io_regions[i].size,
                         vfu_object_bar_handlers[i],
                         flags, NULL, 0, -1, 0);

        trace_vfu_bar_register(i, pdev->io_regions[i].addr,
                               pdev->io_regions[i].size);
    }
}

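/*
 * INTx routing: encode the device's bus/devfn (BDF) as its IRQ number so
 * that vfu_object_set_irq() can recover which device raised the interrupt.
 */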
static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
{
    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
                                pci_dev->devfn);

    return pci_bdf;
}

static void vfu_object_set_irq(void *opaque, int pirq, int level)
{
    PCIBus *pci_bus = opaque;
    PCIDevice *pci_dev = NULL;
    vfu_ctx_t *vfu_ctx = NULL;
    int pci_bus_num, devfn;

    if (level) {
        pci_bus_num = PCI_BUS_NUM(pirq);
        devfn = PCI_BDF_TO_DEVFN(pirq);

        /*
         * pci_find_device() is O(1) if the device is attached to the
         * root PCI bus. If the device is attached to a secondary PCI
         * bus (such as when a root port is involved), finding the
         * parent PCI bus could take O(n)
         */
        pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);

        vfu_ctx = pci_dev->irq_opaque;

        g_assert(vfu_ctx);

        vfu_irq_trigger(vfu_ctx, 0);
    }
}

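/*
 * MSI/MSI-X rerouting: the prepare_message hook encodes the vector number
 * in msg.data, and the trigger hook forwards that vector to the vfio-user
 * client via vfu_irq_trigger() instead of performing a local MSI write.
 */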
static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
                                             unsigned int vector)
{
    MSIMessage msg;

    msg.address = 0;
    msg.data = vector;

    return msg;
}

static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
{
    vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque;

    vfu_irq_trigger(vfu_ctx, msg.data);
}

static void vfu_object_setup_msi_cbs(VfuObject *o)
{
    o->default_msi_trigger = o->pci_dev->msi_trigger;
    o->default_msi_prepare_message = o->pci_dev->msi_prepare_message;
    o->default_msix_prepare_message = o->pci_dev->msix_prepare_message;

    o->pci_dev->msi_trigger = vfu_object_msi_trigger;
    o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg;
    o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg;
}

static void vfu_object_restore_msi_cbs(VfuObject *o)
{
    o->pci_dev->msi_trigger = o->default_msi_trigger;
    o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
    o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
}

static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
                               uint32_t count, bool mask)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    uint32_t vector;

    for (vector = start; vector < count; vector++) {
        msix_set_mask(o->pci_dev, vector, mask);
    }
}

static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
                              uint32_t count, bool mask)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    Error *err = NULL;
    uint32_t vector;

    for (vector = start; vector < count; vector++) {
        msi_set_mask(o->pci_dev, vector, mask, &err);
        if (err) {
            VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
                             error_get_pretty(err));
            error_free(err);
            err = NULL;
        }
    }
}

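/*
 * Advertise the device's interrupts to the client: one INTx IRQ plus the
 * MSI-X or MSI vectors the device has allocated, along with mask/unmask
 * state callbacks. Also redirects the device's MSI callbacks to the
 * vfu_object_* handlers above.
 */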
static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
{
    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
    int ret;

    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
    if (ret < 0) {
        return ret;
    }

    if (msix_nr_vectors_allocated(pci_dev)) {
        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
                                       msix_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
                                     &vfu_msix_irq_state);
    } else if (msi_nr_vectors_allocated(pci_dev)) {
        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
                                       msi_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
                                     &vfu_msi_irq_state);
    }

    if (ret < 0) {
        return ret;
    }

    vfu_object_setup_msi_cbs(o);

    pci_dev->irq_opaque = vfu_ctx;

    return 0;
}

void vfu_object_set_bus_irq(PCIBus *pci_bus)
{
    int bus_num = pci_bus_num(pci_bus);
    int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1);

    pci_bus_irqs(pci_bus, vfu_object_set_irq, pci_bus, max_bdf);
    pci_bus_map_irqs(pci_bus, vfu_object_map_irq);
}

static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
{
    VfuObject *o = vfu_get_private(vfu_ctx);

    /* vfu_object_ctx_run() handles lost connection */
    if (type == VFU_RESET_LOST_CONN) {
        return 0;
    }

    device_cold_reset(DEVICE(o->pci_dev));

    return 0;
}

/*
 * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
 * properties. It also depends on devices instantiated in QEMU. These
 * dependencies are not available during the instance_init phase of this
 * object's life-cycle. As such, the server is initialized after the
 * machine is set up. machine_init_done_notifier notifies TYPE_VFU_OBJECT
 * when the machine is set up and the dependencies are available.
 */
static void vfu_object_machine_done(Notifier *notifier, void *data)
{
    VfuObject *o = container_of(notifier, VfuObject, machine_done);
    Error *err = NULL;

    vfu_object_init_ctx(o, &err);

    if (err) {
        error_propagate(&error_abort, err);
    }
}

/**
 * vfu_object_init_ctx: Create and initialize the libvfio-user context. Add
 *     an unplug blocker for the associated PCI device. Set up an FD handler
 *     to process incoming messages on the context's socket.
 *
 *     The socket and device properties are mandatory, and this function
 *     will not create the context without them - the setters for these
 *     properties should call this function when the property is set. The
 *     machine should also be ready when this function is invoked - this is
 *     because QEMU objects are initialized before devices, and the
 *     associated PCI device wouldn't be available at object initialization
 *     time. Until these conditions are satisfied, this function returns
 *     early without performing any task.
 */
static void vfu_object_init_ctx(VfuObject *o, Error **errp)
{
    DeviceState *dev = NULL;
    vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
    int ret;

    if (o->vfu_ctx || !o->socket || !o->device ||
            !phase_check(PHASE_MACHINE_READY)) {
        return;
    }

    if (o->err) {
        error_propagate(errp, o->err);
        o->err = NULL;
        return;
    }

    o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
                                LIBVFIO_USER_FLAG_ATTACH_NB,
                                o, VFU_DEV_TYPE_PCI);
    if (o->vfu_ctx == NULL) {
        error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
        return;
    }

    dev = qdev_find_recursive(sysbus_get_default(), o->device);
    if (dev == NULL) {
        error_setg(errp, "vfu: Device %s not found", o->device);
        goto fail;
    }

    if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        error_setg(errp, "vfu: %s not a PCI device", o->device);
        goto fail;
    }

    o->pci_dev = PCI_DEVICE(dev);

    object_ref(OBJECT(o->pci_dev));

    if (pci_is_express(o->pci_dev)) {
        pci_type = VFU_PCI_TYPE_EXPRESS;
    }

    ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
    if (ret < 0) {
        error_setg(errp,
                   "vfu: Failed to attach PCI device %s to context - %s",
                   o->device, strerror(errno));
        goto fail;
    }

    error_setg(&o->unplug_blocker,
               "vfu: %s for %s must be deleted before unplugging",
               TYPE_VFU_OBJECT, o->device);
    qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);

    ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
                           pci_config_size(o->pci_dev), &vfu_object_cfg_access,
                           VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
                           NULL, 0, -1, 0);
    if (ret < 0) {
        error_setg(errp,
                   "vfu: Failed to setup config space handlers for %s - %s",
                   o->device, strerror(errno));
        goto fail;
    }

    ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
                   o->device);
        goto fail;
    }

    vfu_object_register_bars(o->vfu_ctx, o->pci_dev);

    ret = vfu_object_setup_irqs(o, o->pci_dev);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup interrupts for %s",
                   o->device);
        goto fail;
    }

    ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup reset callback");
        goto fail;
    }

    ret = vfu_realize_ctx(o->vfu_ctx);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to realize device %s - %s",
                   o->device, strerror(errno));
        goto fail;
    }

    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
    if (o->vfu_poll_fd < 0) {
        error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
        goto fail;
    }

    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);

    return;

fail:
    vfu_destroy_ctx(o->vfu_ctx);
    if (o->unplug_blocker && o->pci_dev) {
        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
        error_free(o->unplug_blocker);
        o->unplug_blocker = NULL;
    }
    if (o->pci_dev) {
        vfu_object_restore_msi_cbs(o);
        o->pci_dev->irq_opaque = NULL;
        object_unref(OBJECT(o->pci_dev));
        o->pci_dev = NULL;
    }
    o->vfu_ctx = NULL;
}

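/*
 * instance_init: account for the new server object, check that we are
 * running on the x-remote machine, and, if the machine isn't ready yet,
 * defer context creation to a machine-init-done notifier.
 */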
static void vfu_object_init(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs++;

    if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
        error_setg(&o->err, "vfu: %s only compatible with %s machine",
                   TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
        return;
    }

    if (!phase_check(PHASE_MACHINE_READY)) {
        o->machine_done.notify = vfu_object_machine_done;
        qemu_add_machine_init_done_notifier(&o->machine_done);
    }

    o->vfu_poll_fd = -1;
}

static void vfu_object_finalize(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs--;

    qapi_free_SocketAddress(o->socket);

    o->socket = NULL;

    if (o->vfu_poll_fd != -1) {
        qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
        o->vfu_poll_fd = -1;
    }

    if (o->vfu_ctx) {
        vfu_destroy_ctx(o->vfu_ctx);
        o->vfu_ctx = NULL;
    }

    g_free(o->device);

    o->device = NULL;

    if (o->unplug_blocker && o->pci_dev) {
        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
        error_free(o->unplug_blocker);
        o->unplug_blocker = NULL;
    }

    if (o->pci_dev) {
        vfu_object_restore_msi_cbs(o);
        o->pci_dev->irq_opaque = NULL;
        object_unref(OBJECT(o->pci_dev));
        o->pci_dev = NULL;
    }

    if (!k->nr_devs && vfu_object_auto_shutdown()) {
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
    }

    if (o->machine_done.notify) {
        qemu_remove_machine_init_done_notifier(&o->machine_done);
        o->machine_done.notify = NULL;
    }
}

static void vfu_object_class_init(ObjectClass *klass, void *data)
{
    VfuObjectClass *k = VFU_OBJECT_CLASS(klass);

    k->nr_devs = 0;

    object_class_property_add(klass, "socket", "SocketAddress", NULL,
                              vfu_object_set_socket, NULL, NULL);
    object_class_property_set_description(klass, "socket",
                                          "SocketAddress "
                                          "(ex: type=unix,path=/tmp/sock). "
                                          "Only UNIX is presently supported");
    object_class_property_add_str(klass, "device", NULL,
                                  vfu_object_set_device);
    object_class_property_set_description(klass, "device",
                                          "device ID - only PCI devices "
                                          "are presently supported");
}

static const TypeInfo vfu_object_info = {
    .name = TYPE_VFU_OBJECT,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(VfuObject),
    .instance_init = vfu_object_init,
    .instance_finalize = vfu_object_finalize,
    .class_size = sizeof(VfuObjectClass),
    .class_init = vfu_object_class_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};

static void vfu_register_types(void)
{
    type_register_static(&vfu_object_info);
}

type_init(vfu_register_types);