qemu/hw/vfio/pci.c
   1/*
   2 * vfio based device assignment support
   3 *
   4 * Copyright Red Hat, Inc. 2012
   5 *
   6 * Authors:
   7 *  Alex Williamson <alex.williamson@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 *
  12 * Based on qemu-kvm device-assignment:
  13 *  Adapted for KVM by Qumranet.
  14 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
  15 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
  16 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
  17 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
  18 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
  19 */
  20
  21#include "qemu/osdep.h"
  22#include <linux/vfio.h>
  23#include <sys/ioctl.h>
  24
  25#include "hw/hw.h"
  26#include "hw/pci/msi.h"
  27#include "hw/pci/msix.h"
  28#include "hw/pci/pci_bridge.h"
  29#include "hw/qdev-properties.h"
  30#include "migration/vmstate.h"
  31#include "qemu/error-report.h"
  32#include "qemu/main-loop.h"
  33#include "qemu/module.h"
  34#include "qemu/option.h"
  35#include "qemu/range.h"
  36#include "qemu/units.h"
  37#include "sysemu/kvm.h"
  38#include "sysemu/runstate.h"
  39#include "sysemu/sysemu.h"
  40#include "pci.h"
  41#include "trace.h"
  42#include "qapi/error.h"
  43#include "migration/blocker.h"
  44
  45#define TYPE_VFIO_PCI "vfio-pci"
  46#define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
  47
  48#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
  49
  50static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
  51static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
  52
  53/*
   54 * Disabling BAR mmapping can be slow, but toggling it around INTx can
  55 * also be a huge overhead.  We try to get the best of both worlds by
  56 * waiting until an interrupt to disable mmaps (subsequent transitions
  57 * to the same state are effectively no overhead).  If the interrupt has
  58 * been serviced and the time gap is long enough, we re-enable mmaps for
  59 * performance.  This works well for things like graphics cards, which
  60 * may not use their interrupt at all and are penalized to an unusable
  61 * level by read/write BAR traps.  Other devices, like NICs, have more
  62 * regular interrupts and see much better latency by staying in non-mmap
  63 * mode.  We therefore set the default mmap_timeout such that a ping
  64 * is just enough to keep the mmap disabled.  Users can experiment with
  65 * other options with the x-intx-mmap-timeout-ms parameter (a value of
  66 * zero disables the timer).
  67 */
  68static void vfio_intx_mmap_enable(void *opaque)
  69{
  70    VFIOPCIDevice *vdev = opaque;
  71
  72    if (vdev->intx.pending) {
  73        timer_mod(vdev->intx.mmap_timer,
  74                       qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
  75        return;
  76    }
  77
  78    vfio_mmap_set_enabled(vdev, true);
  79}
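/*
 * Usage sketch (hypothetical host address): the timer described above can
 * be tuned or disabled per device on the QEMU command line:
 *
 *   -device vfio-pci,host=0000:01:00.0,x-intx-mmap-timeout-ms=0
 *
 * With the timer disabled, BAR mmaps stay off from the first INTx until
 * INTx itself is torn down (vfio_intx_disable() re-enables them).
 */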
  80
  81static void vfio_intx_interrupt(void *opaque)
  82{
  83    VFIOPCIDevice *vdev = opaque;
  84
  85    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
  86        return;
  87    }
  88
  89    trace_vfio_intx_interrupt(vdev->vbasedev.name, 'A' + vdev->intx.pin);
  90
  91    vdev->intx.pending = true;
  92    pci_irq_assert(&vdev->pdev);
  93    vfio_mmap_set_enabled(vdev, false);
  94    if (vdev->intx.mmap_timeout) {
  95        timer_mod(vdev->intx.mmap_timer,
  96                       qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->intx.mmap_timeout);
  97    }
  98}
  99
 100static void vfio_intx_eoi(VFIODevice *vbasedev)
 101{
 102    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
 103
 104    if (!vdev->intx.pending) {
 105        return;
 106    }
 107
 108    trace_vfio_intx_eoi(vbasedev->name);
 109
 110    vdev->intx.pending = false;
 111    pci_irq_deassert(&vdev->pdev);
 112    vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 113}
 114
 115static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
 116{
 117#ifdef CONFIG_KVM
 118    struct kvm_irqfd irqfd = {
 119        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 120        .gsi = vdev->intx.route.irq,
 121        .flags = KVM_IRQFD_FLAG_RESAMPLE,
 122    };
 123    Error *err = NULL;
 124
 125    if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
 126        vdev->intx.route.mode != PCI_INTX_ENABLED ||
 127        !kvm_resamplefds_enabled()) {
 128        return;
 129    }
 130
 131    /* Get to a known interrupt state */
 132    qemu_set_fd_handler(irqfd.fd, NULL, NULL, vdev);
 133    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 134    vdev->intx.pending = false;
 135    pci_irq_deassert(&vdev->pdev);
 136
 137    /* Get an eventfd for resample/unmask */
 138    if (event_notifier_init(&vdev->intx.unmask, 0)) {
 139        error_setg(errp, "event_notifier_init failed eoi");
 140        goto fail;
 141    }
 142
 143    /* KVM triggers it, VFIO listens for it */
 144    irqfd.resamplefd = event_notifier_get_fd(&vdev->intx.unmask);
 145
 146    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 147        error_setg_errno(errp, errno, "failed to setup resample irqfd");
 148        goto fail_irqfd;
 149    }
 150
 151    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
 152                               VFIO_IRQ_SET_ACTION_UNMASK,
 153                               irqfd.resamplefd, &err)) {
 154        error_propagate(errp, err);
 155        goto fail_vfio;
 156    }
 157
 158    /* Let'em rip */
 159    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 160
 161    vdev->intx.kvm_accel = true;
 162
 163    trace_vfio_intx_enable_kvm(vdev->vbasedev.name);
 164
 165    return;
 166
 167fail_vfio:
 168    irqfd.flags = KVM_IRQFD_FLAG_DEASSIGN;
 169    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
 170fail_irqfd:
 171    event_notifier_cleanup(&vdev->intx.unmask);
 172fail:
 173    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 174    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 175#endif
 176}
 177
 178static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev)
 179{
 180#ifdef CONFIG_KVM
 181    struct kvm_irqfd irqfd = {
 182        .fd = event_notifier_get_fd(&vdev->intx.interrupt),
 183        .gsi = vdev->intx.route.irq,
 184        .flags = KVM_IRQFD_FLAG_DEASSIGN,
 185    };
 186
 187    if (!vdev->intx.kvm_accel) {
 188        return;
 189    }
 190
 191    /*
 192     * Get to a known state, hardware masked, QEMU ready to accept new
 193     * interrupts, QEMU IRQ de-asserted.
 194     */
 195    vfio_mask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 196    vdev->intx.pending = false;
 197    pci_irq_deassert(&vdev->pdev);
 198
 199    /* Tell KVM to stop listening for an INTx irqfd */
 200    if (kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd)) {
 201        error_report("vfio: Error: Failed to disable INTx irqfd: %m");
 202    }
 203
  204    /* Closing the eventfd is enough for VFIO to clean up the kernel side */
 205    event_notifier_cleanup(&vdev->intx.unmask);
 206
 207    /* QEMU starts listening for interrupt events. */
 208    qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
 209
 210    vdev->intx.kvm_accel = false;
 211
 212    /* If we've missed an event, let it re-fire through QEMU */
 213    vfio_unmask_single_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 214
 215    trace_vfio_intx_disable_kvm(vdev->vbasedev.name);
 216#endif
 217}
 218
 219static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route)
 220{
 221    Error *err = NULL;
 222
 223    trace_vfio_intx_update(vdev->vbasedev.name,
 224                           vdev->intx.route.irq, route->irq);
 225
 226    vfio_intx_disable_kvm(vdev);
 227
 228    vdev->intx.route = *route;
 229
 230    if (route->mode != PCI_INTX_ENABLED) {
 231        return;
 232    }
 233
 234    vfio_intx_enable_kvm(vdev, &err);
 235    if (err) {
 236        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 237    }
 238
  239    /* Re-enable the interrupt in case we missed an EOI */
 240    vfio_intx_eoi(&vdev->vbasedev);
 241}
 242
 243static void vfio_intx_routing_notifier(PCIDevice *pdev)
 244{
 245    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
 246    PCIINTxRoute route;
 247
 248    if (vdev->interrupt != VFIO_INT_INTx) {
 249        return;
 250    }
 251
 252    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
 253
 254    if (pci_intx_route_changed(&vdev->intx.route, &route)) {
 255        vfio_intx_update(vdev, &route);
 256    }
 257}
 258
 259static void vfio_irqchip_change(Notifier *notify, void *data)
 260{
 261    VFIOPCIDevice *vdev = container_of(notify, VFIOPCIDevice,
 262                                       irqchip_change_notifier);
 263
 264    vfio_intx_update(vdev, &vdev->intx.route);
 265}
 266
 267static int vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp)
 268{
 269    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
 270    Error *err = NULL;
 271    int32_t fd;
 272    int ret;
 273
 274
 275    if (!pin) {
 276        return 0;
 277    }
 278
 279    vfio_disable_interrupts(vdev);
 280
 281    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 282    pci_config_set_interrupt_pin(vdev->pdev.config, pin);
 283
 284#ifdef CONFIG_KVM
 285    /*
 286     * Only conditional to avoid generating error messages on platforms
 287     * where we won't actually use the result anyway.
 288     */
 289    if (kvm_irqfds_enabled() && kvm_resamplefds_enabled()) {
 290        vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
 291                                                        vdev->intx.pin);
 292    }
 293#endif
 294
 295    ret = event_notifier_init(&vdev->intx.interrupt, 0);
 296    if (ret) {
 297        error_setg_errno(errp, -ret, "event_notifier_init failed");
 298        return ret;
 299    }
 300    fd = event_notifier_get_fd(&vdev->intx.interrupt);
 301    qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
 302
 303    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
 304                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
 305        error_propagate(errp, err);
 306        qemu_set_fd_handler(fd, NULL, NULL, vdev);
 307        event_notifier_cleanup(&vdev->intx.interrupt);
 308        return -errno;
 309    }
 310
 311    vfio_intx_enable_kvm(vdev, &err);
 312    if (err) {
 313        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 314    }
 315
 316    vdev->interrupt = VFIO_INT_INTx;
 317
 318    trace_vfio_intx_enable(vdev->vbasedev.name);
 319    return 0;
 320}
 321
 322static void vfio_intx_disable(VFIOPCIDevice *vdev)
 323{
 324    int fd;
 325
 326    timer_del(vdev->intx.mmap_timer);
 327    vfio_intx_disable_kvm(vdev);
 328    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 329    vdev->intx.pending = false;
 330    pci_irq_deassert(&vdev->pdev);
 331    vfio_mmap_set_enabled(vdev, true);
 332
 333    fd = event_notifier_get_fd(&vdev->intx.interrupt);
 334    qemu_set_fd_handler(fd, NULL, NULL, vdev);
 335    event_notifier_cleanup(&vdev->intx.interrupt);
 336
 337    vdev->interrupt = VFIO_INT_NONE;
 338
 339    trace_vfio_intx_disable(vdev->vbasedev.name);
 340}
 341
 342/*
 343 * MSI/X
 344 */
 345static void vfio_msi_interrupt(void *opaque)
 346{
 347    VFIOMSIVector *vector = opaque;
 348    VFIOPCIDevice *vdev = vector->vdev;
 349    MSIMessage (*get_msg)(PCIDevice *dev, unsigned vector);
 350    void (*notify)(PCIDevice *dev, unsigned vector);
 351    MSIMessage msg;
 352    int nr = vector - vdev->msi_vectors;
 353
 354    if (!event_notifier_test_and_clear(&vector->interrupt)) {
 355        return;
 356    }
 357
 358    if (vdev->interrupt == VFIO_INT_MSIX) {
 359        get_msg = msix_get_message;
 360        notify = msix_notify;
 361
 362        /* A masked vector firing needs to use the PBA, enable it */
 363        if (msix_is_masked(&vdev->pdev, nr)) {
 364            set_bit(nr, vdev->msix->pending);
 365            memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, true);
 366            trace_vfio_msix_pba_enable(vdev->vbasedev.name);
 367        }
 368    } else if (vdev->interrupt == VFIO_INT_MSI) {
 369        get_msg = msi_get_message;
 370        notify = msi_notify;
 371    } else {
 372        abort();
 373    }
 374
 375    msg = get_msg(&vdev->pdev, nr);
 376    trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data);
 377    notify(&vdev->pdev, nr);
 378}
 379
 380static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 381{
 382    struct vfio_irq_set *irq_set;
 383    int ret = 0, i, argsz;
 384    int32_t *fds;
 385
 386    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 387
 388    irq_set = g_malloc0(argsz);
 389    irq_set->argsz = argsz;
 390    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 391    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
 392    irq_set->start = 0;
 393    irq_set->count = vdev->nr_vectors;
 394    fds = (int32_t *)&irq_set->data;
 395
 396    for (i = 0; i < vdev->nr_vectors; i++) {
 397        int fd = -1;
 398
 399        /*
  400         * MSI vs MSI-X - The guest has direct access to MSI mask and pending
  401         * bits, therefore we always use the KVM signaling path once set up.
 402         * MSI-X mask and pending bits are emulated, so we want to use the
 403         * KVM signaling path only when configured and unmasked.
 404         */
 405        if (vdev->msi_vectors[i].use) {
 406            if (vdev->msi_vectors[i].virq < 0 ||
 407                (msix && msix_is_masked(&vdev->pdev, i))) {
 408                fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 409            } else {
 410                fd = event_notifier_get_fd(&vdev->msi_vectors[i].kvm_interrupt);
 411            }
 412        }
 413
 414        fds[i] = fd;
 415    }
 416
 417    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
 418
 419    g_free(irq_set);
 420
 421    return ret;
 422}
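
/*
 * A minimal sketch of the VFIO_DEVICE_SET_IRQS layout built above: the
 * eventfds travel inline after the header, so signaling a single MSI-X
 * vector <nr> through <fd> would look like (error handling omitted):
 *
 *     struct vfio_irq_set *set = g_malloc0(sizeof(*set) + sizeof(int32_t));
 *
 *     set->argsz = sizeof(*set) + sizeof(int32_t);
 *     set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 *     set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 *     set->start = nr;
 *     set->count = 1;
 *     memcpy(&set->data, &fd, sizeof(fd));
 *     ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, set);
 *     g_free(set);
 *
 * Per the VFIO UAPI, an fd of -1 in the array de-assigns that vector.
 */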
 423
 424static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
 425                                  int vector_n, bool msix)
 426{
 427    int virq;
 428
 429    if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
 430        return;
 431    }
 432
 433    if (event_notifier_init(&vector->kvm_interrupt, 0)) {
 434        return;
 435    }
 436
 437    virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
 438    if (virq < 0) {
 439        event_notifier_cleanup(&vector->kvm_interrupt);
 440        return;
 441    }
 442
 443    if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 444                                       NULL, virq) < 0) {
 445        kvm_irqchip_release_virq(kvm_state, virq);
 446        event_notifier_cleanup(&vector->kvm_interrupt);
 447        return;
 448    }
 449
 450    vector->virq = virq;
 451}
 452
 453static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
 454{
 455    kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
 456                                          vector->virq);
 457    kvm_irqchip_release_virq(kvm_state, vector->virq);
 458    vector->virq = -1;
 459    event_notifier_cleanup(&vector->kvm_interrupt);
 460}
 461
 462static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg,
 463                                     PCIDevice *pdev)
 464{
 465    kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg, pdev);
 466    kvm_irqchip_commit_routes(kvm_state);
 467}
 468
 469static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
 470                                   MSIMessage *msg, IOHandler *handler)
 471{
 472    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
 473    VFIOMSIVector *vector;
 474    int ret;
 475
 476    trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
 477
 478    vector = &vdev->msi_vectors[nr];
 479
 480    if (!vector->use) {
 481        vector->vdev = vdev;
 482        vector->virq = -1;
 483        if (event_notifier_init(&vector->interrupt, 0)) {
 484            error_report("vfio: Error: event_notifier_init failed");
 485        }
 486        vector->use = true;
 487        msix_vector_use(pdev, nr);
 488    }
 489
 490    qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 491                        handler, NULL, vector);
 492
 493    /*
 494     * Attempt to enable route through KVM irqchip,
 495     * default to userspace handling if unavailable.
 496     */
 497    if (vector->virq >= 0) {
 498        if (!msg) {
 499            vfio_remove_kvm_msi_virq(vector);
 500        } else {
 501            vfio_update_kvm_msi_virq(vector, *msg, pdev);
 502        }
 503    } else {
 504        if (msg) {
 505            vfio_add_kvm_msi_virq(vdev, vector, nr, true);
 506        }
 507    }
 508
 509    /*
 510     * We don't want to have the host allocate all possible MSI vectors
  511     * for a device if they're not in use, so we tear the index down and
  512     * incrementally re-enable it with a larger vector count as needed.
 513     */
 514    if (vdev->nr_vectors < nr + 1) {
 515        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 516        vdev->nr_vectors = nr + 1;
 517        ret = vfio_enable_vectors(vdev, true);
 518        if (ret) {
 519            error_report("vfio: failed to enable vectors, %d", ret);
 520        }
 521    } else {
 522        Error *err = NULL;
 523        int32_t fd;
 524
 525        if (vector->virq >= 0) {
 526            fd = event_notifier_get_fd(&vector->kvm_interrupt);
 527        } else {
 528            fd = event_notifier_get_fd(&vector->interrupt);
 529        }
 530
 531        if (vfio_set_irq_signaling(&vdev->vbasedev,
 532                                     VFIO_PCI_MSIX_IRQ_INDEX, nr,
 533                                     VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
 534            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 535        }
 536    }
 537
 538    /* Disable PBA emulation when nothing more is pending. */
 539    clear_bit(nr, vdev->msix->pending);
 540    if (find_first_bit(vdev->msix->pending,
 541                       vdev->nr_vectors) == vdev->nr_vectors) {
 542        memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
 543        trace_vfio_msix_pba_disable(vdev->vbasedev.name);
 544    }
 545
 546    return 0;
 547}
 548
 549static int vfio_msix_vector_use(PCIDevice *pdev,
 550                                unsigned int nr, MSIMessage msg)
 551{
 552    return vfio_msix_vector_do_use(pdev, nr, &msg, vfio_msi_interrupt);
 553}
 554
 555static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
 556{
 557    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
 558    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
 559
 560    trace_vfio_msix_vector_release(vdev->vbasedev.name, nr);
 561
 562    /*
 563     * There are still old guests that mask and unmask vectors on every
 564     * interrupt.  If we're using QEMU bypass with a KVM irqfd, leave all of
 565     * the KVM setup in place, simply switch VFIO to use the non-bypass
 566     * eventfd.  We'll then fire the interrupt through QEMU and the MSI-X
 567     * core will mask the interrupt and set pending bits, allowing it to
 568     * be re-asserted on unmask.  Nothing to do if already using QEMU mode.
 569     */
 570    if (vector->virq >= 0) {
 571        int32_t fd = event_notifier_get_fd(&vector->interrupt);
 572        Error *err = NULL;
 573
 574        if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
 575                                   VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
 576            error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 577        }
 578    }
 579}
 580
 581static void vfio_msix_enable(VFIOPCIDevice *vdev)
 582{
 583    vfio_disable_interrupts(vdev);
 584
 585    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
 586
 587    vdev->interrupt = VFIO_INT_MSIX;
 588
 589    /*
 590     * Some communication channels between VF & PF or PF & fw rely on the
 591     * physical state of the device and expect that enabling MSI-X from the
 592     * guest enables the same on the host.  When our guest is Linux, the
 593     * guest driver call to pci_enable_msix() sets the enabling bit in the
 594     * MSI-X capability, but leaves the vector table masked.  We therefore
 595     * can't rely on a vector_use callback (from request_irq() in the guest)
 596     * to switch the physical device into MSI-X mode because that may come a
 597     * long time after pci_enable_msix().  This code enables vector 0 with
  598     * triggering to userspace, then immediately releases the vector, leaving
 599     * the physical device with no vectors enabled, but MSI-X enabled, just
 600     * like the guest view.
 601     */
 602    vfio_msix_vector_do_use(&vdev->pdev, 0, NULL, NULL);
 603    vfio_msix_vector_release(&vdev->pdev, 0);
 604
 605    if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
 606                                  vfio_msix_vector_release, NULL)) {
 607        error_report("vfio: msix_set_vector_notifiers failed");
 608    }
 609
 610    trace_vfio_msix_enable(vdev->vbasedev.name);
 611}
 612
 613static void vfio_msi_enable(VFIOPCIDevice *vdev)
 614{
 615    int ret, i;
 616
 617    vfio_disable_interrupts(vdev);
 618
 619    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 620retry:
 621    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
 622
 623    for (i = 0; i < vdev->nr_vectors; i++) {
 624        VFIOMSIVector *vector = &vdev->msi_vectors[i];
 625
 626        vector->vdev = vdev;
 627        vector->virq = -1;
 628        vector->use = true;
 629
 630        if (event_notifier_init(&vector->interrupt, 0)) {
 631            error_report("vfio: Error: event_notifier_init failed");
 632        }
 633
 634        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 635                            vfio_msi_interrupt, NULL, vector);
 636
 637        /*
 638         * Attempt to enable route through KVM irqchip,
 639         * default to userspace handling if unavailable.
 640         */
 641        vfio_add_kvm_msi_virq(vdev, vector, i, false);
 642    }
 643
 644    /* Set interrupt type prior to possible interrupts */
 645    vdev->interrupt = VFIO_INT_MSI;
 646
 647    ret = vfio_enable_vectors(vdev, false);
 648    if (ret) {
 649        if (ret < 0) {
 650            error_report("vfio: Error: Failed to setup MSI fds: %m");
 651        } else if (ret != vdev->nr_vectors) {
 652            error_report("vfio: Error: Failed to enable %d "
 653                         "MSI vectors, retry with %d", vdev->nr_vectors, ret);
 654        }
 655
 656        for (i = 0; i < vdev->nr_vectors; i++) {
 657            VFIOMSIVector *vector = &vdev->msi_vectors[i];
 658            if (vector->virq >= 0) {
 659                vfio_remove_kvm_msi_virq(vector);
 660            }
 661            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 662                                NULL, NULL, NULL);
 663            event_notifier_cleanup(&vector->interrupt);
 664        }
 665
 666        g_free(vdev->msi_vectors);
 667        vdev->msi_vectors = NULL;
 668
 669        if (ret > 0 && ret != vdev->nr_vectors) {
 670            vdev->nr_vectors = ret;
 671            goto retry;
 672        }
 673        vdev->nr_vectors = 0;
 674
 675        /*
  676         * Failing to set up MSI doesn't really fall within any specification.
  677         * Let's try leaving interrupts disabled and hope the guest figures
  678         * out how to fall back to INTx for this device.
 679         */
 680        error_report("vfio: Error: Failed to enable MSI");
 681        vdev->interrupt = VFIO_INT_NONE;
 682
 683        return;
 684    }
 685
 686    trace_vfio_msi_enable(vdev->vbasedev.name, vdev->nr_vectors);
 687}
 688
 689static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
 690{
 691    Error *err = NULL;
 692    int i;
 693
 694    for (i = 0; i < vdev->nr_vectors; i++) {
 695        VFIOMSIVector *vector = &vdev->msi_vectors[i];
 696        if (vdev->msi_vectors[i].use) {
 697            if (vector->virq >= 0) {
 698                vfio_remove_kvm_msi_virq(vector);
 699            }
 700            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
 701                                NULL, NULL, NULL);
 702            event_notifier_cleanup(&vector->interrupt);
 703        }
 704    }
 705
 706    g_free(vdev->msi_vectors);
 707    vdev->msi_vectors = NULL;
 708    vdev->nr_vectors = 0;
 709    vdev->interrupt = VFIO_INT_NONE;
 710
 711    vfio_intx_enable(vdev, &err);
 712    if (err) {
 713        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 714    }
 715}
 716
 717static void vfio_msix_disable(VFIOPCIDevice *vdev)
 718{
 719    int i;
 720
 721    msix_unset_vector_notifiers(&vdev->pdev);
 722
 723    /*
  724     * MSI-X will only release vectors if MSI-X is still enabled on the
  725     * device; check through the rest and release them ourselves if necessary.
 726     */
 727    for (i = 0; i < vdev->nr_vectors; i++) {
 728        if (vdev->msi_vectors[i].use) {
 729            vfio_msix_vector_release(&vdev->pdev, i);
 730            msix_vector_unuse(&vdev->pdev, i);
 731        }
 732    }
 733
 734    if (vdev->nr_vectors) {
 735        vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 736    }
 737
 738    vfio_msi_disable_common(vdev);
 739
 740    memset(vdev->msix->pending, 0,
 741           BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
 742
 743    trace_vfio_msix_disable(vdev->vbasedev.name);
 744}
 745
 746static void vfio_msi_disable(VFIOPCIDevice *vdev)
 747{
 748    vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
 749    vfio_msi_disable_common(vdev);
 750
 751    trace_vfio_msi_disable(vdev->vbasedev.name);
 752}
 753
 754static void vfio_update_msi(VFIOPCIDevice *vdev)
 755{
 756    int i;
 757
 758    for (i = 0; i < vdev->nr_vectors; i++) {
 759        VFIOMSIVector *vector = &vdev->msi_vectors[i];
 760        MSIMessage msg;
 761
 762        if (!vector->use || vector->virq < 0) {
 763            continue;
 764        }
 765
 766        msg = msi_get_message(&vdev->pdev, i);
 767        vfio_update_kvm_msi_virq(vector, msg, &vdev->pdev);
 768    }
 769}
 770
 771static void vfio_pci_load_rom(VFIOPCIDevice *vdev)
 772{
 773    struct vfio_region_info *reg_info;
 774    uint64_t size;
 775    off_t off = 0;
 776    ssize_t bytes;
 777
 778    if (vfio_get_region_info(&vdev->vbasedev,
 779                             VFIO_PCI_ROM_REGION_INDEX, &reg_info)) {
 780        error_report("vfio: Error getting ROM info: %m");
 781        return;
 782    }
 783
 784    trace_vfio_pci_load_rom(vdev->vbasedev.name, (unsigned long)reg_info->size,
 785                            (unsigned long)reg_info->offset,
 786                            (unsigned long)reg_info->flags);
 787
 788    vdev->rom_size = size = reg_info->size;
 789    vdev->rom_offset = reg_info->offset;
 790
 791    g_free(reg_info);
 792
 793    if (!vdev->rom_size) {
 794        vdev->rom_read_failed = true;
 795        error_report("vfio-pci: Cannot read device rom at "
 796                    "%s", vdev->vbasedev.name);
 797        error_printf("Device option ROM contents are probably invalid "
 798                    "(check dmesg).\nSkip option ROM probe with rombar=0, "
 799                    "or load from file with romfile=\n");
 800        return;
 801    }
 802
 803    vdev->rom = g_malloc(size);
 804    memset(vdev->rom, 0xff, size);
 805
 806    while (size) {
 807        bytes = pread(vdev->vbasedev.fd, vdev->rom + off,
 808                      size, vdev->rom_offset + off);
 809        if (bytes == 0) {
 810            break;
 811        } else if (bytes > 0) {
 812            off += bytes;
 813            size -= bytes;
 814        } else {
 815            if (errno == EINTR || errno == EAGAIN) {
 816                continue;
 817            }
 818            error_report("vfio: Error reading device ROM: %m");
 819            break;
 820        }
 821    }
 822
 823    /*
 824     * Test the ROM signature against our device, if the vendor is correct
 825     * but the device ID doesn't match, store the correct device ID and
 826     * recompute the checksum.  Intel IGD devices need this and are known
 827     * to have bogus checksums so we can't simply adjust the checksum.
 828     */
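    /*
     * The fix-up relies on the option ROM rule that all bytes of the image
     * sum to zero modulo 256; the reserved header byte at offset 6 is used
     * as the pad.  For example, if the patched image sums to 0x37 with
     * data[6] cleared, the new checksum byte is (uint8_t)-0x37 == 0xc9.
     */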
 829    if (pci_get_word(vdev->rom) == 0xaa55 &&
 830        pci_get_word(vdev->rom + 0x18) + 8 < vdev->rom_size &&
 831        !memcmp(vdev->rom + pci_get_word(vdev->rom + 0x18), "PCIR", 4)) {
 832        uint16_t vid, did;
 833
 834        vid = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 4);
 835        did = pci_get_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6);
 836
 837        if (vid == vdev->vendor_id && did != vdev->device_id) {
 838            int i;
 839            uint8_t csum, *data = vdev->rom;
 840
 841            pci_set_word(vdev->rom + pci_get_word(vdev->rom + 0x18) + 6,
 842                         vdev->device_id);
 843            data[6] = 0;
 844
 845            for (csum = 0, i = 0; i < vdev->rom_size; i++) {
 846                csum += data[i];
 847            }
 848
 849            data[6] = -csum;
 850        }
 851    }
 852}
 853
 854static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size)
 855{
 856    VFIOPCIDevice *vdev = opaque;
 857    union {
 858        uint8_t byte;
 859        uint16_t word;
 860        uint32_t dword;
 861        uint64_t qword;
  862    } val = { 0 }; /* zero fill for reads beyond the ROM size */
 863    uint64_t data = 0;
 864
 865    /* Load the ROM lazily when the guest tries to read it */
 866    if (unlikely(!vdev->rom && !vdev->rom_read_failed)) {
 867        vfio_pci_load_rom(vdev);
 868    }
 869
 870    memcpy(&val, vdev->rom + addr,
 871           (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0);
 872
 873    switch (size) {
 874    case 1:
 875        data = val.byte;
 876        break;
 877    case 2:
 878        data = le16_to_cpu(val.word);
 879        break;
 880    case 4:
 881        data = le32_to_cpu(val.dword);
 882        break;
 883    default:
 884        hw_error("vfio: unsupported read size, %d bytes\n", size);
 885        break;
 886    }
 887
 888    trace_vfio_rom_read(vdev->vbasedev.name, addr, size, data);
 889
 890    return data;
 891}
 892
 893static void vfio_rom_write(void *opaque, hwaddr addr,
 894                           uint64_t data, unsigned size)
 895{
 896}
 897
 898static const MemoryRegionOps vfio_rom_ops = {
 899    .read = vfio_rom_read,
 900    .write = vfio_rom_write,
 901    .endianness = DEVICE_LITTLE_ENDIAN,
 902};
 903
 904static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
 905{
 906    uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK);
 907    off_t offset = vdev->config_offset + PCI_ROM_ADDRESS;
 908    DeviceState *dev = DEVICE(vdev);
 909    char *name;
 910    int fd = vdev->vbasedev.fd;
 911
 912    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
 913        /* Since pci handles romfile, just print a message and return */
 914        if (vfio_blacklist_opt_rom(vdev) && vdev->pdev.romfile) {
 915            warn_report("Device at %s is known to cause system instability"
 916                        " issues during option rom execution",
 917                        vdev->vbasedev.name);
 918            error_printf("Proceeding anyway since user specified romfile\n");
 919        }
 920        return;
 921    }
 922
 923    /*
 924     * Use the same size ROM BAR as the physical device.  The contents
 925     * will get filled in later when the guest tries to read it.
 926     */
 927    if (pread(fd, &orig, 4, offset) != 4 ||
 928        pwrite(fd, &size, 4, offset) != 4 ||
 929        pread(fd, &size, 4, offset) != 4 ||
 930        pwrite(fd, &orig, 4, offset) != 4) {
 931        error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name);
 932        return;
 933    }
 934
 935    size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1;
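    /*
     * Worked example: a read-back of 0xfffe0000 after the all-ones write
     * gives ~(0xfffe0000 & PCI_ROM_ADDRESS_MASK) + 1 = 0x20000, i.e. a
     * 128KB ROM BAR.
     */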
 936
 937    if (!size) {
 938        return;
 939    }
 940
 941    if (vfio_blacklist_opt_rom(vdev)) {
 942        if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
 943            warn_report("Device at %s is known to cause system instability"
 944                        " issues during option rom execution",
 945                        vdev->vbasedev.name);
 946            error_printf("Proceeding anyway since user specified"
 947                         " non zero value for rombar\n");
 948        } else {
 949            warn_report("Rom loading for device at %s has been disabled"
 950                        " due to system instability issues",
 951                        vdev->vbasedev.name);
 952            error_printf("Specify rombar=1 or romfile to force\n");
 953            return;
 954        }
 955    }
 956
 957    trace_vfio_pci_size_rom(vdev->vbasedev.name, size);
 958
 959    name = g_strdup_printf("vfio[%s].rom", vdev->vbasedev.name);
 960
 961    memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev),
 962                          &vfio_rom_ops, vdev, name, size);
 963    g_free(name);
 964
 965    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT,
 966                     PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom);
 967
 968    vdev->rom_read_failed = false;
 969}
 970
 971void vfio_vga_write(void *opaque, hwaddr addr,
 972                           uint64_t data, unsigned size)
 973{
 974    VFIOVGARegion *region = opaque;
 975    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
 976    union {
 977        uint8_t byte;
 978        uint16_t word;
 979        uint32_t dword;
 980        uint64_t qword;
 981    } buf;
 982    off_t offset = vga->fd_offset + region->offset + addr;
 983
 984    switch (size) {
 985    case 1:
 986        buf.byte = data;
 987        break;
 988    case 2:
 989        buf.word = cpu_to_le16(data);
 990        break;
 991    case 4:
 992        buf.dword = cpu_to_le32(data);
 993        break;
 994    default:
 995        hw_error("vfio: unsupported write size, %d bytes", size);
 996        break;
 997    }
 998
 999    if (pwrite(vga->fd, &buf, size, offset) != size) {
1000        error_report("%s(,0x%"HWADDR_PRIx", 0x%"PRIx64", %d) failed: %m",
1001                     __func__, region->offset + addr, data, size);
1002    }
1003
1004    trace_vfio_vga_write(region->offset + addr, data, size);
1005}
1006
1007uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
1008{
1009    VFIOVGARegion *region = opaque;
1010    VFIOVGA *vga = container_of(region, VFIOVGA, region[region->nr]);
1011    union {
1012        uint8_t byte;
1013        uint16_t word;
1014        uint32_t dword;
1015        uint64_t qword;
1016    } buf;
1017    uint64_t data = 0;
1018    off_t offset = vga->fd_offset + region->offset + addr;
1019
1020    if (pread(vga->fd, &buf, size, offset) != size) {
1021        error_report("%s(,0x%"HWADDR_PRIx", %d) failed: %m",
1022                     __func__, region->offset + addr, size);
1023        return (uint64_t)-1;
1024    }
1025
1026    switch (size) {
1027    case 1:
1028        data = buf.byte;
1029        break;
1030    case 2:
1031        data = le16_to_cpu(buf.word);
1032        break;
1033    case 4:
1034        data = le32_to_cpu(buf.dword);
1035        break;
1036    default:
1037        hw_error("vfio: unsupported read size, %d bytes", size);
1038        break;
1039    }
1040
1041    trace_vfio_vga_read(region->offset + addr, size, data);
1042
1043    return data;
1044}
1045
1046static const MemoryRegionOps vfio_vga_ops = {
1047    .read = vfio_vga_read,
1048    .write = vfio_vga_write,
1049    .endianness = DEVICE_LITTLE_ENDIAN,
1050};
1051
 1052/*
 1053 * Expand the memory region of a sub-page (size < PAGE_SIZE) MMIO BAR to
 1054 * the host page size if the BAR occupies an exclusive page on the host,
 1055 * so that the BAR can be mmap'd into the guest.  The sub-page BAR may
 1056 * not occupy an exclusive page in the guest, however, so the expanded
 1057 * memory region is given priority zero in case it overlaps BARs that
 1058 * share the same guest page as the sub-page BAR.  We must also restore
 1059 * the size of this sub-page BAR when its guest base address changes and
 1060 * is no longer page aligned.
 1061 */
1062static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
1063{
1064    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
1065    VFIORegion *region = &vdev->bars[bar].region;
1066    MemoryRegion *mmap_mr, *region_mr, *base_mr;
1067    PCIIORegion *r;
1068    pcibus_t bar_addr;
1069    uint64_t size = region->size;
1070
1071    /* Make sure that the whole region is allowed to be mmapped */
1072    if (region->nr_mmaps != 1 || !region->mmaps[0].mmap ||
1073        region->mmaps[0].size != region->size) {
1074        return;
1075    }
1076
1077    r = &pdev->io_regions[bar];
1078    bar_addr = r->addr;
1079    base_mr = vdev->bars[bar].mr;
1080    region_mr = region->mem;
1081    mmap_mr = &region->mmaps[0].mem;
1082
1083    /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
1084    if (bar_addr != PCI_BAR_UNMAPPED &&
1085        !(bar_addr & ~qemu_real_host_page_mask)) {
1086        size = qemu_real_host_page_size;
1087    }
1088
1089    memory_region_transaction_begin();
1090
1091    if (vdev->bars[bar].size < size) {
1092        memory_region_set_size(base_mr, size);
1093    }
1094    memory_region_set_size(region_mr, size);
1095    memory_region_set_size(mmap_mr, size);
1096    if (size != vdev->bars[bar].size && memory_region_is_mapped(base_mr)) {
1097        memory_region_del_subregion(r->address_space, base_mr);
1098        memory_region_add_subregion_overlap(r->address_space,
1099                                            bar_addr, base_mr, 0);
1100    }
1101
1102    memory_region_transaction_commit();
1103}
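
/*
 * Concrete example of the above: with a 4KB host page size, a 0x800-byte
 * BAR that the guest programs to a page-aligned address is grown to 0x1000
 * so that the host mmap can service it; if the guest later moves the BAR
 * to an address that is not page aligned, the sizes revert to 0x800.
 */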
1104
1105/*
1106 * PCI config space
1107 */
1108uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
1109{
1110    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
1111    uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
1112
1113    memcpy(&emu_bits, vdev->emulated_config_bits + addr, len);
1114    emu_bits = le32_to_cpu(emu_bits);
1115
1116    if (emu_bits) {
1117        emu_val = pci_default_read_config(pdev, addr, len);
1118    }
1119
1120    if (~emu_bits & (0xffffffffU >> (32 - len * 8))) {
1121        ssize_t ret;
1122
1123        ret = pread(vdev->vbasedev.fd, &phys_val, len,
1124                    vdev->config_offset + addr);
1125        if (ret != len) {
1126            error_report("%s(%s, 0x%x, 0x%x) failed: %m",
1127                         __func__, vdev->vbasedev.name, addr, len);
1128            return -errno;
1129        }
1130        phys_val = le32_to_cpu(phys_val);
1131    }
1132
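    /*
     * Merge so that emulated bits win where defined: e.g. with emu_bits
     * 0x0000ffff, emu_val 0x1234 and phys_val 0xabcd5678, the guest reads
     * 0xabcd1234.
     */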
1133    val = (emu_val & emu_bits) | (phys_val & ~emu_bits);
1134
1135    trace_vfio_pci_read_config(vdev->vbasedev.name, addr, len, val);
1136
1137    return val;
1138}
1139
1140void vfio_pci_write_config(PCIDevice *pdev,
1141                           uint32_t addr, uint32_t val, int len)
1142{
1143    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
1144    uint32_t val_le = cpu_to_le32(val);
1145
1146    trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len);
1147
1148    /* Write everything to VFIO, let it filter out what we can't write */
1149    if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr)
1150                != len) {
1151        error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m",
1152                     __func__, vdev->vbasedev.name, addr, val, len);
1153    }
1154
1155    /* MSI/MSI-X Enabling/Disabling */
1156    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
1157        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
1158        int is_enabled, was_enabled = msi_enabled(pdev);
1159
1160        pci_default_write_config(pdev, addr, val, len);
1161
1162        is_enabled = msi_enabled(pdev);
1163
1164        if (!was_enabled) {
1165            if (is_enabled) {
1166                vfio_msi_enable(vdev);
1167            }
1168        } else {
1169            if (!is_enabled) {
1170                vfio_msi_disable(vdev);
1171            } else {
1172                vfio_update_msi(vdev);
1173            }
1174        }
1175    } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
1176        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
1177        int is_enabled, was_enabled = msix_enabled(pdev);
1178
1179        pci_default_write_config(pdev, addr, val, len);
1180
1181        is_enabled = msix_enabled(pdev);
1182
1183        if (!was_enabled && is_enabled) {
1184            vfio_msix_enable(vdev);
1185        } else if (was_enabled && !is_enabled) {
1186            vfio_msix_disable(vdev);
1187        }
1188    } else if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_0, 24) ||
1189        range_covers_byte(addr, len, PCI_COMMAND)) {
1190        pcibus_t old_addr[PCI_NUM_REGIONS - 1];
1191        int bar;
1192
1193        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1194            old_addr[bar] = pdev->io_regions[bar].addr;
1195        }
1196
1197        pci_default_write_config(pdev, addr, val, len);
1198
1199        for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
1200            if (old_addr[bar] != pdev->io_regions[bar].addr &&
1201                vdev->bars[bar].region.size > 0 &&
1202                vdev->bars[bar].region.size < qemu_real_host_page_size) {
1203                vfio_sub_page_bar_update_mapping(pdev, bar);
1204            }
1205        }
1206    } else {
1207        /* Write everything to QEMU to keep emulated bits correct */
1208        pci_default_write_config(pdev, addr, val, len);
1209    }
1210}
1211
1212/*
1213 * Interrupt setup
1214 */
1215static void vfio_disable_interrupts(VFIOPCIDevice *vdev)
1216{
1217    /*
1218     * More complicated than it looks.  Disabling MSI/X transitions the
1219     * device to INTx mode (if supported).  Therefore we need to first
 1220     * disable MSI/X and then clean up by disabling INTx.
1221     */
1222    if (vdev->interrupt == VFIO_INT_MSIX) {
1223        vfio_msix_disable(vdev);
1224    } else if (vdev->interrupt == VFIO_INT_MSI) {
1225        vfio_msi_disable(vdev);
1226    }
1227
1228    if (vdev->interrupt == VFIO_INT_INTx) {
1229        vfio_intx_disable(vdev);
1230    }
1231}
1232
1233static int vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1234{
1235    uint16_t ctrl;
1236    bool msi_64bit, msi_maskbit;
1237    int ret, entries;
1238    Error *err = NULL;
1239
1240    if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl),
1241              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
1242        error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS");
1243        return -errno;
1244    }
1245    ctrl = le16_to_cpu(ctrl);
1246
1247    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
1248    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
1249    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
1250
1251    trace_vfio_msi_setup(vdev->vbasedev.name, pos);
1252
1253    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit, &err);
1254    if (ret < 0) {
1255        if (ret == -ENOTSUP) {
1256            return 0;
1257        }
1258        error_propagate_prepend(errp, err, "msi_init failed: ");
1259        return ret;
1260    }
1261    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
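    /*
     * Capability sizes per the MSI spec: 0xa bytes for the basic 32-bit
     * form, +0x4 for 64-bit addressing, +0xa for the mask/pending pair;
     * e.g. a 64-bit capability with per-vector masking spans 0x18 bytes.
     */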
1262
1263    return 0;
1264}
1265
1266static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
1267{
1268    off_t start, end;
1269    VFIORegion *region = &vdev->bars[vdev->msix->table_bar].region;
1270
1271    /*
 1272     * If the host driver allows mapping of the MSI-X data, we are going
 1273     * to map the entire BAR and emulate the MSI-X table on top of it.
1274     */
1275    if (vfio_has_region_cap(&vdev->vbasedev, region->nr,
1276                            VFIO_REGION_INFO_CAP_MSIX_MAPPABLE)) {
1277        return;
1278    }
1279
1280    /*
1281     * We expect to find a single mmap covering the whole BAR, anything else
 1282     * means it's either unsupported or already set up.
1283     */
1284    if (region->nr_mmaps != 1 || region->mmaps[0].offset ||
1285        region->size != region->mmaps[0].size) {
1286        return;
1287    }
1288
1289    /* MSI-X table start and end aligned to host page size */
1290    start = vdev->msix->table_offset & qemu_real_host_page_mask;
1291    end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
1292                               (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
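    /*
     * For example, with 4KB host pages, a table_offset of 0x3800 and 32
     * entries (0x200 bytes of table) give start = 0x3000 and end = 0x4000;
     * only the page actually containing the table is carved out below.
     */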
1293
1294    /*
1295     * Does the MSI-X table cover the beginning of the BAR?  The whole BAR?
1296     * NB - Host page size is necessarily a power of two and so is the PCI
1297     * BAR (not counting EA yet), therefore if we have host page aligned
1298     * @start and @end, then any remainder of the BAR before or after those
1299     * must be at least host page sized and therefore mmap'able.
1300     */
1301    if (!start) {
1302        if (end >= region->size) {
1303            region->nr_mmaps = 0;
1304            g_free(region->mmaps);
1305            region->mmaps = NULL;
1306            trace_vfio_msix_fixup(vdev->vbasedev.name,
1307                                  vdev->msix->table_bar, 0, 0);
1308        } else {
1309            region->mmaps[0].offset = end;
1310            region->mmaps[0].size = region->size - end;
1311            trace_vfio_msix_fixup(vdev->vbasedev.name,
1312                              vdev->msix->table_bar, region->mmaps[0].offset,
1313                              region->mmaps[0].offset + region->mmaps[0].size);
1314        }
1315
1316    /* Maybe it's aligned at the end of the BAR */
1317    } else if (end >= region->size) {
1318        region->mmaps[0].size = start;
1319        trace_vfio_msix_fixup(vdev->vbasedev.name,
1320                              vdev->msix->table_bar, region->mmaps[0].offset,
1321                              region->mmaps[0].offset + region->mmaps[0].size);
1322
1323    /* Otherwise it must split the BAR */
1324    } else {
1325        region->nr_mmaps = 2;
1326        region->mmaps = g_renew(VFIOMmap, region->mmaps, 2);
1327
1328        memcpy(&region->mmaps[1], &region->mmaps[0], sizeof(VFIOMmap));
1329
1330        region->mmaps[0].size = start;
1331        trace_vfio_msix_fixup(vdev->vbasedev.name,
1332                              vdev->msix->table_bar, region->mmaps[0].offset,
1333                              region->mmaps[0].offset + region->mmaps[0].size);
1334
1335        region->mmaps[1].offset = end;
1336        region->mmaps[1].size = region->size - end;
1337        trace_vfio_msix_fixup(vdev->vbasedev.name,
1338                              vdev->msix->table_bar, region->mmaps[1].offset,
1339                              region->mmaps[1].offset + region->mmaps[1].size);
1340    }
1341}
1342
1343static void vfio_pci_relocate_msix(VFIOPCIDevice *vdev, Error **errp)
1344{
1345    int target_bar = -1;
1346    size_t msix_sz;
1347
1348    if (!vdev->msix || vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
1349        return;
1350    }
1351
1352    /* The actual minimum size of MSI-X structures */
1353    msix_sz = (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE) +
1354              (QEMU_ALIGN_UP(vdev->msix->entries, 64) / 8);
1355    /* Round up to host pages, we don't want to share a page */
1356    msix_sz = REAL_HOST_PAGE_ALIGN(msix_sz);
1357    /* PCI BARs must be a power of 2 */
1358    msix_sz = pow2ceil(msix_sz);
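    /*
     * For example (assuming 4KB host pages), 96 entries need 0x600 bytes
     * of table plus 16 bytes of PBA (QEMU_ALIGN_UP(96, 64) / 8 == 16),
     * which rounds up to one 4KB page and is already a power of 2.
     */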
1359
1360    if (vdev->msix_relo == OFF_AUTOPCIBAR_AUTO) {
1361        /*
1362         * TODO: Lookup table for known devices.
1363         *
1364         * Logically we might use an algorithm here to select the BAR adding
 1365         * the least additional MMIO space, but we cannot programmatically
1366         * predict the driver dependency on BAR ordering or sizing, therefore
1367         * 'auto' becomes a lookup for combinations reported to work.
1368         */
1369        if (target_bar < 0) {
1370            error_setg(errp, "No automatic MSI-X relocation available for "
1371                       "device %04x:%04x", vdev->vendor_id, vdev->device_id);
1372            return;
1373        }
1374    } else {
1375        target_bar = (int)(vdev->msix_relo - OFF_AUTOPCIBAR_BAR0);
1376    }
1377
1378    /* I/O port BARs cannot host MSI-X structures */
1379    if (vdev->bars[target_bar].ioport) {
1380        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1381                   "I/O port BAR", target_bar);
1382        return;
1383    }
1384
1385    /* Cannot use a BAR in the "shadow" of a 64-bit BAR */
1386    if (!vdev->bars[target_bar].size &&
1387         target_bar > 0 && vdev->bars[target_bar - 1].mem64) {
1388        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1389                   "consumed by 64-bit BAR %d", target_bar, target_bar - 1);
1390        return;
1391    }
1392
1393    /* 2GB max size for 32-bit BARs, cannot double if already > 1G */
1394    if (vdev->bars[target_bar].size > 1 * GiB &&
1395        !vdev->bars[target_bar].mem64) {
1396        error_setg(errp, "Invalid MSI-X relocation BAR %d, "
1397                   "no space to extend 32-bit BAR", target_bar);
1398        return;
1399    }
1400
1401    /*
1402     * If adding a new BAR, test if we can make it 64bit.  We make it
1403     * prefetchable since QEMU MSI-X emulation has no read side effects
1404     * and doing so makes mapping more flexible.
1405     */
1406    if (!vdev->bars[target_bar].size) {
1407        if (target_bar < (PCI_ROM_SLOT - 1) &&
1408            !vdev->bars[target_bar + 1].size) {
1409            vdev->bars[target_bar].mem64 = true;
1410            vdev->bars[target_bar].type = PCI_BASE_ADDRESS_MEM_TYPE_64;
1411        }
1412        vdev->bars[target_bar].type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
1413        vdev->bars[target_bar].size = msix_sz;
1414        vdev->msix->table_offset = 0;
1415    } else {
1416        vdev->bars[target_bar].size = MAX(vdev->bars[target_bar].size * 2,
1417                                          msix_sz * 2);
1418        /*
 1419         * Due to the size calculation above, MSI-X always starts halfway
 1420         * into the BAR, which will always be a separate host page.
1421         */
1422        vdev->msix->table_offset = vdev->bars[target_bar].size / 2;
1423    }
1424
1425    vdev->msix->table_bar = target_bar;
1426    vdev->msix->pba_bar = target_bar;
1427    /* Requires 8-byte alignment, but PCI_MSIX_ENTRY_SIZE guarantees that */
1428    vdev->msix->pba_offset = vdev->msix->table_offset +
1429                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE);
1430
1431    trace_vfio_msix_relo(vdev->vbasedev.name,
1432                         vdev->msix->table_bar, vdev->msix->table_offset);
1433}
1434
1435/*
1436 * We don't have any control over how pci_add_capability() inserts
 1437 * capabilities into the chain.  In order to set up MSI-X we need a
 1438 * MemoryRegion for the BAR.  In order to set up the BAR and not
1439 * attempt to mmap the MSI-X table area, which VFIO won't allow, we
1440 * need to first look for where the MSI-X table lives.  So we
1441 * unfortunately split MSI-X setup across two functions.
1442 */
1443static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
1444{
1445    uint8_t pos;
1446    uint16_t ctrl;
1447    uint32_t table, pba;
1448    int fd = vdev->vbasedev.fd;
1449    VFIOMSIXInfo *msix;
1450
1451    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
1452    if (!pos) {
1453        return;
1454    }
1455
1456    if (pread(fd, &ctrl, sizeof(ctrl),
1457              vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) {
1458        error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS");
1459        return;
1460    }
1461
1462    if (pread(fd, &table, sizeof(table),
1463              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
1464        error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE");
1465        return;
1466    }
1467
1468    if (pread(fd, &pba, sizeof(pba),
1469              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
1470        error_setg_errno(errp, errno, "failed to read PCI MSIX PBA");
1471        return;
1472    }
1473
1474    ctrl = le16_to_cpu(ctrl);
1475    table = le32_to_cpu(table);
1476    pba = le32_to_cpu(pba);
1477
1478    msix = g_malloc0(sizeof(*msix));
1479    msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
1480    msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
1481    msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
1482    msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
1483    msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
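    /*
     * Example decode (hypothetical values): table = 0x2004 places the
     * table at offset 0x2000 of BAR 4; pba = 0x3004 puts the PBA at
     * offset 0x3000 of the same BAR; a QSIZE field of 63 means 64 entries.
     */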
1484
1485    /*
 1486     * Check whether pba_offset extends outside of the specified BAR.  If
 1487     * that is the case, we need to apply a hardware-specific quirk if the
 1488     * device is known, or we have a broken configuration.
1489     */
1490    if (msix->pba_offset >= vdev->bars[msix->pba_bar].region.size) {
1491        /*
1492         * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5
1493         * adapters. The T5 hardware returns an incorrect value of 0x8000 for
1494         * the VF PBA offset while the BAR itself is only 8k. The correct value
1495         * is 0x1000, so we hard code that here.
1496         */
1497        if (vdev->vendor_id == PCI_VENDOR_ID_CHELSIO &&
1498            (vdev->device_id & 0xff00) == 0x5800) {
1499            msix->pba_offset = 0x1000;
1500        } else if (vdev->msix_relo == OFF_AUTOPCIBAR_OFF) {
1501            error_setg(errp, "hardware reports invalid configuration, "
1502                       "MSIX PBA outside of specified BAR");
1503            g_free(msix);
1504            return;
1505        }
1506    }
1507
1508    trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
1509                                msix->table_offset, msix->entries);
1510    vdev->msix = msix;
1511
1512    vfio_pci_fixup_msix_region(vdev);
1513
1514    vfio_pci_relocate_msix(vdev, errp);
1515}
1516
1517static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
1518{
1519    int ret;
1520    Error *err = NULL;
1521
1522    vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
1523                                    sizeof(unsigned long));
1524    ret = msix_init(&vdev->pdev, vdev->msix->entries,
1525                    vdev->bars[vdev->msix->table_bar].mr,
1526                    vdev->msix->table_bar, vdev->msix->table_offset,
1527                    vdev->bars[vdev->msix->pba_bar].mr,
1528                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos,
1529                    &err);
1530    if (ret < 0) {
1531        if (ret == -ENOTSUP) {
1532            warn_report_err(err);
1533            return 0;
1534        }
1535
1536        error_propagate(errp, err);
1537        return ret;
1538    }
1539
1540    /*
1541     * The PCI spec suggests that devices provide additional alignment for
1542     * MSI-X structures and avoid overlapping non-MSI-X related registers.
1543     * For an assigned device, this hopefully means that emulation of MSI-X
1544     * structures does not affect the performance of the device.  If devices
1545     * fail to provide that alignment, a significant performance penalty may
1546     * result, for instance Mellanox MT27500 VFs:
1547     * http://www.spinics.net/lists/kvm/msg125881.html
1548     *
1549     * The PBA is simply not that important for such a serious regression and
1550     * most drivers do not appear to look at it.  The solution for this is to
1551     * disable the PBA MemoryRegion unless it's being used.  We disable it
1552     * here and only enable it if a masked vector fires through QEMU.  As the
1553     * vector-use notifier is called, which occurs on unmask, we test whether
1554     * PBA emulation is needed and again disable if not.
1555     */
1556    memory_region_set_enabled(&vdev->pdev.msix_pba_mmio, false);
1557
1558    /*
1559     * The emulated machine may provide a paravirt interface for MSIX setup
1560     * so it is not strictly necessary to emulate MSIX here. This becomes
1561     * helpful when frequently accessed MMIO registers are located in
1562     * subpages adjacent to the MSIX table but the MSIX data containing page
1563     * cannot be mapped because of a host page size bigger than the MSIX table
1564     * alignment.
1565     */
1566    if (object_property_get_bool(OBJECT(qdev_get_machine()),
1567                                 "vfio-no-msix-emulation", NULL)) {
1568        memory_region_set_enabled(&vdev->pdev.msix_table_mmio, false);
1569    }
1570
1571    return 0;
1572}
1573
1574static void vfio_teardown_msi(VFIOPCIDevice *vdev)
1575{
1576    msi_uninit(&vdev->pdev);
1577
1578    if (vdev->msix) {
1579        msix_uninit(&vdev->pdev,
1580                    vdev->bars[vdev->msix->table_bar].mr,
1581                    vdev->bars[vdev->msix->pba_bar].mr);
1582        g_free(vdev->msix->pending);
1583    }
1584}
1585
1586/*
1587 * Resource setup
1588 */
1589static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
1590{
1591    int i;
1592
1593    for (i = 0; i < PCI_ROM_SLOT; i++) {
1594        vfio_region_mmaps_set_enabled(&vdev->bars[i].region, enabled);
1595    }
1596}
1597
1598static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
1599{
1600    VFIOBAR *bar = &vdev->bars[nr];
1601
1602    uint32_t pci_bar;
1603    int ret;
1604
1605    /* Skip both unimplemented BARs and the upper half of 64-bit BARs. */
1606    if (!bar->region.size) {
1607        return;
1608    }
1609
1610    /* Determine what type of BAR this is for registration */
1611    ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar),
1612                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
1613    if (ret != sizeof(pci_bar)) {
1614        error_report("vfio: Failed to read BAR %d (%m)", nr);
1615        return;
1616    }
1617
1618    pci_bar = le32_to_cpu(pci_bar);
1619    bar->ioport = (pci_bar & PCI_BASE_ADDRESS_SPACE_IO);
1620    bar->mem64 = bar->ioport ? 0 : (pci_bar & PCI_BASE_ADDRESS_MEM_TYPE_64);
1621    bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK :
1622                                         ~PCI_BASE_ADDRESS_MEM_MASK);
1623    bar->size = bar->region.size;
1624}
1625
1626static void vfio_bars_prepare(VFIOPCIDevice *vdev)
1627{
1628    int i;
1629
1630    for (i = 0; i < PCI_ROM_SLOT; i++) {
1631        vfio_bar_prepare(vdev, i);
1632    }
1633}
1634
1635static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
1636{
1637    VFIOBAR *bar = &vdev->bars[nr];
1638    char *name;
1639
1640    if (!bar->size) {
1641        return;
1642    }
1643
1644    bar->mr = g_new0(MemoryRegion, 1);
1645    name = g_strdup_printf("%s base BAR %d", vdev->vbasedev.name, nr);
1646    memory_region_init_io(bar->mr, OBJECT(vdev), NULL, NULL, name, bar->size);
1647    g_free(name);
1648
1649    if (bar->region.size) {
1650        memory_region_add_subregion(bar->mr, 0, bar->region.mem);
1651
1652        if (vfio_region_mmap(&bar->region)) {
1653            error_report("Failed to mmap %s BAR %d. Performance may be slow",
1654                         vdev->vbasedev.name, nr);
1655        }
1656    }
1657
1658    pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
1659}
1660
1661static void vfio_bars_register(VFIOPCIDevice *vdev)
1662{
1663    int i;
1664
1665    for (i = 0; i < PCI_ROM_SLOT; i++) {
1666        vfio_bar_register(vdev, i);
1667    }
1668}
1669
1670static void vfio_bars_exit(VFIOPCIDevice *vdev)
1671{
1672    int i;
1673
1674    for (i = 0; i < PCI_ROM_SLOT; i++) {
1675        VFIOBAR *bar = &vdev->bars[i];
1676
1677        vfio_bar_quirk_exit(vdev, i);
1678        vfio_region_exit(&bar->region);
1679        if (bar->region.size) {
1680            memory_region_del_subregion(bar->mr, bar->region.mem);
1681        }
1682    }
1683
1684    if (vdev->vga) {
1685        pci_unregister_vga(&vdev->pdev);
1686        vfio_vga_quirk_exit(vdev);
1687    }
1688}
1689
1690static void vfio_bars_finalize(VFIOPCIDevice *vdev)
1691{
1692    int i;
1693
1694    for (i = 0; i < PCI_ROM_SLOT; i++) {
1695        VFIOBAR *bar = &vdev->bars[i];
1696
1697        vfio_bar_quirk_finalize(vdev, i);
1698        vfio_region_finalize(&bar->region);
1699        if (bar->size) {
1700            object_unparent(OBJECT(bar->mr));
1701            g_free(bar->mr);
1702        }
1703    }
1704
1705    if (vdev->vga) {
1706        vfio_vga_quirk_finalize(vdev);
1707        for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1708            object_unparent(OBJECT(&vdev->vga->region[i].mem));
1709        }
1710        g_free(vdev->vga);
1711    }
1712}
1713
1714/*
1715 * General setup
1716 */
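    /*
     * Standard capabilities carry no length field, so the size of the
     * capability at @pos is inferred as the distance to the closest
     * capability that follows it, or to the end of standard config space.
     * E.g. with capabilities at 0x40 and 0x60, the one at 0x40 is assumed
     * to span 0x20 bytes and the one at 0x60 up to offset 0x100.
     */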
1717static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
1718{
1719    uint8_t tmp;
1720    uint16_t next = PCI_CONFIG_SPACE_SIZE;
1721
1722    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
1723         tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT]) {
1724        if (tmp > pos && tmp < next) {
1725            next = tmp;
1726        }
1727    }
1728
1729    return next - pos;
1730}
1731
1732
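    /* The same next-offset heuristic, applied to the extended capability chain. */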
1733static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos)
1734{
1735    uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE;
1736
1737    for (tmp = PCI_CONFIG_SPACE_SIZE; tmp;
1738        tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) {
1739        if (tmp > pos && tmp < next) {
1740            next = tmp;
1741        }
1742    }
1743
1744    return next - pos;
1745}
1746
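    /*
     * The vfio_add_emulated_{word,long} helpers below divert part of config
     * space to QEMU's emulation: the value is written into the emulated
     * config space, the affected bits are made read-only to the guest via
     * wmask, and they are flagged in emulated_config_bits so that reads are
     * satisfied from the emulated copy rather than the physical device.
     */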
1747static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
1748{
1749    pci_set_word(buf, (pci_get_word(buf) & ~mask) | val);
1750}
1751
1752static void vfio_add_emulated_word(VFIOPCIDevice *vdev, int pos,
1753                                   uint16_t val, uint16_t mask)
1754{
1755    vfio_set_word_bits(vdev->pdev.config + pos, val, mask);
1756    vfio_set_word_bits(vdev->pdev.wmask + pos, ~mask, mask);
1757    vfio_set_word_bits(vdev->emulated_config_bits + pos, mask, mask);
1758}
1759
1760static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
1761{
1762    pci_set_long(buf, (pci_get_long(buf) & ~mask) | val);
1763}
1764
1765static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
1766                                   uint32_t val, uint32_t mask)
1767{
1768    vfio_set_long_bits(vdev->pdev.config + pos, val, mask);
1769    vfio_set_long_bits(vdev->pdev.wmask + pos, ~mask, mask);
1770    vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
1771}
1772
1773static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
1774                               Error **errp)
1775{
1776    uint16_t flags;
1777    uint8_t type;
1778
1779    flags = pci_get_word(vdev->pdev.config + pos + PCI_CAP_FLAGS);
1780    type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
1781
1782    if (type != PCI_EXP_TYPE_ENDPOINT &&
1783        type != PCI_EXP_TYPE_LEG_END &&
1784        type != PCI_EXP_TYPE_RC_END) {
1785
1786        error_setg(errp, "assignment of PCIe type 0x%x "
1787                   "devices is not currently supported", type);
1788        return -EINVAL;
1789    }
1790
1791    if (!pci_bus_is_express(pci_get_bus(&vdev->pdev))) {
1792        PCIBus *bus = pci_get_bus(&vdev->pdev);
1793        PCIDevice *bridge;
1794
1795        /*
1796         * Traditionally PCI device assignment exposes the PCIe capability
1797         * as-is on non-express buses.  The reason is that some drivers
1798         * simply assume that it's there, for example tg3.  However, when
1799         * we're running on a native PCIe machine type, like Q35, we need
1800         * to hide the PCIe capability.  The reason for this is twofold:
1801         * first, Windows guests get a Code 10 error when the PCIe capability
1802         * is exposed in this configuration.  Therefore express devices won't
1803         * work at all unless they're attached to express buses in the VM.
1804         * Second, a native PCIe machine introduces the possibility of fine
1805         * granularity IOMMUs supporting both translation and isolation.
1806         * Guest code to discover the IOMMU visibility of a device, such as
1807         * IOMMU grouping code on Linux, is very aware of device types and
1808         * valid transitions between bus types.  An express device on a non-
1809         * express bus is not a valid combination on bare metal systems.
1810         *
1811         * Drivers that require a PCIe capability to make the device
1812         * functional are simply going to need to have their devices placed
1813         * on a PCIe bus in the VM.
1814         */
1815        while (!pci_bus_is_root(bus)) {
1816            bridge = pci_bridge_get_device(bus);
1817            bus = pci_get_bus(bridge);
1818        }
1819
1820        if (pci_bus_is_express(bus)) {
1821            return 0;
1822        }
1823
1824    } else if (pci_bus_is_root(pci_get_bus(&vdev->pdev))) {
1825        /*
1826         * On a Root Complex bus Endpoints become Root Complex Integrated
1827         * Endpoints, which changes the type and clears the LNK & LNK2 fields.
1828         */
1829        if (type == PCI_EXP_TYPE_ENDPOINT) {
1830            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1831                                   PCI_EXP_TYPE_RC_END << 4,
1832                                   PCI_EXP_FLAGS_TYPE);
1833
1834            /* Link Capabilities, Status, and Control go away */
1835            if (size > PCI_EXP_LNKCTL) {
1836                vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
1837                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
1838                vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
1839
1840#ifndef PCI_EXP_LNKCAP2
1841#define PCI_EXP_LNKCAP2 44
1842#endif
1843#ifndef PCI_EXP_LNKSTA2
1844#define PCI_EXP_LNKSTA2 50
1845#endif
1846                /* Link 2 Capabilities, Status, and Control go away */
1847                if (size > PCI_EXP_LNKCAP2) {
1848                    vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
1849                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
1850                    vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
1851                }
1852            }
1853
1854        } else if (type == PCI_EXP_TYPE_LEG_END) {
1855            /*
1856             * Legacy endpoints don't belong on the root complex.  Windows
1857             * seems to be happier with devices if we skip the capability.
1858             */
1859            return 0;
1860        }
1861
1862    } else {
1863        /*
1864         * Convert Root Complex Integrated Endpoints to regular endpoints.
1865         * These devices don't support LNK/LNK2 capabilities, so make them up.
1866         */
1867        if (type == PCI_EXP_TYPE_RC_END) {
1868            vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1869                                   PCI_EXP_TYPE_ENDPOINT << 4,
1870                                   PCI_EXP_FLAGS_TYPE);
1871            vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP,
1872                           QEMU_PCI_EXP_LNKCAP_MLW(QEMU_PCI_EXP_LNK_X1) |
1873                           QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
1874            vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
1875        }
1876    }
1877
1878    /*
1879     * Intel 82599 SR-IOV VFs report an invalid PCIe capability version 0
1880     * (Niantic errata #35) causing Windows to error with a Code 10 for the
1881     * device on Q35.  Fixup any such devices to report version 1.  If we
1882     * were to remove the capability entirely the guest would lose extended
1883     * config space.
1884     */
1885    if ((flags & PCI_EXP_FLAGS_VERS) == 0) {
1886        vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
1887                               1, PCI_EXP_FLAGS_VERS);
1888    }
1889
1890    pos = pci_add_capability(&vdev->pdev, PCI_CAP_ID_EXP, pos, size,
1891                             errp);
1892    if (pos < 0) {
1893        return pos;
1894    }
1895
1896    vdev->pdev.exp.exp_cap = pos;
1897
1898    return pos;
1899}
1900
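    /*
     * Probe the capabilities for reset mechanisms the host can exercise:
     * FLR advertised in the PCIe device capabilities, PM reset implied by
     * a clear NoSoftRst bit, and FLR via the Advanced Features capability.
     */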
1901static void vfio_check_pcie_flr(VFIOPCIDevice *vdev, uint8_t pos)
1902{
1903    uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP);
1904
1905    if (cap & PCI_EXP_DEVCAP_FLR) {
1906        trace_vfio_check_pcie_flr(vdev->vbasedev.name);
1907        vdev->has_flr = true;
1908    }
1909}
1910
1911static void vfio_check_pm_reset(VFIOPCIDevice *vdev, uint8_t pos)
1912{
1913    uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL);
1914
1915    if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) {
1916        trace_vfio_check_pm_reset(vdev->vbasedev.name);
1917        vdev->has_pm_reset = true;
1918    }
1919}
1920
1921static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
1922{
1923    uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP);
1924
1925    if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) {
1926        trace_vfio_check_af_flr(vdev->vbasedev.name);
1927        vdev->has_flr = true;
1928    }
1929}
1930
1931static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
1932{
1933    PCIDevice *pdev = &vdev->pdev;
1934    uint8_t cap_id, next, size;
1935    int ret;
1936
1937    cap_id = pdev->config[pos];
1938    next = pdev->config[pos + PCI_CAP_LIST_NEXT];
1939
1940    /*
1941     * If it becomes important to configure capabilities to their actual
1942     * size, use this as the default when it's something we don't recognize.
1943     * Since QEMU doesn't actually handle many of the config accesses,
1944     * exact size doesn't seem worthwhile.
1945     */
1946    size = vfio_std_cap_max_size(pdev, pos);
1947
1948    /*
1949     * pci_add_capability always inserts the new capability at the head
1950     * of the chain.  Therefore to end up with a chain that matches the
1951     * physical device, we insert from the end by making this recursive.
1952     * This is also why we pre-calculate size above as cached config space
1953     * will be changed as we unwind the stack.
1954     */
1955    if (next) {
1956        ret = vfio_add_std_cap(vdev, next, errp);
1957        if (ret) {
1958            return ret;
1959        }
1960    } else {
1961        /* Begin the rebuild, use QEMU emulated list bits */
1962        pdev->config[PCI_CAPABILITY_LIST] = 0;
1963        vdev->emulated_config_bits[PCI_CAPABILITY_LIST] = 0xff;
1964        vdev->emulated_config_bits[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
1965
1966        ret = vfio_add_virt_caps(vdev, errp);
1967        if (ret) {
1968            return ret;
1969        }
1970    }
1971
1972    /* Scale down size, esp in case virt caps were added above */
1973    size = MIN(size, vfio_std_cap_max_size(pdev, pos));
1974
1975    /* Use emulated next pointer to allow dropping caps */
1976    pci_set_byte(vdev->emulated_config_bits + pos + PCI_CAP_LIST_NEXT, 0xff);
1977
1978    switch (cap_id) {
1979    case PCI_CAP_ID_MSI:
1980        ret = vfio_msi_setup(vdev, pos, errp);
1981        break;
1982    case PCI_CAP_ID_EXP:
1983        vfio_check_pcie_flr(vdev, pos);
1984        ret = vfio_setup_pcie_cap(vdev, pos, size, errp);
1985        break;
1986    case PCI_CAP_ID_MSIX:
1987        ret = vfio_msix_setup(vdev, pos, errp);
1988        break;
1989    case PCI_CAP_ID_PM:
1990        vfio_check_pm_reset(vdev, pos);
1991        vdev->pm_cap = pos;
1992        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
1993        break;
1994    case PCI_CAP_ID_AF:
1995        vfio_check_af_flr(vdev, pos);
1996        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
1997        break;
1998    default:
1999        ret = pci_add_capability(pdev, cap_id, pos, size, errp);
2000        break;
2001    }
2002
2003    if (ret < 0) {
2004        error_prepend(errp,
2005                      "failed to add PCI capability 0x%x[0x%x]@0x%x: ",
2006                      cap_id, size, pos);
2007        return ret;
2008    }
2009
2010    return 0;
2011}
2012
2013static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
2014{
2015    PCIDevice *pdev = &vdev->pdev;
2016    uint32_t header;
2017    uint16_t cap_id, next, size;
2018    uint8_t cap_ver;
2019    uint8_t *config;
2020
2021    /* Only add extended caps if we have them and the guest can see them */
2022    if (!pci_is_express(pdev) || !pci_bus_is_express(pci_get_bus(pdev)) ||
2023        !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) {
2024        return;
2025    }
2026
2027    /*
2028     * pcie_add_capability always inserts the new capability at the tail
2029     * of the chain.  Therefore to end up with a chain that matches the
2030     * physical device, we cache the config space to avoid overwriting
2031     * the original config space when we parse the extended capabilities.
2032     */
2033    config = g_memdup(pdev->config, vdev->config_size);
2034
2035    /*
2036     * Extended capabilities are chained with each pointing to the next, so we
2037     * can drop anything other than the head of the chain simply by modifying
2038     * the previous next pointer.  Seed the head of the chain here such that
2039     * we can simply skip any capabilities we want to drop below, regardless
2040     * of their position in the chain.  If this stub capability still exists
2041     * after we add the capabilities we want to expose, update the capability
2042     * ID to zero.  Note that we cannot seed with the capability header being
2043     * zero as this conflicts with the definition of an absent capability chain
2044     * and prevents capabilities beyond the head of the list from being added.
2045     * By replacing the dummy capability ID with zero after walking the device
2046     * chain, we also transparently mark extended capabilities as absent if
2047     * no capabilities were added.  Note that the PCIe spec defines an absence
2048     * of extended capabilities to be determined by a value of zero for the
2049     * capability ID, version, AND next pointer.  A non-zero next pointer
2050     * should be sufficient to indicate additional capabilities are present,
2051     * which will occur if we call pcie_add_capability() below.  The entire
2052     * first dword is emulated to support this.
2053     *
2054     * NB. The kernel side does similar masking, so be prepared that our
2055     * view of the device may also contain a capability ID zero in the head
2056     * of the chain.  Skip it for the same reason that we cannot seed the
2057     * chain with a zero capability.
2058     */
2059    pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE,
2060                 PCI_EXT_CAP(0xFFFF, 0, 0));
2061    pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0);
2062    pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0);
2063
2064    for (next = PCI_CONFIG_SPACE_SIZE; next;
2065         next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) {
2066        header = pci_get_long(config + next);
2067        cap_id = PCI_EXT_CAP_ID(header);
2068        cap_ver = PCI_EXT_CAP_VER(header);
2069
2070        /*
2071         * If it becomes important to configure extended capabilities to their
2072         * actual size, use this as the default when it's something we don't
2073         * recognize. Since QEMU doesn't actually handle many of the config
2074         * accesses, exact size doesn't seem worthwhile.
2075         */
2076        size = vfio_ext_cap_max_size(config, next);
2077
2078        /* Use emulated next pointer to allow dropping extended caps */
2079        pci_long_test_and_set_mask(vdev->emulated_config_bits + next,
2080                                   PCI_EXT_CAP_NEXT_MASK);
2081
2082        switch (cap_id) {
2083        case 0: /* kernel masked capability */
2084        case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
2085        case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
2086        case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */
2087            trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
2088            break;
2089        default:
2090            pcie_add_capability(pdev, cap_id, cap_ver, next, size);
2091        }
2092
2093    }
2094
2095    /* Cleanup chain head ID if necessary */
2096    if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) {
2097        pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0);
2098    }
2099
2100    g_free(config);
2101    return;
2102}
2103
2104static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
2105{
2106    PCIDevice *pdev = &vdev->pdev;
2107    int ret;
2108
2109    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
2110        !pdev->config[PCI_CAPABILITY_LIST]) {
2111        return 0; /* Nothing to add */
2112    }
2113
2114    ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST], errp);
2115    if (ret) {
2116        return ret;
2117    }
2118
2119    vfio_add_ext_cap(vdev);
2120    return 0;
2121}
2122
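    /*
     * Quiesce a device before reset: interrupts off, device woken to D0,
     * and I/O, MMIO, and bus-master DMA disabled.
     */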
2123static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
2124{
2125    PCIDevice *pdev = &vdev->pdev;
2126    uint16_t cmd;
2127
2128    vfio_disable_interrupts(vdev);
2129
2130    /* Make sure the device is in D0 */
2131    if (vdev->pm_cap) {
2132        uint16_t pmcsr;
2133        uint8_t state;
2134
2135        pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2136        state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2137        if (state) {
2138            pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
2139            vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2);
2140            /* vfio handles the necessary delay here */
2141            pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2);
2142            state = pmcsr & PCI_PM_CTRL_STATE_MASK;
2143            if (state) {
2144                error_report("vfio: Unable to power on device, stuck in D%d",
2145                             state);
2146            }
2147        }
2148    }
2149
2150    /*
2151     * Stop any ongoing DMA by disconnecting I/O, MMIO, and bus master.
2152     * Also put INTx Disable in known state.
2153     */
2154    cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2);
2155    cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER |
2156             PCI_COMMAND_INTX_DISABLE);
2157    vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
2158}
2159
2160static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
2161{
2162    Error *err = NULL;
2163    int nr;
2164
2165    vfio_intx_enable(vdev, &err);
2166    if (err) {
2167        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2168    }
2169
2170    for (nr = 0; nr < PCI_NUM_REGIONS - 1; ++nr) {
2171        off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr);
2172        uint32_t val = 0;
2173        uint32_t len = sizeof(val);
2174
2175        if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) {
2176            error_report("%s(%s) reset bar %d failed: %m", __func__,
2177                         vdev->vbasedev.name, nr);
2178        }
2179    }
2180
2181    vfio_quirk_reset(vdev);
2182}
2183
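    /*
     * Compare a PCI host address against our device name, which
     * vfio_realize derives from the sysfs path and therefore has the
     * canonical segment:bus:slot.function form, e.g. "0000:01:00.0".
     */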
2184static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
2185{
2186    char tmp[13];
2187
2188    sprintf(tmp, "%04x:%02x:%02x.%1x", addr->domain,
2189            addr->bus, addr->slot, addr->function);
2190
2191    return (strcmp(tmp, name) == 0);
2192}
2193
2194static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
2195{
2196    VFIOGroup *group;
2197    struct vfio_pci_hot_reset_info *info;
2198    struct vfio_pci_dependent_device *devices;
2199    struct vfio_pci_hot_reset *reset;
2200    int32_t *fds;
2201    int ret, i, count;
2202    bool multi = false;
2203
2204    trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
2205
2206    if (!single) {
2207        vfio_pci_pre_reset(vdev);
2208    }
2209    vdev->vbasedev.needs_reset = false;
2210
2211    info = g_malloc0(sizeof(*info));
2212    info->argsz = sizeof(*info);
2213
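        /*
         * Probe first with a zero-sized device array: the kernel fills in
         * the number of affected devices (failing with ENOSPC), and we
         * retry below with room for info->count entries.
         */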
2214    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2215    if (ret && errno != ENOSPC) {
2216        ret = -errno;
2217        if (!vdev->has_pm_reset) {
2218            error_report("vfio: Cannot reset device %s, "
2219                         "no available reset mechanism.", vdev->vbasedev.name);
2220        }
2221        goto out_single;
2222    }
2223
2224    count = info->count;
2225    info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
2226    info->argsz = sizeof(*info) + (count * sizeof(*devices));
2227    devices = &info->devices[0];
2228
2229    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
2230    if (ret) {
2231        ret = -errno;
2232        error_report("vfio: hot reset info failed: %m");
2233        goto out_single;
2234    }
2235
2236    trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
2237
2238    /* Verify that we have all the groups required */
2239    for (i = 0; i < info->count; i++) {
2240        PCIHostDeviceAddress host;
2241        VFIOPCIDevice *tmp;
2242        VFIODevice *vbasedev_iter;
2243
2244        host.domain = devices[i].segment;
2245        host.bus = devices[i].bus;
2246        host.slot = PCI_SLOT(devices[i].devfn);
2247        host.function = PCI_FUNC(devices[i].devfn);
2248
2249        trace_vfio_pci_hot_reset_dep_devices(host.domain,
2250                host.bus, host.slot, host.function, devices[i].group_id);
2251
2252        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
2253            continue;
2254        }
2255
2256        QLIST_FOREACH(group, &vfio_group_list, next) {
2257            if (group->groupid == devices[i].group_id) {
2258                break;
2259            }
2260        }
2261
2262        if (!group) {
2263            if (!vdev->has_pm_reset) {
2264                error_report("vfio: Cannot reset device %s, "
2265                             "depends on group %d which is not owned.",
2266                             vdev->vbasedev.name, devices[i].group_id);
2267            }
2268            ret = -EPERM;
2269            goto out;
2270        }
2271
2272        /* Prep dependent devices for reset and clear our marker. */
2273        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2274            if (!vbasedev_iter->dev->realized ||
2275                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2276                continue;
2277            }
2278            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2279            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
2280                if (single) {
2281                    ret = -EINVAL;
2282                    goto out_single;
2283                }
2284                vfio_pci_pre_reset(tmp);
2285                tmp->vbasedev.needs_reset = false;
2286                multi = true;
2287                break;
2288            }
2289        }
2290    }
2291
2292    if (!single && !multi) {
2293        ret = -EINVAL;
2294        goto out_single;
2295    }
2296
2297    /* Determine how many group fds need to be passed */
2298    count = 0;
2299    QLIST_FOREACH(group, &vfio_group_list, next) {
2300        for (i = 0; i < info->count; i++) {
2301            if (group->groupid == devices[i].group_id) {
2302                count++;
2303                break;
2304            }
2305        }
2306    }
2307
2308    reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
2309    reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
2310    fds = &reset->group_fds[0];
2311
2312    /* Fill in group fds */
2313    QLIST_FOREACH(group, &vfio_group_list, next) {
2314        for (i = 0; i < info->count; i++) {
2315            if (group->groupid == devices[i].group_id) {
2316                fds[reset->count++] = group->fd;
2317                break;
2318            }
2319        }
2320    }
2321
2322    /* Bus reset! */
2323    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
2324    g_free(reset);
2325
2326    trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
2327                                    ret ? "%m" : "Success");
2328
2329out:
2330    /* Re-enable INTx on affected devices */
2331    for (i = 0; i < info->count; i++) {
2332        PCIHostDeviceAddress host;
2333        VFIOPCIDevice *tmp;
2334        VFIODevice *vbasedev_iter;
2335
2336        host.domain = devices[i].segment;
2337        host.bus = devices[i].bus;
2338        host.slot = PCI_SLOT(devices[i].devfn);
2339        host.function = PCI_FUNC(devices[i].devfn);
2340
2341        if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
2342            continue;
2343        }
2344
2345        QLIST_FOREACH(group, &vfio_group_list, next) {
2346            if (group->groupid == devices[i].group_id) {
2347                break;
2348            }
2349        }
2350
2351        if (!group) {
2352            break;
2353        }
2354
2355        QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2356            if (!vbasedev_iter->dev->realized ||
2357                vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
2358                continue;
2359            }
2360            tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
2361            if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
2362                vfio_pci_post_reset(tmp);
2363                break;
2364            }
2365        }
2366    }
2367out_single:
2368    if (!single) {
2369        vfio_pci_post_reset(vdev);
2370    }
2371    g_free(info);
2372
2373    return ret;
2374}
2375
2376/*
2377 * We want to differentiate hot reset of multiple in-use devices vs hot reset
2378 * of a single in-use device.  VFIO_DEVICE_RESET will already handle the case
2379 * of doing hot resets when there is only a single device per bus.  "In-use"
2380 * here refers to how many VFIODevices are affected.  A hot reset that affects
2381 * multiple devices, but only a single in-use device, means that we can call
2382 * it from our bus ->reset() callback since the extent is effectively a single
2383 * device.  This allows us to make use of it in the hotplug path.  When there
2384 * are multiple in-use devices, we can only trigger the hot reset during a
2385 * system reset and thus from our reset handler.  We separate _one vs _multi
2386 * here so that we don't overlap and do a double reset on the system reset
2387 * path where both our reset handler and ->reset() callback are used.  Calling
2388 * _one() will only do a hot reset for the one in-use device case, while
2389 * calling _multi() will do nothing if a _one() would have been sufficient.
2390 */
2391static int vfio_pci_hot_reset_one(VFIOPCIDevice *vdev)
2392{
2393    return vfio_pci_hot_reset(vdev, true);
2394}
2395
2396static int vfio_pci_hot_reset_multi(VFIODevice *vbasedev)
2397{
2398    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2399    return vfio_pci_hot_reset(vdev, false);
2400}
2401
2402static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev)
2403{
2404    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
2405    if (!vbasedev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) {
2406        vbasedev->needs_reset = true;
2407    }
2408}
2409
2410static VFIODeviceOps vfio_pci_ops = {
2411    .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
2412    .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
2413    .vfio_eoi = vfio_intx_eoi,
2414};
2415
2416int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
2417{
2418    VFIODevice *vbasedev = &vdev->vbasedev;
2419    struct vfio_region_info *reg_info;
2420    int ret;
2421
2422    ret = vfio_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info);
2423    if (ret) {
2424        error_setg_errno(errp, -ret,
2425                         "failed getting region info for VGA region index %d",
2426                         VFIO_PCI_VGA_REGION_INDEX);
2427        return ret;
2428    }
2429
2430    if (!(reg_info->flags & VFIO_REGION_INFO_FLAG_READ) ||
2431        !(reg_info->flags & VFIO_REGION_INFO_FLAG_WRITE) ||
2432        reg_info->size < 0xbffff + 1) {
2433        error_setg(errp, "unexpected VGA info, flags 0x%lx, size 0x%lx",
2434                   (unsigned long)reg_info->flags,
2435                   (unsigned long)reg_info->size);
2436        g_free(reg_info);
2437        return -EINVAL;
2438    }
2439
2440    vdev->vga = g_new0(VFIOVGA, 1);
2441
2442    vdev->vga->fd_offset = reg_info->offset;
2443    vdev->vga->fd = vdev->vbasedev.fd;
2444
2445    g_free(reg_info);
2446
2447    vdev->vga->region[QEMU_PCI_VGA_MEM].offset = QEMU_PCI_VGA_MEM_BASE;
2448    vdev->vga->region[QEMU_PCI_VGA_MEM].nr = QEMU_PCI_VGA_MEM;
2449    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_MEM].quirks);
2450
2451    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2452                          OBJECT(vdev), &vfio_vga_ops,
2453                          &vdev->vga->region[QEMU_PCI_VGA_MEM],
2454                          "vfio-vga-mmio@0xa0000",
2455                          QEMU_PCI_VGA_MEM_SIZE);
2456
2457    vdev->vga->region[QEMU_PCI_VGA_IO_LO].offset = QEMU_PCI_VGA_IO_LO_BASE;
2458    vdev->vga->region[QEMU_PCI_VGA_IO_LO].nr = QEMU_PCI_VGA_IO_LO;
2459    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].quirks);
2460
2461    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2462                          OBJECT(vdev), &vfio_vga_ops,
2463                          &vdev->vga->region[QEMU_PCI_VGA_IO_LO],
2464                          "vfio-vga-io@0x3b0",
2465                          QEMU_PCI_VGA_IO_LO_SIZE);
2466
2467    vdev->vga->region[QEMU_PCI_VGA_IO_HI].offset = QEMU_PCI_VGA_IO_HI_BASE;
2468    vdev->vga->region[QEMU_PCI_VGA_IO_HI].nr = QEMU_PCI_VGA_IO_HI;
2469    QLIST_INIT(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks);
2470
2471    memory_region_init_io(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
2472                          OBJECT(vdev), &vfio_vga_ops,
2473                          &vdev->vga->region[QEMU_PCI_VGA_IO_HI],
2474                          "vfio-vga-io@0x3c0",
2475                          QEMU_PCI_VGA_IO_HI_SIZE);
2476
2477    pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem,
2478                     &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem,
2479                     &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem);
2480
2481    return 0;
2482}
2483
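    /*
     * Validate the VFIO device and pull its metadata: BAR regions, config
     * space geometry, optional VGA region, and whether an error (AER) IRQ
     * is available.
     */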
2484static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
2485{
2486    VFIODevice *vbasedev = &vdev->vbasedev;
2487    struct vfio_region_info *reg_info;
2488    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
2489    int i, ret = -1;
2490
2491    /* Sanity check device */
2492    if (!(vbasedev->flags & VFIO_DEVICE_FLAGS_PCI)) {
2493        error_setg(errp, "this isn't a PCI device");
2494        return;
2495    }
2496
2497    if (vbasedev->num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) {
2498        error_setg(errp, "unexpected number of io regions %u",
2499                   vbasedev->num_regions);
2500        return;
2501    }
2502
2503    if (vbasedev->num_irqs < VFIO_PCI_MSIX_IRQ_INDEX + 1) {
2504        error_setg(errp, "unexpected number of irqs %u", vbasedev->num_irqs);
2505        return;
2506    }
2507
2508    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
2509        char *name = g_strdup_printf("%s BAR %d", vbasedev->name, i);
2510
2511        ret = vfio_region_setup(OBJECT(vdev), vbasedev,
2512                                &vdev->bars[i].region, i, name);
2513        g_free(name);
2514
2515        if (ret) {
2516            error_setg_errno(errp, -ret, "failed to get region %d info", i);
2517            return;
2518        }
2519
2520        QLIST_INIT(&vdev->bars[i].quirks);
2521    }
2522
2523    ret = vfio_get_region_info(vbasedev,
2524                               VFIO_PCI_CONFIG_REGION_INDEX, &reg_info);
2525    if (ret) {
2526        error_setg_errno(errp, -ret, "failed to get config info");
2527        return;
2528    }
2529
2530    trace_vfio_populate_device_config(vdev->vbasedev.name,
2531                                      (unsigned long)reg_info->size,
2532                                      (unsigned long)reg_info->offset,
2533                                      (unsigned long)reg_info->flags);
2534
2535    vdev->config_size = reg_info->size;
2536    if (vdev->config_size == PCI_CONFIG_SPACE_SIZE) {
2537        vdev->pdev.cap_present &= ~QEMU_PCI_CAP_EXPRESS;
2538    }
2539    vdev->config_offset = reg_info->offset;
2540
2541    g_free(reg_info);
2542
2543    if (vdev->features & VFIO_FEATURE_ENABLE_VGA) {
2544        ret = vfio_populate_vga(vdev, errp);
2545        if (ret) {
2546            error_append_hint(errp, "device does not support "
2547                              "requested feature x-vga\n");
2548            return;
2549        }
2550    }
2551
2552    irq_info.index = VFIO_PCI_ERR_IRQ_INDEX;
2553
2554    ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
2555    if (ret) {
2556        /* This can fail for an old kernel or legacy PCI dev */
2557        trace_vfio_populate_device_get_irq_info_failure(strerror(errno));
2558    } else if (irq_info.count == 1) {
2559        vdev->pci_aer = true;
2560    } else {
2561        warn_report(VFIO_MSG_PREFIX
2562                    "Could not enable error recovery for the device",
2563                    vbasedev->name);
2564    }
2565}
2566
2567static void vfio_put_device(VFIOPCIDevice *vdev)
2568{
2569    g_free(vdev->vbasedev.name);
2570    g_free(vdev->msix);
2571
2572    vfio_put_base_device(&vdev->vbasedev);
2573}
2574
2575static void vfio_err_notifier_handler(void *opaque)
2576{
2577    VFIOPCIDevice *vdev = opaque;
2578
2579    if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
2580        return;
2581    }
2582
2583    /*
2584     * TBD. Retrieve the error details and decide what action
2585     * needs to be taken. One of the actions could be to pass
2586     * the error to the guest and have the guest driver recover
2587     * from the error. This requires that PCIe capabilities be
2588     * exposed to the guest. For now, we just terminate the
2589     * guest to contain the error.
2590     */
2591
2592    error_report("%s(%s) Unrecoverable error detected. Please collect any"
                     " data possible and then kill the guest",
                     __func__, vdev->vbasedev.name);
2593
2594    vm_stop(RUN_STATE_INTERNAL_ERROR);
2595}
2596
2597/*
2598 * Registers error notifier for devices supporting error recovery.
2599 * If we encounter a failure in this function, we report an error
2600 * and continue after disabling error recovery support for the
2601 * device.
2602 */
2603static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
2604{
2605    Error *err = NULL;
2606    int32_t fd;
2607
2608    if (!vdev->pci_aer) {
2609        return;
2610    }
2611
2612    if (event_notifier_init(&vdev->err_notifier, 0)) {
2613        error_report("vfio: Unable to init event notifier for error detection");
2614        vdev->pci_aer = false;
2615        return;
2616    }
2617
2618    fd = event_notifier_get_fd(&vdev->err_notifier);
2619    qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
2620
2621    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
2622                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
2623        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2624        qemu_set_fd_handler(fd, NULL, NULL, vdev);
2625        event_notifier_cleanup(&vdev->err_notifier);
2626        vdev->pci_aer = false;
2627    }
2628}
2629
2630static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev)
2631{
2632    Error *err = NULL;
2633
2634    if (!vdev->pci_aer) {
2635        return;
2636    }
2637
2638    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_ERR_IRQ_INDEX, 0,
2639                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
2640        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2641    }
2642    qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier),
2643                        NULL, NULL, vdev);
2644    event_notifier_cleanup(&vdev->err_notifier);
2645}
2646
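    /*
     * The host signals the request IRQ when it wants the device back, e.g.
     * for a host-side driver unbind; respond by requesting an unplug of the
     * device from the guest.
     */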
2647static void vfio_req_notifier_handler(void *opaque)
2648{
2649    VFIOPCIDevice *vdev = opaque;
2650    Error *err = NULL;
2651
2652    if (!event_notifier_test_and_clear(&vdev->req_notifier)) {
2653        return;
2654    }
2655
2656    qdev_unplug(DEVICE(vdev), &err);
2657    if (err) {
2658        warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2659    }
2660}
2661
2662static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
2663{
2664    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
2665                                      .index = VFIO_PCI_REQ_IRQ_INDEX };
2666    Error *err = NULL;
2667    int32_t fd;
2668
2669    if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) {
2670        return;
2671    }
2672
2673    if (ioctl(vdev->vbasedev.fd,
2674              VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) {
2675        return;
2676    }
2677
2678    if (event_notifier_init(&vdev->req_notifier, 0)) {
2679        error_report("vfio: Unable to init event notifier for device request");
2680        return;
2681    }
2682
2683    fd = event_notifier_get_fd(&vdev->req_notifier);
2684    qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
2685
2686    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
2687                               VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
2688        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2689        qemu_set_fd_handler(fd, NULL, NULL, vdev);
2690        event_notifier_cleanup(&vdev->req_notifier);
2691    } else {
2692        vdev->req_enabled = true;
2693    }
2694}
2695
2696static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
2697{
2698    Error *err = NULL;
2699
2700    if (!vdev->req_enabled) {
2701        return;
2702    }
2703
2704    if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, 0,
2705                               VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err)) {
2706        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
2707    }
2708    qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier),
2709                        NULL, NULL, vdev);
2710    event_notifier_cleanup(&vdev->req_notifier);
2711
2712    vdev->req_enabled = false;
2713}
2714
2715static void vfio_realize(PCIDevice *pdev, Error **errp)
2716{
2717    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
2718    VFIODevice *vbasedev_iter;
2719    VFIOGroup *group;
2720    char *tmp, *subsys, group_path[PATH_MAX], *group_name;
2721    Error *err = NULL;
2722    ssize_t len;
2723    struct stat st;
2724    int groupid;
2725    int i, ret;
2726    bool is_mdev;
2727
2728    if (!vdev->vbasedev.sysfsdev) {
2729        if (!(~vdev->host.domain || ~vdev->host.bus ||
2730              ~vdev->host.slot || ~vdev->host.function)) {
2731            error_setg(errp, "No provided host device");
2732            error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
2733                              "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
2734            return;
2735        }
2736        vdev->vbasedev.sysfsdev =
2737            g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
2738                            vdev->host.domain, vdev->host.bus,
2739                            vdev->host.slot, vdev->host.function);
2740    }
2741
2742    if (stat(vdev->vbasedev.sysfsdev, &st) < 0) {
2743        error_setg_errno(errp, errno, "no such host device");
2744        error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev);
2745        return;
2746    }
2747
2748    if (!pdev->failover_pair_id) {
2749        error_setg(&vdev->migration_blocker,
2750                "VFIO device doesn't support migration");
2751        ret = migrate_add_blocker(vdev->migration_blocker, &err);
2752        if (ret) {
2753            error_propagate(errp, err);
2754            error_free(vdev->migration_blocker);
2755            vdev->migration_blocker = NULL;
2756            return;
2757        }
2758    }
2759
2760    vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev);
2761    vdev->vbasedev.ops = &vfio_pci_ops;
2762    vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
2763    vdev->vbasedev.dev = DEVICE(vdev);
2764
2765    tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
2766    len = readlink(tmp, group_path, sizeof(group_path));
2767    g_free(tmp);
2768
2769    if (len <= 0 || len >= sizeof(group_path)) {
2770        error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG,
2771                         "no iommu_group found");
2772        goto error;
2773    }
2774
2775    group_path[len] = 0;
2776
2777    group_name = basename(group_path);
2778    if (sscanf(group_name, "%d", &groupid) != 1) {
2779        error_setg_errno(errp, errno, "failed to read %s", group_path);
2780        goto error;
2781    }
2782
2783    trace_vfio_realize(vdev->vbasedev.name, groupid);
2784
2785    group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
2786    if (!group) {
2787        goto error;
2788    }
2789
2790    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
2791        if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
2792            error_setg(errp, "device is already attached");
2793            vfio_put_group(group);
2794            goto error;
2795        }
2796    }
2797
2798    /*
2799     * Mediated devices *might* operate compatibly with memory ballooning, but
2800     * we cannot know for certain, it depends on whether the mdev vendor driver
2801     * stays in sync with the active working set of the guest driver.  Prevent
2802     * the x-balloon-allowed option unless this is minimally an mdev device.
2803     */
2804    tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev);
2805    subsys = realpath(tmp, NULL);
2806    g_free(tmp);
2807    is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
2808    free(subsys);
2809
2810    trace_vfio_mdev(vdev->vbasedev.name, is_mdev);
2811
2812    if (vdev->vbasedev.balloon_allowed && !is_mdev) {
2813        error_setg(errp, "x-balloon-allowed only potentially compatible "
2814                   "with mdev devices");
2815        vfio_put_group(group);
2816        goto error;
2817    }
2818
2819    ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
2820    if (ret) {
2821        vfio_put_group(group);
2822        goto error;
2823    }
2824
2825    vfio_populate_device(vdev, &err);
2826    if (err) {
2827        error_propagate(errp, err);
2828        goto error;
2829    }
2830
2831    /* Get a copy of config space */
2832    ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
2833                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
2834                vdev->config_offset);
2835    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
2836        ret = ret < 0 ? -errno : -EFAULT;
2837        error_setg_errno(errp, -ret, "failed to read device config space");
2838        goto error;
2839    }
2840
2841    /* vfio emulates a lot for us, but some bits need extra love */
2842    vdev->emulated_config_bits = g_malloc0(vdev->config_size);
2843
2844    /* QEMU can choose to expose the ROM or not */
2845    memset(vdev->emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
2846    /* QEMU can also add or extend BARs */
2847    memset(vdev->emulated_config_bits + PCI_BASE_ADDRESS_0, 0xff, 6 * 4);
2848
2849    /*
2850     * The PCI spec reserves vendor ID 0xffff as an invalid value.  The
2851     * device ID is managed by the vendor and need only be a 16-bit value.
2852     * Allow any 16-bit value for subsystem so they can be hidden or changed.
2853     */
2854    if (vdev->vendor_id != PCI_ANY_ID) {
2855        if (vdev->vendor_id >= 0xffff) {
2856            error_setg(errp, "invalid PCI vendor ID provided");
2857            goto error;
2858        }
2859        vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
2860        trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id);
2861    } else {
2862        vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
2863    }
2864
2865    if (vdev->device_id != PCI_ANY_ID) {
2866        if (vdev->device_id > 0xffff) {
2867            error_setg(errp, "invalid PCI device ID provided");
2868            goto error;
2869        }
2870        vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
2871        trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id);
2872    } else {
2873        vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
2874    }
2875
2876    if (vdev->sub_vendor_id != PCI_ANY_ID) {
2877        if (vdev->sub_vendor_id > 0xffff) {
2878            error_setg(errp, "invalid PCI subsystem vendor ID provided");
2879            goto error;
2880        }
2881        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
2882                               vdev->sub_vendor_id, ~0);
2883        trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name,
2884                                              vdev->sub_vendor_id);
2885    }
2886
2887    if (vdev->sub_device_id != PCI_ANY_ID) {
2888        if (vdev->sub_device_id > 0xffff) {
2889            error_setg(errp, "invalid PCI subsystem device ID provided");
2890            goto error;
2891        }
2892        vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
2893        trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name,
2894                                              vdev->sub_device_id);
2895    }
2896
2897    /* QEMU can change multi-function devices to single function, or reverse */
2898    vdev->emulated_config_bits[PCI_HEADER_TYPE] =
2899                                              PCI_HEADER_TYPE_MULTI_FUNCTION;
2900
2901    /* Restore or clear multifunction, this is always controlled by QEMU */
2902    if (vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
2903        vdev->pdev.config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
2904    } else {
2905        vdev->pdev.config[PCI_HEADER_TYPE] &= ~PCI_HEADER_TYPE_MULTI_FUNCTION;
2906    }
2907
2908    /*
2909     * Clear host resource mapping info.  If we choose not to register a
2910     * BAR, such as might be the case with the option ROM, we can get
2911     * confusing, unwritable, residual addresses from the host here.
2912     */
2913    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
2914    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
2915
2916    vfio_pci_size_rom(vdev);
2917
2918    vfio_bars_prepare(vdev);
2919
2920    vfio_msix_early_setup(vdev, &err);
2921    if (err) {
2922        error_propagate(errp, err);
2923        goto error;
2924    }
2925
2926    vfio_bars_register(vdev);
2927
2928    ret = vfio_add_capabilities(vdev, errp);
2929    if (ret) {
2930        goto out_teardown;
2931    }
2932
2933    if (vdev->vga) {
2934        vfio_vga_quirk_setup(vdev);
2935    }
2936
2937    for (i = 0; i < PCI_ROM_SLOT; i++) {
2938        vfio_bar_quirk_setup(vdev, i);
2939    }
2940
2941    if (!vdev->igd_opregion &&
2942        vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) {
2943        struct vfio_region_info *opregion;
2944
2945        if (vdev->pdev.qdev.hotplugged) {
2946            error_setg(errp,
2947                       "cannot support IGD OpRegion feature on hotplugged "
2948                       "device");
2949            goto out_teardown;
2950        }
2951
2952        ret = vfio_get_dev_region_info(&vdev->vbasedev,
2953                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
2954                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
2955        if (ret) {
2956            error_setg_errno(errp, -ret,
2957                             "does not support requested IGD OpRegion feature");
2958            goto out_teardown;
2959        }
2960
2961        ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);
2962        g_free(opregion);
2963        if (ret) {
2964            goto out_teardown;
2965        }
2966    }
2967
2968    /* QEMU emulates all of MSI & MSIX */
2969    if (pdev->cap_present & QEMU_PCI_CAP_MSIX) {
2970        memset(vdev->emulated_config_bits + pdev->msix_cap, 0xff,
2971               MSIX_CAP_LENGTH);
2972    }
2973
2974    if (pdev->cap_present & QEMU_PCI_CAP_MSI) {
2975        memset(vdev->emulated_config_bits + pdev->msi_cap, 0xff,
2976               vdev->msi_cap_size);
2977    }
2978
2979    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
2980        vdev->intx.mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
2981                                                  vfio_intx_mmap_enable, vdev);
2982        pci_device_set_intx_routing_notifier(&vdev->pdev,
2983                                             vfio_intx_routing_notifier);
2984        vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
2985        kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
2986        ret = vfio_intx_enable(vdev, errp);
2987        if (ret) {
2988            goto out_deregister;
2989        }
2990    }
2991
2992    if (vdev->display != ON_OFF_AUTO_OFF) {
2993        ret = vfio_display_probe(vdev, errp);
2994        if (ret) {
2995            goto out_deregister;
2996        }
2997    }
2998    if (vdev->enable_ramfb && vdev->dpy == NULL) {
2999        error_setg(errp, "ramfb=on requires display=on");
3000        goto out_deregister;
3001    }
3002    if (vdev->display_xres || vdev->display_yres) {
3003        if (vdev->dpy == NULL) {
3004            error_setg(errp, "xres and yres properties require display=on");
3005            goto out_deregister;
3006        }
3007        if (vdev->dpy->edid_regs == NULL) {
3008            error_setg(errp, "xres and yres properties need edid support");
3009            goto out_deregister;
3010        }
3011    }
3012
3013    if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) {
3014        ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
3015        if (ret && ret != -ENODEV) {
3016            error_report("Failed to setup NVIDIA V100 GPU RAM");
3017        }
3018    }
3019
3020    if (vdev->vendor_id == PCI_VENDOR_ID_IBM) {
3021        ret = vfio_pci_nvlink2_init(vdev, errp);
3022        if (ret && ret != -ENODEV) {
3023            error_report("Failed to setup NVlink2 bridge");
3024        }
3025    }
3026
3027    vfio_register_err_notifier(vdev);
3028    vfio_register_req_notifier(vdev);
3029    vfio_setup_resetfn_quirk(vdev);
3030
3031    return;
3032
3033out_deregister:
3034    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3035    kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3036out_teardown:
3037    vfio_teardown_msi(vdev);
3038    vfio_bars_exit(vdev);
3039error:
3040    error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
3041    if (vdev->migration_blocker) {
3042        migrate_del_blocker(vdev->migration_blocker);
3043        error_free(vdev->migration_blocker);
3044        vdev->migration_blocker = NULL;
3045    }
3046}
3047
3048static void vfio_instance_finalize(Object *obj)
3049{
3050    VFIOPCIDevice *vdev = PCI_VFIO(obj);
3051    VFIOGroup *group = vdev->vbasedev.group;
3052
3053    vfio_display_finalize(vdev);
3054    vfio_bars_finalize(vdev);
3055    g_free(vdev->emulated_config_bits);
3056    g_free(vdev->rom);
3057    if (vdev->migration_blocker) {
3058        migrate_del_blocker(vdev->migration_blocker);
3059        error_free(vdev->migration_blocker);
3060    }
3061    /*
3062     * XXX Leaking igd_opregion is not an oversight, we can't remove the
3063     * fw_cfg entry therefore leaking this allocation seems like the safest
3064     * option.
3065     *
3066     * g_free(vdev->igd_opregion);
3067     */
3068    vfio_put_device(vdev);
3069    vfio_put_group(group);
3070}
3071
3072static void vfio_exitfn(PCIDevice *pdev)
3073{
3074    VFIOPCIDevice *vdev = PCI_VFIO(pdev);
3075
3076    vfio_unregister_req_notifier(vdev);
3077    vfio_unregister_err_notifier(vdev);
3078    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
3079    if (vdev->irqchip_change_notifier.notify) {
3080        kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
3081    }
3082    vfio_disable_interrupts(vdev);
3083    if (vdev->intx.mmap_timer) {
3084        timer_free(vdev->intx.mmap_timer);
3085    }
3086    vfio_teardown_msi(vdev);
3087    vfio_bars_exit(vdev);
3088}
3089
3090static void vfio_pci_reset(DeviceState *dev)
3091{
3092    VFIOPCIDevice *vdev = PCI_VFIO(dev);
3093
3094    trace_vfio_pci_reset(vdev->vbasedev.name);
3095
3096    vfio_pci_pre_reset(vdev);
3097
3098    if (vdev->display != ON_OFF_AUTO_OFF) {
3099        vfio_display_reset(vdev);
3100    }
3101
3102    if (vdev->resetfn && !vdev->resetfn(vdev)) {
3103        goto post_reset;
3104    }
3105
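    /*
     * Use VFIO_DEVICE_RESET here only if it is backed by FLR, or if the
     * device has no PM reset anyway; a PM-only reset is deferred until
     * after the bus reset attempt below.
     */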
3106    if (vdev->vbasedev.reset_works &&
3107        (vdev->has_flr || !vdev->has_pm_reset) &&
3108        !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3109        trace_vfio_pci_reset_flr(vdev->vbasedev.name);
3110        goto post_reset;
3111    }
3112
3113    /* See if we can do our own bus reset */
3114    if (!vfio_pci_hot_reset_one(vdev)) {
3115        goto post_reset;
3116    }
3117
3118    /* If nothing else works and the device supports PM reset, use it */
3119    if (vdev->vbasedev.reset_works && vdev->has_pm_reset &&
3120        !ioctl(vdev->vbasedev.fd, VFIO_DEVICE_RESET)) {
3121        trace_vfio_pci_reset_pm(vdev->vbasedev.name);
3122        goto post_reset;
3123    }
3124
3125post_reset:
3126    vfio_pci_post_reset(vdev);
3127}
3128
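/*
 * QOM instance_init: runs at object creation, before any properties are
 * set, so only property-independent defaults belong here.
 */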
3129static void vfio_instance_init(Object *obj)
3130{
3131    PCIDevice *pci_dev = PCI_DEVICE(obj);
3132    VFIOPCIDevice *vdev = PCI_VFIO(obj);
3133
3134    device_add_bootindex_property(obj, &vdev->bootindex,
3135                                  "bootindex", NULL,
3136                                  &pci_dev->qdev, NULL);
3137    vdev->host.domain = ~0U;
3138    vdev->host.bus = ~0U;
3139    vdev->host.slot = ~0U;
3140    vdev->host.function = ~0U;
3141
3142    vdev->nv_gpudirect_clique = 0xFF;
3143
3144    /* QEMU_PCI_CAP_EXPRESS initialization does not depend on the QEMU
3145     * command line, so there is no need to defer it until realize. */
3146    pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
3147}
3148
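/*
 * Device properties.  Names carrying an "x-" prefix are experimental and
 * may change or disappear between releases.  Typical usage (the host
 * address below is only an example):
 *
 *   -device vfio-pci,host=0000:06:00.0
 *   -device vfio-pci,sysfsdev=/sys/bus/pci/devices/0000:06:00.0
 */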
3149static Property vfio_pci_dev_properties[] = {
3150    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
3151    DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
3152    DEFINE_PROP_ON_OFF_AUTO("display", VFIOPCIDevice,
3153                            display, ON_OFF_AUTO_OFF),
3154    DEFINE_PROP_UINT32("xres", VFIOPCIDevice, display_xres, 0),
3155    DEFINE_PROP_UINT32("yres", VFIOPCIDevice, display_yres, 0),
3156    DEFINE_PROP_UINT32("x-intx-mmap-timeout-ms", VFIOPCIDevice,
3157                       intx.mmap_timeout, 1100),
3158    DEFINE_PROP_BIT("x-vga", VFIOPCIDevice, features,
3159                    VFIO_FEATURE_ENABLE_VGA_BIT, false),
3160    DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features,
3161                    VFIO_FEATURE_ENABLE_REQ_BIT, true),
3162    DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
3163                    VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
3164    DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
3165    DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
3166                     vbasedev.balloon_allowed, false),
3167    DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
3168    DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
3169    DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),
3170    DEFINE_PROP_BOOL("x-no-geforce-quirks", VFIOPCIDevice,
3171                     no_geforce_quirks, false),
3172    DEFINE_PROP_BOOL("x-no-kvm-ioeventfd", VFIOPCIDevice, no_kvm_ioeventfd,
3173                     false),
3174    DEFINE_PROP_BOOL("x-no-vfio-ioeventfd", VFIOPCIDevice, no_vfio_ioeventfd,
3175                     false),
3176    DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice, vendor_id, PCI_ANY_ID),
3177    DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice, device_id, PCI_ANY_ID),
3178    DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
3179                       sub_vendor_id, PCI_ANY_ID),
3180    DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
3181                       sub_device_id, PCI_ANY_ID),
3182    DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0),
3183    DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice,
3184                                   nv_gpudirect_clique,
3185                                   qdev_prop_nv_gpudirect_clique, uint8_t),
3186    DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
3187                                OFF_AUTOPCIBAR_OFF),
3188    /*
3189     * TODO - support passed fds... is this necessary?
3190     * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
3191     * DEFINE_PROP_STRING("vfiogroupfd", VFIOPCIDevice, vfiogroupfd_name),
3192     */
3193    DEFINE_PROP_END_OF_LIST(),
3194};
3195
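/*
 * Class init: hook up the generic device callbacks (reset, properties)
 * and the PCI-specific realize/exit and config space accessors.
 */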
3196static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
3197{
3198    DeviceClass *dc = DEVICE_CLASS(klass);
3199    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
3200
3201    dc->reset = vfio_pci_reset;
3202    dc->props = vfio_pci_dev_properties;
3203    dc->desc = "VFIO-based PCI device assignment";
3204    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3205    pdc->realize = vfio_realize;
3206    pdc->exit = vfio_exitfn;
3207    pdc->config_read = vfio_pci_read_config;
3208    pdc->config_write = vfio_pci_write_config;
3209}
3210
3211static const TypeInfo vfio_pci_dev_info = {
3212    .name = TYPE_VFIO_PCI,
3213    .parent = TYPE_PCI_DEVICE,
3214    .instance_size = sizeof(VFIOPCIDevice),
3215    .class_init = vfio_pci_dev_class_init,
3216    .instance_init = vfio_instance_init,
3217    .instance_finalize = vfio_instance_finalize,
3218    .interfaces = (InterfaceInfo[]) {
3219        { INTERFACE_PCIE_DEVICE },
3220        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
3221        { }
3222    },
3223};
3224
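/*
 * "vfio-pci-nohotplug" variant: ramfb depends on fw_cfg entries that must
 * exist before the guest boots, so it is only offered on this
 * non-hotpluggable subclass.  Example (hypothetical host address):
 *
 *   -device vfio-pci-nohotplug,host=0000:00:02.0,display=on,ramfb=on
 */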
3225static Property vfio_pci_dev_nohotplug_properties[] = {
3226    DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
3227    DEFINE_PROP_END_OF_LIST(),
3228};
3229
3230static void vfio_pci_nohotplug_dev_class_init(ObjectClass *klass, void *data)
3231{
3232    DeviceClass *dc = DEVICE_CLASS(klass);
3233
3234    dc->props = vfio_pci_dev_nohotplug_properties;
3235    dc->hotpluggable = false;
3236}
3237
3238static const TypeInfo vfio_pci_nohotplug_dev_info = {
3239    .name = TYPE_VFIO_PCI_NOHOTPLUG,
3240    .parent = TYPE_VFIO_PCI,
3241    .instance_size = sizeof(VFIOPCIDevice),
3242    .class_init = vfio_pci_nohotplug_dev_class_init,
3243};
3244
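/*
 * Register both QOM types; type_init() schedules this to run during
 * QEMU's module initialization, before any devices are created.
 */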
3245static void register_vfio_pci_dev_type(void)
3246{
3247    type_register_static(&vfio_pci_dev_info);
3248    type_register_static(&vfio_pci_nohotplug_dev_info);
3249}
3250
3251type_init(register_vfio_pci_dev_type)
3252