qemu/hw/vfio/pci-quirks.c
   1/*
   2 * device quirks for PCI devices
   3 *
   4 * Copyright Red Hat, Inc. 2012-2015
   5 *
   6 * Authors:
   7 *  Alex Williamson <alex.williamson@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include CONFIG_DEVICES
  15#include "exec/memop.h"
  16#include "qemu/units.h"
  17#include "qemu/log.h"
  18#include "qemu/error-report.h"
  19#include "qemu/main-loop.h"
  20#include "qemu/module.h"
  21#include "qemu/range.h"
  22#include "qapi/error.h"
  23#include "qapi/visitor.h"
  24#include <sys/ioctl.h>
  25#include "hw/nvram/fw_cfg.h"
  26#include "hw/qdev-properties.h"
  27#include "pci.h"
  28#include "trace.h"
  29
  30/*
  31 * List of device ids/vendor ids for which to disable
   32 * option ROM loading.  This avoids guest hangs during ROM
   33 * execution, as seen with the BCM 57810 card, for lack of a
   34 * better way to handle such issues.
   35 * The user can still override by specifying a romfile or
  36 * rombar=1.
  37 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
  38 * for an analysis of the 57810 card hang. When adding
  39 * a new vendor id/device id combination below, please also add
  40 * your card/environment details and information that could
  41 * help in debugging to the bug tracking this issue
  42 */
  43static const struct {
  44    uint32_t vendor;
  45    uint32_t device;
  46} rom_denylist[] = {
  47    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
  48};
  49
  50bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev)
  51{
  52    int i;
  53
  54    for (i = 0 ; i < ARRAY_SIZE(rom_denylist); i++) {
  55        if (vfio_pci_is(vdev, rom_denylist[i].vendor, rom_denylist[i].device)) {
  56            trace_vfio_quirk_rom_in_denylist(vdev->vbasedev.name,
  57                                             rom_denylist[i].vendor,
  58                                             rom_denylist[i].device);
  59            return true;
  60        }
  61    }
  62    return false;
  63}
  64
  65/*
  66 * Device specific region quirks (mostly backdoors to PCI config space)
  67 */
  68
  69/*
  70 * The generic window quirks operate on an address and data register,
  71 * vfio_generic_window_address_quirk handles the address register and
  72 * vfio_generic_window_data_quirk handles the data register.  These ops
  73 * pass reads and writes through to hardware until a value matching the
  74 * stored address match/mask is written.  When this occurs, the data
   75 * register accesses emulated PCI config space for the device rather than
  76 * passing through accesses.  This enables devices where PCI config space
  77 * is accessible behind a window register to maintain the virtualization
  78 * provided through vfio.
  79 */
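/*
 * Illustrative sketch (hypothetical values) of how such a window behaves,
 * assuming an address register at BAR offset 0x0, a data register at offset
 * 0x4, a match of 0x4000 and a mask of 0xff (256-byte config space):
 *
 *   guest writes 0x4004 to BAR+0x0  -> matches, window enabled, offset = 0x4
 *   guest reads  BAR+0x4            -> returns emulated config space @0x4
 *   guest writes 0x1000 to BAR+0x0  -> no match, window disabled
 *   guest reads  BAR+0x4            -> passes through to hardware
 *
 * The actual offsets, matches and masks are supplied per device by the probe
 * functions below.
 */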
  80typedef struct VFIOConfigWindowMatch {
  81    uint32_t match;
  82    uint32_t mask;
  83} VFIOConfigWindowMatch;
  84
  85typedef struct VFIOConfigWindowQuirk {
  86    struct VFIOPCIDevice *vdev;
  87
  88    uint32_t address_val;
  89
  90    uint32_t address_offset;
  91    uint32_t data_offset;
  92
  93    bool window_enabled;
  94    uint8_t bar;
  95
  96    MemoryRegion *addr_mem;
  97    MemoryRegion *data_mem;
  98
  99    uint32_t nr_matches;
 100    VFIOConfigWindowMatch matches[];
 101} VFIOConfigWindowQuirk;
 102
 103static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
 104                                                       hwaddr addr,
 105                                                       unsigned size)
 106{
 107    VFIOConfigWindowQuirk *window = opaque;
 108    VFIOPCIDevice *vdev = window->vdev;
 109
 110    return vfio_region_read(&vdev->bars[window->bar].region,
 111                            addr + window->address_offset, size);
 112}
 113
 114static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
 115                                                    uint64_t data,
 116                                                    unsigned size)
 117{
 118    VFIOConfigWindowQuirk *window = opaque;
 119    VFIOPCIDevice *vdev = window->vdev;
 120    int i;
 121
 122    window->window_enabled = false;
 123
 124    vfio_region_write(&vdev->bars[window->bar].region,
 125                      addr + window->address_offset, data, size);
 126
 127    for (i = 0; i < window->nr_matches; i++) {
 128        if ((data & ~window->matches[i].mask) == window->matches[i].match) {
 129            window->window_enabled = true;
 130            window->address_val = data & window->matches[i].mask;
 131            trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
 132                                    memory_region_name(window->addr_mem), data);
 133            break;
 134        }
 135    }
 136}
 137
 138static const MemoryRegionOps vfio_generic_window_address_quirk = {
 139    .read = vfio_generic_window_quirk_address_read,
 140    .write = vfio_generic_window_quirk_address_write,
 141    .endianness = DEVICE_LITTLE_ENDIAN,
 142};
 143
 144static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
 145                                                    hwaddr addr, unsigned size)
 146{
 147    VFIOConfigWindowQuirk *window = opaque;
 148    VFIOPCIDevice *vdev = window->vdev;
 149    uint64_t data;
 150
 151    /* Always read data reg, discard if window enabled */
 152    data = vfio_region_read(&vdev->bars[window->bar].region,
 153                            addr + window->data_offset, size);
 154
 155    if (window->window_enabled) {
 156        data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
 157        trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
 158                                    memory_region_name(window->data_mem), data);
 159    }
 160
 161    return data;
 162}
 163
 164static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
 165                                                 uint64_t data, unsigned size)
 166{
 167    VFIOConfigWindowQuirk *window = opaque;
 168    VFIOPCIDevice *vdev = window->vdev;
 169
 170    if (window->window_enabled) {
 171        vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
 172        trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
 173                                    memory_region_name(window->data_mem), data);
 174        return;
 175    }
 176
 177    vfio_region_write(&vdev->bars[window->bar].region,
 178                      addr + window->data_offset, data, size);
 179}
 180
 181static const MemoryRegionOps vfio_generic_window_data_quirk = {
 182    .read = vfio_generic_window_quirk_data_read,
 183    .write = vfio_generic_window_quirk_data_write,
 184    .endianness = DEVICE_LITTLE_ENDIAN,
 185};
 186
 187/*
 188 * The generic mirror quirk handles devices which expose PCI config space
 189 * through a region within a BAR.  When enabled, reads and writes are
 190 * redirected through to emulated PCI config space.  XXX if PCI config space
 191 * used memory regions, this could just be an alias.
 192 */
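/*
 * Hypothetical example: with a mirror at BAR2 offset 0x4000, a guest read of
 * BAR2+0x4004 returns emulated config space offset 0x4, while the hardware
 * read at the same BAR offset is still performed and discarded in case the
 * device has side effects.
 */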
 193typedef struct VFIOConfigMirrorQuirk {
 194    struct VFIOPCIDevice *vdev;
 195    uint32_t offset;
 196    uint8_t bar;
 197    MemoryRegion *mem;
 198    uint8_t data[];
 199} VFIOConfigMirrorQuirk;
 200
 201static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
 202                                               hwaddr addr, unsigned size)
 203{
 204    VFIOConfigMirrorQuirk *mirror = opaque;
 205    VFIOPCIDevice *vdev = mirror->vdev;
 206    uint64_t data;
 207
 208    /* Read and discard in case the hardware cares */
 209    (void)vfio_region_read(&vdev->bars[mirror->bar].region,
 210                           addr + mirror->offset, size);
 211
 212    data = vfio_pci_read_config(&vdev->pdev, addr, size);
 213    trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
 214                                         memory_region_name(mirror->mem),
 215                                         addr, data);
 216    return data;
 217}
 218
 219static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
 220                                            uint64_t data, unsigned size)
 221{
 222    VFIOConfigMirrorQuirk *mirror = opaque;
 223    VFIOPCIDevice *vdev = mirror->vdev;
 224
 225    vfio_pci_write_config(&vdev->pdev, addr, data, size);
 226    trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
 227                                          memory_region_name(mirror->mem),
 228                                          addr, data);
 229}
 230
 231static const MemoryRegionOps vfio_generic_mirror_quirk = {
 232    .read = vfio_generic_quirk_mirror_read,
 233    .write = vfio_generic_quirk_mirror_write,
 234    .endianness = DEVICE_LITTLE_ENDIAN,
 235};
 236
 237/* Is range1 fully contained within range2?  */
 238static bool vfio_range_contained(uint64_t first1, uint64_t len1,
 239                                 uint64_t first2, uint64_t len2) {
 240    return (first1 >= first2 && first1 + len1 <= first2 + len2);
 241}
 242
 243#define PCI_VENDOR_ID_ATI               0x1002
 244
 245/*
 246 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 247 * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
 248 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 249 * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
 250 * I/O port BAR address.  Originally this was coded to return the virtual BAR
 251 * address only if the physical register read returns the actual BAR address,
 252 * but users have reported greater success if we return the virtual address
 253 * unconditionally.
 254 */
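/*
 * For example (hypothetical values): if the emulated BAR4 is programmed to
 * I/O port 0xc000, a guest byte read of VGA register 0x3c3 returns 0xc0,
 * the upper byte of the virtual PCI_BASE_ADDRESS_4 value, regardless of
 * what the physical register would have returned.
 */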
 255static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
 256                                        hwaddr addr, unsigned size)
 257{
 258    VFIOPCIDevice *vdev = opaque;
 259    uint64_t data = vfio_pci_read_config(&vdev->pdev,
 260                                         PCI_BASE_ADDRESS_4 + 1, size);
 261
 262    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
 263
 264    return data;
 265}
 266
 267static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr,
 268                                        uint64_t data, unsigned size)
 269{
 270    qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__);
 271}
 272
 273static const MemoryRegionOps vfio_ati_3c3_quirk = {
 274    .read = vfio_ati_3c3_quirk_read,
 275    .write = vfio_ati_3c3_quirk_write,
 276    .endianness = DEVICE_LITTLE_ENDIAN,
 277};
 278
 279VFIOQuirk *vfio_quirk_alloc(int nr_mem)
 280{
 281    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
 282    QLIST_INIT(&quirk->ioeventfds);
 283    quirk->mem = g_new0(MemoryRegion, nr_mem);
 284    quirk->nr_mem = nr_mem;
 285
 286    return quirk;
 287}
 288
 289static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
 290{
 291    QLIST_REMOVE(ioeventfd, next);
 292    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 293                              true, ioeventfd->data, &ioeventfd->e);
 294
 295    if (ioeventfd->vfio) {
 296        struct vfio_device_ioeventfd vfio_ioeventfd;
 297
 298        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 299        vfio_ioeventfd.flags = ioeventfd->size;
 300        vfio_ioeventfd.data = ioeventfd->data;
 301        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 302                                ioeventfd->region_addr;
 303        vfio_ioeventfd.fd = -1;
 304
 305        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
 306            error_report("Failed to remove vfio ioeventfd for %s+0x%"
 307                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
 308                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
 309                         ioeventfd->size, ioeventfd->data);
 310        }
 311    } else {
 312        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 313                            NULL, NULL, NULL);
 314    }
 315
 316    event_notifier_cleanup(&ioeventfd->e);
 317    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
 318                              (uint64_t)ioeventfd->addr, ioeventfd->size,
 319                              ioeventfd->data);
 320    g_free(ioeventfd);
 321}
 322
 323static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 324{
 325    VFIOIOEventFD *ioeventfd, *tmp;
 326
 327    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
 328        if (ioeventfd->dynamic) {
 329            vfio_ioeventfd_exit(vdev, ioeventfd);
 330        }
 331    }
 332}
 333
 334static void vfio_ioeventfd_handler(void *opaque)
 335{
 336    VFIOIOEventFD *ioeventfd = opaque;
 337
 338    if (event_notifier_test_and_clear(&ioeventfd->e)) {
 339        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
 340                          ioeventfd->data, ioeventfd->size);
 341        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
 342                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
 343                                     ioeventfd->data);
 344    }
 345}
 346
 347static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
 348                                          MemoryRegion *mr, hwaddr addr,
 349                                          unsigned size, uint64_t data,
 350                                          VFIORegion *region,
 351                                          hwaddr region_addr, bool dynamic)
 352{
 353    VFIOIOEventFD *ioeventfd;
 354
 355    if (vdev->no_kvm_ioeventfd) {
 356        return NULL;
 357    }
 358
 359    ioeventfd = g_malloc0(sizeof(*ioeventfd));
 360
 361    if (event_notifier_init(&ioeventfd->e, 0)) {
 362        g_free(ioeventfd);
 363        return NULL;
 364    }
 365
 366    /*
 367     * MemoryRegion and relative offset, plus additional ioeventfd setup
 368     * parameters for configuring and later tearing down KVM ioeventfd.
 369     */
 370    ioeventfd->mr = mr;
 371    ioeventfd->addr = addr;
 372    ioeventfd->size = size;
 373    ioeventfd->data = data;
 374    ioeventfd->dynamic = dynamic;
 375    /*
 376     * VFIORegion and relative offset for implementing the userspace
 377     * handler.  data & size fields shared for both uses.
 378     */
 379    ioeventfd->region = region;
 380    ioeventfd->region_addr = region_addr;
 381
 382    if (!vdev->no_vfio_ioeventfd) {
 383        struct vfio_device_ioeventfd vfio_ioeventfd;
 384
 385        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 386        vfio_ioeventfd.flags = ioeventfd->size;
 387        vfio_ioeventfd.data = ioeventfd->data;
 388        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 389                                ioeventfd->region_addr;
 390        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
 391
 392        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
 393                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
 394    }
 395
 396    if (!ioeventfd->vfio) {
 397        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 398                            vfio_ioeventfd_handler, NULL, ioeventfd);
 399    }
 400
 401    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 402                              true, ioeventfd->data, &ioeventfd->e);
 403    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
 404                              size, data, ioeventfd->vfio);
 405
 406    return ioeventfd;
 407}
 408
 409static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
 410{
 411    VFIOQuirk *quirk;
 412
 413    /*
 414     * As long as the BAR is >= 256 bytes it will be aligned such that the
 415     * lower byte is always zero.  Filter out anything else, if it exists.
 416     */
 417    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 418        !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
 419        return;
 420    }
 421
 422    quirk = vfio_quirk_alloc(1);
 423
 424    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
 425                          "vfio-ati-3c3-quirk", 1);
 426    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 427                                3 /* offset 3 bytes from 0x3c0 */, quirk->mem);
 428
 429    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 430                      quirk, next);
 431
 432    trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
 433}
 434
 435/*
 436 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
 437 * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
 438 * the MMIO space directly, but a window to this space is provided through
 439 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 440 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 441 * PCI configuration space is available.  Experimentation seems to indicate
 442 * that read-only may be provided by hardware.
 443 */
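/*
 * Hypothetical example: writing 0x4004 to BAR4 offset 0x0 and then reading
 * BAR4 offset 0x4 returns emulated config space offset 0x4; see the generic
 * window quirk sketch above for the full pattern.
 */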
 444static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 445{
 446    VFIOQuirk *quirk;
 447    VFIOConfigWindowQuirk *window;
 448
  449    /* This window doesn't seem to be used except by legacy VGA code */
 450    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 451        !vdev->vga || nr != 4) {
 452        return;
 453    }
 454
 455    quirk = vfio_quirk_alloc(2);
 456    window = quirk->data = g_malloc0(sizeof(*window) +
 457                                     sizeof(VFIOConfigWindowMatch));
 458    window->vdev = vdev;
 459    window->address_offset = 0;
 460    window->data_offset = 4;
 461    window->nr_matches = 1;
 462    window->matches[0].match = 0x4000;
 463    window->matches[0].mask = vdev->config_size - 1;
 464    window->bar = nr;
 465    window->addr_mem = &quirk->mem[0];
 466    window->data_mem = &quirk->mem[1];
 467
 468    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 469                          &vfio_generic_window_address_quirk, window,
 470                          "vfio-ati-bar4-window-address-quirk", 4);
 471    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 472                                        window->address_offset,
 473                                        window->addr_mem, 1);
 474
 475    memory_region_init_io(window->data_mem, OBJECT(vdev),
 476                          &vfio_generic_window_data_quirk, window,
 477                          "vfio-ati-bar4-window-data-quirk", 4);
 478    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 479                                        window->data_offset,
 480                                        window->data_mem, 1);
 481
 482    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 483
 484    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
 485}
 486
 487/*
 488 * Trap the BAR2 MMIO mirror to config space as well.
 489 */
 490static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
 491{
 492    VFIOQuirk *quirk;
 493    VFIOConfigMirrorQuirk *mirror;
 494
 495    /* Only enable on newer devices where BAR2 is 64bit */
 496    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 497        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
 498        return;
 499    }
 500
 501    quirk = vfio_quirk_alloc(1);
 502    mirror = quirk->data = g_malloc0(sizeof(*mirror));
 503    mirror->mem = quirk->mem;
 504    mirror->vdev = vdev;
 505    mirror->offset = 0x4000;
 506    mirror->bar = nr;
 507
 508    memory_region_init_io(mirror->mem, OBJECT(vdev),
 509                          &vfio_generic_mirror_quirk, mirror,
 510                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
 511    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 512                                        mirror->offset, mirror->mem, 1);
 513
 514    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 515
 516    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
 517}
 518
 519/*
 520 * Older ATI/AMD cards like the X550 have a similar window to that above.
 521 * I/O port BAR1 provides a window to a mirror of PCI config space located
 522 * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
 523 * note it for future reference.
 524 */
 525
 526/*
  527 * Nvidia has several different methods to get to config space; the
  528 * nouveau project has several of these documented here:
 529 * https://github.com/pathscale/envytools/tree/master/hwdocs
 530 *
 531 * The first quirk is actually not documented in envytools and is found
 532 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 533 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 534 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 535 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 536 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 537 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 538 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
  539 * that use the I/O port BAR5 window, but it doesn't hurt to leave it.
 540 */
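/*
 * Illustrative (hypothetical) guest sequence driving the state machine below
 * to read config space offset 0x4 through the 0x1800 BAR0 mirror:
 *
 *   outw(0x3d4, 0x338)   NONE   -> SELECT
 *   outl(0x3d0, 0x1804)  SELECT -> WINDOW  (offset latched)
 *   outw(0x3d4, 0x538)   WINDOW -> READ
 *   inl(0x3d0)           returns emulated config space @0x4
 *
 * Any access that doesn't match the expected sequence drops the state back
 * to NONE and passes through to the VGA region as usual.
 */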
 541typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
 542static const char *nv3d0_states[] = { "NONE", "SELECT",
 543                                      "WINDOW", "READ", "WRITE" };
 544
 545typedef struct VFIONvidia3d0Quirk {
 546    VFIOPCIDevice *vdev;
 547    VFIONvidia3d0State state;
 548    uint32_t offset;
 549} VFIONvidia3d0Quirk;
 550
 551static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
 552                                           hwaddr addr, unsigned size)
 553{
 554    VFIONvidia3d0Quirk *quirk = opaque;
 555    VFIOPCIDevice *vdev = quirk->vdev;
 556
 557    quirk->state = NONE;
 558
 559    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 560                         addr + 0x14, size);
 561}
 562
 563static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
 564                                        uint64_t data, unsigned size)
 565{
 566    VFIONvidia3d0Quirk *quirk = opaque;
 567    VFIOPCIDevice *vdev = quirk->vdev;
 568    VFIONvidia3d0State old_state = quirk->state;
 569
 570    quirk->state = NONE;
 571
 572    switch (data) {
 573    case 0x338:
 574        if (old_state == NONE) {
 575            quirk->state = SELECT;
 576            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 577                                              nv3d0_states[quirk->state]);
 578        }
 579        break;
 580    case 0x538:
 581        if (old_state == WINDOW) {
 582            quirk->state = READ;
 583            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 584                                              nv3d0_states[quirk->state]);
 585        }
 586        break;
 587    case 0x738:
 588        if (old_state == WINDOW) {
 589            quirk->state = WRITE;
 590            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 591                                              nv3d0_states[quirk->state]);
 592        }
 593        break;
 594    }
 595
 596    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 597                   addr + 0x14, data, size);
 598}
 599
 600static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
 601    .read = vfio_nvidia_3d4_quirk_read,
 602    .write = vfio_nvidia_3d4_quirk_write,
 603    .endianness = DEVICE_LITTLE_ENDIAN,
 604};
 605
 606static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
 607                                           hwaddr addr, unsigned size)
 608{
 609    VFIONvidia3d0Quirk *quirk = opaque;
 610    VFIOPCIDevice *vdev = quirk->vdev;
 611    VFIONvidia3d0State old_state = quirk->state;
 612    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 613                                  addr + 0x10, size);
 614
 615    quirk->state = NONE;
 616
 617    if (old_state == READ &&
 618        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 619        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 620
 621        data = vfio_pci_read_config(&vdev->pdev, offset, size);
 622        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
 623                                         offset, size, data);
 624    }
 625
 626    return data;
 627}
 628
 629static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
 630                                        uint64_t data, unsigned size)
 631{
 632    VFIONvidia3d0Quirk *quirk = opaque;
 633    VFIOPCIDevice *vdev = quirk->vdev;
 634    VFIONvidia3d0State old_state = quirk->state;
 635
 636    quirk->state = NONE;
 637
 638    if (old_state == SELECT) {
 639        quirk->offset = (uint32_t)data;
 640        quirk->state = WINDOW;
 641        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 642                                          nv3d0_states[quirk->state]);
 643    } else if (old_state == WRITE) {
 644        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 645            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 646
 647            vfio_pci_write_config(&vdev->pdev, offset, data, size);
 648            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
 649                                              offset, data, size);
 650            return;
 651        }
 652    }
 653
 654    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 655                   addr + 0x10, data, size);
 656}
 657
 658static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
 659    .read = vfio_nvidia_3d0_quirk_read,
 660    .write = vfio_nvidia_3d0_quirk_write,
 661    .endianness = DEVICE_LITTLE_ENDIAN,
 662};
 663
 664static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
 665{
 666    VFIOQuirk *quirk;
 667    VFIONvidia3d0Quirk *data;
 668
 669    if (vdev->no_geforce_quirks ||
 670        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 671        !vdev->bars[1].region.size) {
 672        return;
 673    }
 674
 675    quirk = vfio_quirk_alloc(2);
 676    quirk->data = data = g_malloc0(sizeof(*data));
 677    data->vdev = vdev;
 678
 679    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
 680                          data, "vfio-nvidia-3d4-quirk", 2);
 681    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 682                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);
 683
 684    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
 685                          data, "vfio-nvidia-3d0-quirk", 2);
 686    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 687                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);
 688
 689    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 690                      quirk, next);
 691
 692    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
 693}
 694
 695/*
 696 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 697 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 698 * again BAR0.  This backdoor is apparently a bit newer than the one above
  699 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
  700 * space, including extended space, which is available at the 4k @0x88000.
 701 */
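/*
 * Illustrative (hypothetical) sequence, assuming bit 0 of both the master
 * (BAR5+0x0) and enable (BAR5+0x4) registers is set so the window is active:
 *
 *   write 0x88004 to BAR5+0x8  -> matches 0x88000, config offset 0x4 latched
 *   read  BAR5+0xc             -> returns emulated config space @0x4
 *
 * With either gate bit clear, the address/data quirk regions are disabled
 * and accesses pass straight through to hardware.
 */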
 702typedef struct VFIONvidiaBAR5Quirk {
 703    uint32_t master;
 704    uint32_t enable;
 705    MemoryRegion *addr_mem;
 706    MemoryRegion *data_mem;
 707    bool enabled;
 708    VFIOConfigWindowQuirk window; /* last for match data */
 709} VFIONvidiaBAR5Quirk;
 710
 711static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
 712{
 713    VFIOPCIDevice *vdev = bar5->window.vdev;
 714
 715    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
 716        return;
 717    }
 718
 719    bar5->enabled = !bar5->enabled;
 720    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
 721                                       bar5->enabled ?  "Enable" : "Disable");
 722    memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
 723    memory_region_set_enabled(bar5->data_mem, bar5->enabled);
 724}
 725
 726static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
 727                                                   hwaddr addr, unsigned size)
 728{
 729    VFIONvidiaBAR5Quirk *bar5 = opaque;
 730    VFIOPCIDevice *vdev = bar5->window.vdev;
 731
 732    return vfio_region_read(&vdev->bars[5].region, addr, size);
 733}
 734
 735static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
 736                                                uint64_t data, unsigned size)
 737{
 738    VFIONvidiaBAR5Quirk *bar5 = opaque;
 739    VFIOPCIDevice *vdev = bar5->window.vdev;
 740
 741    vfio_region_write(&vdev->bars[5].region, addr, data, size);
 742
 743    bar5->master = data;
 744    vfio_nvidia_bar5_enable(bar5);
 745}
 746
 747static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
 748    .read = vfio_nvidia_bar5_quirk_master_read,
 749    .write = vfio_nvidia_bar5_quirk_master_write,
 750    .endianness = DEVICE_LITTLE_ENDIAN,
 751};
 752
 753static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
 754                                                   hwaddr addr, unsigned size)
 755{
 756    VFIONvidiaBAR5Quirk *bar5 = opaque;
 757    VFIOPCIDevice *vdev = bar5->window.vdev;
 758
 759    return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
 760}
 761
 762static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
 763                                                uint64_t data, unsigned size)
 764{
 765    VFIONvidiaBAR5Quirk *bar5 = opaque;
 766    VFIOPCIDevice *vdev = bar5->window.vdev;
 767
 768    vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
 769
 770    bar5->enable = data;
 771    vfio_nvidia_bar5_enable(bar5);
 772}
 773
 774static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
 775    .read = vfio_nvidia_bar5_quirk_enable_read,
 776    .write = vfio_nvidia_bar5_quirk_enable_write,
 777    .endianness = DEVICE_LITTLE_ENDIAN,
 778};
 779
 780static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
 781{
 782    VFIOQuirk *quirk;
 783    VFIONvidiaBAR5Quirk *bar5;
 784    VFIOConfigWindowQuirk *window;
 785
 786    if (vdev->no_geforce_quirks ||
 787        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 788        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
 789        return;
 790    }
 791
 792    quirk = vfio_quirk_alloc(4);
 793    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
 794                                   (sizeof(VFIOConfigWindowMatch) * 2));
 795    window = &bar5->window;
 796
 797    window->vdev = vdev;
 798    window->address_offset = 0x8;
 799    window->data_offset = 0xc;
 800    window->nr_matches = 2;
 801    window->matches[0].match = 0x1800;
 802    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
 803    window->matches[1].match = 0x88000;
 804    window->matches[1].mask = vdev->config_size - 1;
 805    window->bar = nr;
 806    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
 807    window->data_mem = bar5->data_mem = &quirk->mem[1];
 808
 809    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 810                          &vfio_generic_window_address_quirk, window,
 811                          "vfio-nvidia-bar5-window-address-quirk", 4);
 812    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 813                                        window->address_offset,
 814                                        window->addr_mem, 1);
 815    memory_region_set_enabled(window->addr_mem, false);
 816
 817    memory_region_init_io(window->data_mem, OBJECT(vdev),
 818                          &vfio_generic_window_data_quirk, window,
 819                          "vfio-nvidia-bar5-window-data-quirk", 4);
 820    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 821                                        window->data_offset,
 822                                        window->data_mem, 1);
 823    memory_region_set_enabled(window->data_mem, false);
 824
 825    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
 826                          &vfio_nvidia_bar5_quirk_master, bar5,
 827                          "vfio-nvidia-bar5-master-quirk", 4);
 828    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 829                                        0, &quirk->mem[2], 1);
 830
 831    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
 832                          &vfio_nvidia_bar5_quirk_enable, bar5,
 833                          "vfio-nvidia-bar5-enable-quirk", 4);
 834    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 835                                        4, &quirk->mem[3], 1);
 836
 837    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 838
 839    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
 840}
 841
 842typedef struct LastDataSet {
 843    VFIOQuirk *quirk;
 844    hwaddr addr;
 845    uint64_t data;
 846    unsigned size;
 847    int hits;
 848    int added;
 849} LastDataSet;
 850
 851#define MAX_DYN_IOEVENTFD 10
 852#define HITS_FOR_IOEVENTFD 10
 853
 854/*
 855 * Finally, BAR0 itself.  We want to redirect any accesses to either
 856 * 0x1800 or 0x88000 through the PCI config space access functions.
 857 */
 858static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
 859                                           uint64_t data, unsigned size)
 860{
 861    VFIOConfigMirrorQuirk *mirror = opaque;
 862    VFIOPCIDevice *vdev = mirror->vdev;
 863    PCIDevice *pdev = &vdev->pdev;
 864    LastDataSet *last = (LastDataSet *)&mirror->data;
 865
 866    vfio_generic_quirk_mirror_write(opaque, addr, data, size);
 867
 868    /*
 869     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
 870     * MSI capability ID register.  Both the ID and next register are
 871     * read-only, so we allow writes covering either of those to real hw.
 872     */
 873    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
 874        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
 875        vfio_region_write(&vdev->bars[mirror->bar].region,
 876                          addr + mirror->offset, data, size);
 877        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
 878    }
 879
 880    /*
 881     * Automatically add an ioeventfd to handle any repeated write with the
 882     * same data and size above the standard PCI config space header.  This is
 883     * primarily expected to accelerate the MSI-ACK behavior, such as noted
 884     * above.  Current hardware/drivers should trigger an ioeventfd at config
 885     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
 886     *
 887     * The criteria of 10 successive hits is arbitrary but reliably adds the
 888     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
 889     * the remaining ones have a greater chance of being seen successively.
 890     * To avoid the pathological case of burning up all of QEMU's open file
  891 * handles, arbitrarily limit this algorithm to adding no more than 10
  892 * ioeventfds, print a warning if we would have added an 11th, and then
 893     * stop counting.
 894     */
 895    if (!vdev->no_kvm_ioeventfd &&
 896        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
 897        if (addr != last->addr || data != last->data || size != last->size) {
 898            last->addr = addr;
 899            last->data = data;
 900            last->size = size;
 901            last->hits = 1;
 902        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
 903            if (last->added < MAX_DYN_IOEVENTFD) {
 904                VFIOIOEventFD *ioeventfd;
 905                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
 906                                        data, &vdev->bars[mirror->bar].region,
 907                                        mirror->offset + addr, true);
 908                if (ioeventfd) {
 909                    VFIOQuirk *quirk = last->quirk;
 910
 911                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
 912                    last->added++;
 913                }
 914            } else {
 915                last->added++;
 916                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
 917                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
 918                            "size %u", vdev->vbasedev.name, addr, data, size);
 919            }
 920        }
 921    }
 922}
 923
 924static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
 925    .read = vfio_generic_quirk_mirror_read,
 926    .write = vfio_nvidia_quirk_mirror_write,
 927    .endianness = DEVICE_LITTLE_ENDIAN,
 928};
 929
 930static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 931{
 932    VFIOConfigMirrorQuirk *mirror = quirk->data;
 933    LastDataSet *last = (LastDataSet *)&mirror->data;
 934
 935    last->addr = last->data = last->size = last->hits = last->added = 0;
 936
 937    vfio_drop_dynamic_eventfds(vdev, quirk);
 938}
 939
 940static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
 941{
 942    VFIOQuirk *quirk;
 943    VFIOConfigMirrorQuirk *mirror;
 944    LastDataSet *last;
 945
 946    if (vdev->no_geforce_quirks ||
 947        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 948        !vfio_is_vga(vdev) || nr != 0) {
 949        return;
 950    }
 951
 952    quirk = vfio_quirk_alloc(1);
 953    quirk->reset = vfio_nvidia_bar0_quirk_reset;
 954    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 955    mirror->mem = quirk->mem;
 956    mirror->vdev = vdev;
 957    mirror->offset = 0x88000;
 958    mirror->bar = nr;
 959    last = (LastDataSet *)&mirror->data;
 960    last->quirk = quirk;
 961
 962    memory_region_init_io(mirror->mem, OBJECT(vdev),
 963                          &vfio_nvidia_mirror_quirk, mirror,
 964                          "vfio-nvidia-bar0-88000-mirror-quirk",
 965                          vdev->config_size);
 966    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 967                                        mirror->offset, mirror->mem, 1);
 968
 969    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 970
 971    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
 972    if (vdev->vga) {
 973        quirk = vfio_quirk_alloc(1);
 974        quirk->reset = vfio_nvidia_bar0_quirk_reset;
 975        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 976        mirror->mem = quirk->mem;
 977        mirror->vdev = vdev;
 978        mirror->offset = 0x1800;
 979        mirror->bar = nr;
 980        last = (LastDataSet *)&mirror->data;
 981        last->quirk = quirk;
 982
 983        memory_region_init_io(mirror->mem, OBJECT(vdev),
 984                              &vfio_nvidia_mirror_quirk, mirror,
 985                              "vfio-nvidia-bar0-1800-mirror-quirk",
 986                              PCI_CONFIG_SPACE_SIZE);
 987        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 988                                            mirror->offset, mirror->mem, 1);
 989
 990        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 991    }
 992
 993    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
 994}
 995
 996/*
 997 * TODO - Some Nvidia devices provide config access to their companion HDA
 998 * device and even to their parent bridge via these config space mirrors.
 999 * Add quirks for those regions.
1000 */
1001
1002#define PCI_VENDOR_ID_REALTEK 0x10ec
1003
1004/*
1005 * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
1006 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
1007 * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1008 * when the "type" portion of the address register is set to 0x1.  This appears
1009 * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1010 * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1011 * ignore because the MSI-X table should always be accessed as a dword (full
 1012 * mask).  Bits 0:11 are the offset within the type.
1013 *
1014 * Example trace:
1015 *
1016 * Read from MSI-X table offset 0
1017 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1018 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1019 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1020 *
1021 * Write 0xfee00000 to MSI-X table offset 0
1022 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1023 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1024 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1025 */
1026typedef struct VFIOrtl8168Quirk {
1027    VFIOPCIDevice *vdev;
1028    uint32_t addr;
1029    uint32_t data;
1030    bool enabled;
1031} VFIOrtl8168Quirk;
1032
1033static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
1034                                                hwaddr addr, unsigned size)
1035{
1036    VFIOrtl8168Quirk *rtl = opaque;
1037    VFIOPCIDevice *vdev = rtl->vdev;
1038    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
1039
1040    if (rtl->enabled) {
1041        data = rtl->addr ^ 0x80000000U; /* latch/complete */
1042        trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
1043    }
1044
1045    return data;
1046}
1047
1048static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
1049                                             uint64_t data, unsigned size)
1050{
1051    VFIOrtl8168Quirk *rtl = opaque;
1052    VFIOPCIDevice *vdev = rtl->vdev;
1053
1054    rtl->enabled = false;
1055
1056    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
1057        rtl->enabled = true;
1058        rtl->addr = (uint32_t)data;
1059
1060        if (data & 0x80000000U) { /* Do write */
1061            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
1062                hwaddr offset = data & 0xfff;
1063                uint64_t val = rtl->data;
1064
1065                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
1066                                                    (uint16_t)offset, val);
1067
1068                /* Write to the proper guest MSI-X table instead */
1069                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
1070                                             offset, val,
1071                                             size_memop(size) | MO_LE,
1072                                             MEMTXATTRS_UNSPECIFIED);
1073            }
1074            return; /* Do not write guest MSI-X data to hardware */
1075        }
1076    }
1077
1078    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
1079}
1080
1081static const MemoryRegionOps vfio_rtl_address_quirk = {
1082    .read = vfio_rtl8168_quirk_address_read,
1083    .write = vfio_rtl8168_quirk_address_write,
1084    .valid = {
1085        .min_access_size = 4,
1086        .max_access_size = 4,
1087        .unaligned = false,
1088    },
1089    .endianness = DEVICE_LITTLE_ENDIAN,
1090};
1091
1092static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
1093                                             hwaddr addr, unsigned size)
1094{
1095    VFIOrtl8168Quirk *rtl = opaque;
1096    VFIOPCIDevice *vdev = rtl->vdev;
1097    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);
1098
1099    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1100        hwaddr offset = rtl->addr & 0xfff;
1101        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
1102                                    &data, size_memop(size) | MO_LE,
1103                                    MEMTXATTRS_UNSPECIFIED);
1104        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
1105    }
1106
1107    return data;
1108}
1109
1110static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
1111                                          uint64_t data, unsigned size)
1112{
1113    VFIOrtl8168Quirk *rtl = opaque;
1114    VFIOPCIDevice *vdev = rtl->vdev;
1115
1116    rtl->data = (uint32_t)data;
1117
1118    vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
1119}
1120
1121static const MemoryRegionOps vfio_rtl_data_quirk = {
1122    .read = vfio_rtl8168_quirk_data_read,
1123    .write = vfio_rtl8168_quirk_data_write,
1124    .valid = {
1125        .min_access_size = 4,
1126        .max_access_size = 4,
1127        .unaligned = false,
1128    },
1129    .endianness = DEVICE_LITTLE_ENDIAN,
1130};
1131
1132static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
1133{
1134    VFIOQuirk *quirk;
1135    VFIOrtl8168Quirk *rtl;
1136
1137    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
1138        return;
1139    }
1140
1141    quirk = vfio_quirk_alloc(2);
1142    quirk->data = rtl = g_malloc0(sizeof(*rtl));
1143    rtl->vdev = vdev;
1144
1145    memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
1146                          &vfio_rtl_address_quirk, rtl,
1147                          "vfio-rtl8168-window-address-quirk", 4);
1148    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1149                                        0x74, &quirk->mem[0], 1);
1150
1151    memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
1152                          &vfio_rtl_data_quirk, rtl,
1153                          "vfio-rtl8168-window-data-quirk", 4);
1154    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1155                                        0x70, &quirk->mem[1], 1);
1156
1157    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1158
1159    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
1160}
1161
1162#define IGD_ASLS 0xfc /* ASL Storage Register */
1163
1164/*
1165 * The OpRegion includes the Video BIOS Table, which seems important for
1166 * telling the driver what sort of outputs it has.  Without this, the device
1167 * may work in the guest, but we may not get output.  This also requires BIOS
1168 * support to reserve and populate a section of guest memory sufficient for
1169 * the table and to write the base address of that memory to the ASLS register
1170 * of the IGD device.
1171 */
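/*
 * Rough outline of the expected flow (guest firmware behavior is outside
 * this file and may vary):
 *
 *   1. QEMU copies the host OpRegion and exposes it via fw_cfg as
 *      "etc/igd-opregion" (below).
 *   2. Guest firmware allocates reserved memory below 4G, copies the fw_cfg
 *      blob into it, and writes the guest physical address to the IGD ASLS
 *      register at config offset 0xfc.
 *   3. The guest driver then finds the OpRegion through ASLS as it would on
 *      bare metal.
 */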
1172int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
1173                               struct vfio_region_info *info, Error **errp)
1174{
1175    int ret;
1176
1177    vdev->igd_opregion = g_malloc0(info->size);
1178    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
1179                info->size, info->offset);
1180    if (ret != info->size) {
1181        error_setg(errp, "failed to read IGD OpRegion");
1182        g_free(vdev->igd_opregion);
1183        vdev->igd_opregion = NULL;
1184        return -EINVAL;
1185    }
1186
1187    /*
1188     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
1189     * allocate 32bit reserved memory for, copy these contents into, and write
1190     * the reserved memory base address to the device ASLS register at 0xFC.
1191     * Alignment of this reserved region seems flexible, but using a 4k page
1192     * alignment seems to work well.  This interface assumes a single IGD
1193     * device, which may be at VM address 00:02.0 in legacy mode or another
1194     * address in UPT mode.
1195     *
1196     * NB, there may be future use cases discovered where the VM should have
1197     * direct interaction with the host OpRegion, in which case the write to
1198     * the ASLS register would trigger MemoryRegion setup to enable that.
1199     */
1200    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
1201                    vdev->igd_opregion, info->size);
1202
1203    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
1204
1205    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
1206    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
1207    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
1208
1209    return 0;
1210}
1211
1212/*
1213 * Common quirk probe entry points.
1214 */
1215void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1216{
1217    vfio_vga_probe_ati_3c3_quirk(vdev);
1218    vfio_vga_probe_nvidia_3d0_quirk(vdev);
1219}
1220
1221void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1222{
1223    VFIOQuirk *quirk;
1224    int i, j;
1225
1226    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1227        QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1228            for (j = 0; j < quirk->nr_mem; j++) {
1229                memory_region_del_subregion(&vdev->vga->region[i].mem,
1230                                            &quirk->mem[j]);
1231            }
1232        }
1233    }
1234}
1235
1236void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1237{
1238    int i, j;
1239
1240    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1241        while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1242            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1243            QLIST_REMOVE(quirk, next);
1244            for (j = 0; j < quirk->nr_mem; j++) {
1245                object_unparent(OBJECT(&quirk->mem[j]));
1246            }
1247            g_free(quirk->mem);
1248            g_free(quirk->data);
1249            g_free(quirk);
1250        }
1251    }
1252}
1253
1254void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1255{
1256    vfio_probe_ati_bar4_quirk(vdev, nr);
1257    vfio_probe_ati_bar2_quirk(vdev, nr);
1258    vfio_probe_nvidia_bar5_quirk(vdev, nr);
1259    vfio_probe_nvidia_bar0_quirk(vdev, nr);
1260    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
1261#ifdef CONFIG_VFIO_IGD
1262    vfio_probe_igd_bar4_quirk(vdev, nr);
1263#endif
1264}
1265
1266void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1267{
1268    VFIOBAR *bar = &vdev->bars[nr];
1269    VFIOQuirk *quirk;
1270    int i;
1271
1272    QLIST_FOREACH(quirk, &bar->quirks, next) {
1273        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
1274            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
1275        }
1276
1277        for (i = 0; i < quirk->nr_mem; i++) {
1278            memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1279        }
1280    }
1281}
1282
1283void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1284{
1285    VFIOBAR *bar = &vdev->bars[nr];
1286    int i;
1287
1288    while (!QLIST_EMPTY(&bar->quirks)) {
1289        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1290        QLIST_REMOVE(quirk, next);
1291        for (i = 0; i < quirk->nr_mem; i++) {
1292            object_unparent(OBJECT(&quirk->mem[i]));
1293        }
1294        g_free(quirk->mem);
1295        g_free(quirk->data);
1296        g_free(quirk);
1297    }
1298}
1299
1300/*
1301 * Reset quirks
1302 */
1303void vfio_quirk_reset(VFIOPCIDevice *vdev)
1304{
1305    int i;
1306
1307    for (i = 0; i < PCI_ROM_SLOT; i++) {
1308        VFIOQuirk *quirk;
1309        VFIOBAR *bar = &vdev->bars[i];
1310
1311        QLIST_FOREACH(quirk, &bar->quirks, next) {
1312            if (quirk->reset) {
1313                quirk->reset(vdev, quirk);
1314            }
1315        }
1316    }
1317}
1318
1319/*
1320 * AMD Radeon PCI config reset, based on Linux:
1321 *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
1322 *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
1323 *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
1324 *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
1325 * IDs: include/drm/drm_pciids.h
1326 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
1327 *
1328 * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
1329 * hardware that should be fixed on future ASICs.  The symptom of this is that
 1330 * once the accelerated driver loads, Windows guests will BSOD on subsequent
 1331 * attempts to load the driver, such as after VM reset or shutdown/restart.  To
1332 * work around this, we do an AMD specific PCI config reset, followed by an SMC
1333 * reset.  The PCI config reset only works if SMC firmware is running, so we
1334 * have a dependency on the state of the device as to whether this reset will
1335 * be effective.  There are still cases where we won't be able to kick the
1336 * device into working, but this greatly improves the usability overall.  The
1337 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
1338 * poking is largely ASIC specific.
1339 */
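/*
 * In outline, the reset below does the following (register offsets are the
 * ones used by the code that follows, not a normative AMD sequence):
 *
 *   1. Enable memory BAR access only via PCI_COMMAND.
 *   2. Check that the SMC is running through the BAR5 0x200/0x204
 *      index/data pair.
 *   3. Flip the misc bit if necessary so the reset scope is GFX only.
 *   4. Write the magic value 0x39d5e86b to config offset 0x7c and poll the
 *      memory size register until it reads back something other than ~0.
 *   5. Reset the SMC and stop its clock.
 *   6. Restore the PCI command register.
 */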
1340static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
1341{
1342    uint32_t clk, pc_c;
1343
1344    /*
1345     * Registers 200h and 204h are index and data registers for accessing
1346     * indirect configuration registers within the device.
1347     */
1348    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1349    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1350    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
1351    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1352
1353    return (!(clk & 1) && (0x20100 <= pc_c));
1354}
1355
1356/*
1357 * The scope of a config reset is controlled by a mode bit in the misc register
1358 * and a fuse, exposed as a bit in another register.  The fuse is the default
1359 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
1360 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
1361 * the fuse.  A truth table therefore tells us that if misc == fuse, we need
1362 * to flip the value of the bit in the misc register.
1363 */
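/*
 * The full truth table, derived from scope = !(misc ^ fuse) with
 * 0 = GFX and 1 = whole GPU:
 *
 *   misc  fuse | scope       action
 *   -----------+---------------------------
 *    0     0   | 1 (GPU)     flip misc -> GFX only
 *    0     1   | 0 (GFX)     leave as is
 *    1     0   | 0 (GFX)     leave as is
 *    1     1   | 1 (GPU)     flip misc -> GFX only
 */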
1364static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
1365{
1366    uint32_t misc, fuse;
1367    bool a, b;
1368
1369    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
1370    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1371    b = fuse & 64;
1372
1373    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
1374    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1375    a = misc & 2;
1376
1377    if (a == b) {
1378        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
1379        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
1380    }
1381}
1382
1383static int vfio_radeon_reset(VFIOPCIDevice *vdev)
1384{
1385    PCIDevice *pdev = &vdev->pdev;
1386    int i, ret = 0;
1387    uint32_t data;
1388
1389    /* Defer to a kernel implemented reset */
1390    if (vdev->vbasedev.reset_works) {
1391        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
1392        return -ENODEV;
1393    }
1394
1395    /* Enable only memory BAR access */
1396    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
1397
1398    /* Reset only works if SMC firmware is loaded and running */
1399    if (!vfio_radeon_smc_is_running(vdev)) {
1400        ret = -EINVAL;
1401        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
1402        goto out;
1403    }
1404
1405    /* Make sure only the GFX function is reset */
1406    vfio_radeon_set_gfx_only_reset(vdev);
1407
1408    /* AMD PCI config reset */
1409    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
1410    usleep(100);
1411
1412    /* Read back the memory size to make sure we're out of reset */
1413    for (i = 0; i < 100000; i++) {
1414        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
1415            goto reset_smc;
1416        }
1417        usleep(1);
1418    }
1419
1420    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);
1421
1422reset_smc:
1423    /* Reset SMC */
1424    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
1425    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1426    data |= 1;
1427    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1428
1429    /* Disable SMC clock */
1430    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1431    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1432    data |= 1;
1433    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1434
1435    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);
1436
1437out:
1438    /* Restore PCI command register */
1439    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
1440
1441    return ret;
1442}
1443
1444void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
1445{
1446    switch (vdev->vendor_id) {
1447    case 0x1002:
1448        switch (vdev->device_id) {
1449        /* Bonaire */
1450        case 0x6649: /* Bonaire [FirePro W5100] */
1451        case 0x6650:
1452        case 0x6651:
1453        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
1454        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
1455        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
1456        /* Hawaii */
1457        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
1458        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
1459        case 0x67A2:
1460        case 0x67A8:
1461        case 0x67A9:
1462        case 0x67AA:
1463        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
1464        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
1465        case 0x67B8:
1466        case 0x67B9:
1467        case 0x67BA:
1468        case 0x67BE:
1469            vdev->resetfn = vfio_radeon_reset;
1470            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
1471            break;
1472        }
1473        break;
1474    }
1475}
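
/*
 * The resetfn hook installed above is consulted by the common vfio-pci reset
 * path in hw/vfio/pci.c; paraphrased (not quoted), the call site amounts to:
 *
 *     if (vdev->resetfn && !vdev->resetfn(vdev)) {
 *         goto post_reset;
 *     }
 *
 * so a zero return from vfio_radeon_reset() short-circuits the generic reset
 * handling, while a negative return lets it proceed as usual.
 */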
1476
1477/*
1478 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
1479 * devices as members of a clique.  Devices within the same clique ID are
1480 * capable of direct P2P.  It is the user's responsibility to ensure that
1481 * the clique assignment is correct.  The spec allows the capability to sit
1482 * at any unused config offset, but reserves offset C8h and recommends that
1483 * hypervisors place it there.  It also states that the hypervisor should
1484 * place the capability at the end of the capability list, so next is 0h.
1485 *
1486 * +----------------+----------------+----------------+----------------+
1487 * | sig 7:0 ('P')  |  vndr len (8h) |    next (0h)   |   cap id (9h)  |
1488 * +----------------+----------------+----------------+----------------+
1489 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
1490 * +---------------------------------+---------------------------------+
1491 *
1492 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
1493 */
1494static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1495                                       const char *name, void *opaque,
1496                                       Error **errp)
1497{
1498    Property *prop = opaque;
1499    uint8_t *ptr = object_field_prop_ptr(obj, prop);
1500
1501    visit_type_uint8(v, name, ptr, errp);
1502}
1503
1504static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1505                                       const char *name, void *opaque,
1506                                       Error **errp)
1507{
1508    Property *prop = opaque;
1509    uint8_t value, *ptr = object_field_prop_ptr(obj, prop);
1510
1511    if (!visit_type_uint8(v, name, &value, errp)) {
1512        return;
1513    }
1514
1515    if (value & ~0xF) {
1516        error_setg(errp, "Property %s: valid range 0-15", name);
1517        return;
1518    }
1519
1520    *ptr = value;
1521}
1522
1523const PropertyInfo qdev_prop_nv_gpudirect_clique = {
1524    .name = "uint4",
1525    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
1526    .get = get_nv_gpudirect_clique_id,
1527    .set = set_nv_gpudirect_clique_id,
1528};
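
/*
 * This PropertyInfo backs the vfio-pci option that hw/vfio/pci.c registers for
 * nv_gpudirect_clique (conventionally "x-nv-gpudirect-clique"), so a typical
 * use looks roughly like:
 *
 *     -device vfio-pci,host=0000:05:00.0,x-nv-gpudirect-clique=0
 *
 * The option name and host address above are illustrative; only values 0-15
 * are accepted by the setter, anything else is rejected with an error.
 */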
1529
1530static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
1531{
1532    PCIDevice *pdev = &vdev->pdev;
1533    int ret, pos = 0xC8;
1534
1535    if (vdev->nv_gpudirect_clique == 0xFF) {
1536        return 0;
1537    }
1538
1539    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
1540        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
1541        return -EINVAL;
1542    }
1543
1544    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
1545        PCI_BASE_CLASS_DISPLAY) {
1546        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
1547        return -EINVAL;
1548    }
1549
1550    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
1551    if (ret < 0) {
1552        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
1553        return ret;
1554    }
1555
1556    memset(vdev->emulated_config_bits + pos, 0xFF, 8);
1557    pos += PCI_CAP_FLAGS;
1558    pci_set_byte(pdev->config + pos++, 8);
1559    pci_set_byte(pdev->config + pos++, 'P');
1560    pci_set_byte(pdev->config + pos++, '2');
1561    pci_set_byte(pdev->config + pos++, 'P');
1562    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
1563    pci_set_byte(pdev->config + pos, 0);
1564
1565    return 0;
1566}
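
/*
 * Worked example of the bytes emitted at C8h by the function above for, say,
 * clique ID 5 (the clique value is only an example; the rest follows directly
 * from the code):
 *
 *     0xC8: 0x09       vendor-specific capability ID
 *     0xC9: next ptr   filled in by pci_add_capability when linking the list
 *     0xCA: 0x08       vendor length
 *     0xCB: 0x50  'P'
 *     0xCC: 0x32  '2'
 *     0xCD: 0x50  'P'
 *     0xCE: 0x28       clique 5 << 3, version 0 in bits 2:0
 *     0xCF: 0x00       reserved
 */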
1567
1568static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
1569                                     const char *name,
1570                                     void *opaque, Error **errp)
1571{
1572    uint64_t tgt = (uintptr_t) opaque;
1573    visit_type_uint64(v, name, &tgt, errp);
1574}
1575
1576static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
1577                                                 const char *name,
1578                                                 void *opaque, Error **errp)
1579{
1580    uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
1581    visit_type_uint32(v, name, &link_speed, errp);
1582}
1583
1584int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
1585{
1586    int ret;
1587    void *p;
1588    struct vfio_region_info *nv2reg = NULL;
1589    struct vfio_info_cap_header *hdr;
1590    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
1591    VFIOQuirk *quirk;
1592
1593    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1594                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1595                                   PCI_VENDOR_ID_NVIDIA,
1596                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
1597                                   &nv2reg);
1598    if (ret) {
1599        return ret;
1600    }
1601
1602    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1603    if (!hdr) {
1604        ret = -ENODEV;
1605        goto free_exit;
1606    }
1607    cap = (void *) hdr;
1608
1609    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE,
1610             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
1611    if (p == MAP_FAILED) {
1612        ret = -errno;
1613        goto free_exit;
1614    }
1615
1616    quirk = vfio_quirk_alloc(1);
1617    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
1618                               nv2reg->size, p);
1619    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1620
1621    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1622                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
1623                        (void *) (uintptr_t) cap->tgt);
1624    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
1625                                          nv2reg->size);
1626free_exit:
1627    g_free(nv2reg);
1628
1629    return ret;
1630}
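
/*
 * A sketch of how the "nvlink2-tgt" property added above is meant to be
 * consumed: board code (the ppc/spapr NVLink2 support) can read it back via
 * QOM, roughly:
 *
 *     uint64_t tgt = object_property_get_uint(OBJECT(vdev), "nvlink2-tgt",
 *                                             &error_abort);
 *
 * and use the value to place the GPU RAM window in the guest address map.
 * The snippet illustrates the QOM accessor only; it is not a quote from the
 * consumer in hw/ppc/spapr_pci_nvlink2.c.
 */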
1631
1632int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
1633{
1634    int ret;
1635    void *p;
1636    struct vfio_region_info *atsdreg = NULL;
1637    struct vfio_info_cap_header *hdr;
1638    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
1639    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
1640    VFIOQuirk *quirk;
1641
1642    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1643                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1644                                   PCI_VENDOR_ID_IBM,
1645                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
1646                                   &atsdreg);
1647    if (ret) {
1648        return ret;
1649    }
1650
1651    hdr = vfio_get_region_info_cap(atsdreg,
1652                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1653    if (!hdr) {
1654        ret = -ENODEV;
1655        goto free_exit;
1656    }
1657    captgt = (void *) hdr;
1658
1659    hdr = vfio_get_region_info_cap(atsdreg,
1660                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
1661    if (!hdr) {
1662        ret = -ENODEV;
1663        goto free_exit;
1664    }
1665    capspeed = (void *) hdr;
1666
1667    /* Some NVLink bridges may not have an ATSD region assigned */
1668    if (atsdreg->size) {
1669        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE,
1670                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
1671        if (p == MAP_FAILED) {
1672            ret = -errno;
1673            goto free_exit;
1674        }
1675
1676        quirk = vfio_quirk_alloc(1);
1677        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
1678                                          "nvlink2-atsd-mr", atsdreg->size, p);
1679        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1680    }
1681
1682    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1683                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
1684                        (void *) (uintptr_t) captgt->tgt);
1685    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
1686                                              atsdreg->size);
1687
1688    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
1689                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
1690                        (void *) (uintptr_t) capspeed->link_speed);
1691    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
1692                                              capspeed->link_speed);
1693free_exit:
1694    g_free(atsdreg);
1695
1696    return ret;
1697}
1698
1699/*
1700 * The VMD endpoint provides a real PCIe domain to the guest and the guest
1701 * kernel performs enumeration of the VMD sub-device domain. Guest transactions
1702 * to VMD sub-devices go through MMU translation from guest addresses to
1703 * physical addresses. When MMIO goes to an endpoint after being translated to
1704 * physical addresses, the bridge rejects the transaction because the window
1705 * has been programmed with guest addresses.
1706 *
1707 * VMD can use the Host Physical Address in order to correctly program the
1708 * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow registers
1709 * located at offset 0x2000 in MEMBAR2 (BAR 4). This quirk provides the HPA
1710 * shadow registers in a vendor-specific capability register for devices
1711 * without native support. The position of 0xE8-0xFF is in the reserved range
1712 * of the VMD device capability space following the Power Management
1713 * Capability.
1714 */
1715#define VMD_SHADOW_CAP_VER 1
1716#define VMD_SHADOW_CAP_LEN 24
1717static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp)
1718{
1719    uint8_t membar_phys[16];
1720    int ret, pos = 0xE8;
1721
1722    if (!(vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x201D) ||
1723          vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) ||
1724          vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) ||
1725          vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) {
1726        return 0;
1727    }
1728
1729    ret = pread(vdev->vbasedev.fd, membar_phys, 16,
1730                vdev->config_offset + PCI_BASE_ADDRESS_2);
1731    if (ret != 16) {
1732        error_report("VMD %s cannot read MEMBARs (%d)",
1733                     vdev->vbasedev.name, ret);
1734        return -EFAULT;
1735    }
1736
1737    ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos,
1738                             VMD_SHADOW_CAP_LEN, errp);
1739    if (ret < 0) {
1740        error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: ");
1741        return ret;
1742    }
1743
1744    memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN);
1745    pos += PCI_CAP_FLAGS;
1746    pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN);
1747    pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER);
1748    pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */
1749    memcpy(vdev->pdev.config + pos + 4, membar_phys, 16);
1750
1751    return 0;
1752}
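
/*
 * Resulting layout of the emulated capability at E8h-FFh (values follow from
 * the code above; the next pointer is whatever pci_add_capability links in):
 *
 *     0xE8: 0x09             vendor-specific capability ID
 *     0xE9: next ptr
 *     0xEA: 0x18             length (VMD_SHADOW_CAP_LEN, 24 bytes)
 *     0xEB: 0x01             version (VMD_SHADOW_CAP_VER)
 *     0xEC: 0x53484457       "SHDW" signature, stored as a little-endian dword
 *     0xF0: membar_phys[16]  raw copy of the host's BAR2-BAR5 config values,
 *                            i.e. the host-physical MEMBAR addresses
 */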
1753
1754int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
1755{
1756    int ret;
1757
1758    ret = vfio_add_nv_gpudirect_cap(vdev, errp);
1759    if (ret) {
1760        return ret;
1761    }
1762
1763    ret = vfio_add_vmd_shadow_cap(vdev, errp);
1764    if (ret) {
1765        return ret;
1766    }
1767
1768    return 0;
1769}
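
/*
 * Both capability quirks above are wired in from the vfio-pci realize path in
 * hw/vfio/pci.c, which is expected to call vfio_add_virt_caps() once the
 * virtual config space and capability chain have been set up; a non-zero
 * return propagates the error in errp and fails device realize.
 */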
1770