qemu/hw/vfio/pci-quirks.c
   1/*
   2 * device quirks for PCI devices
   3 *
   4 * Copyright Red Hat, Inc. 2012-2015
   5 *
   6 * Authors:
   7 *  Alex Williamson <alex.williamson@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include CONFIG_DEVICES
  15#include "exec/memop.h"
  16#include "qemu/units.h"
  17#include "qemu/log.h"
  18#include "qemu/error-report.h"
  19#include "qemu/main-loop.h"
  20#include "qemu/module.h"
  21#include "qemu/range.h"
  22#include "qapi/error.h"
  23#include "qapi/visitor.h"
  24#include <sys/ioctl.h>
  25#include "hw/hw.h"
  26#include "hw/nvram/fw_cfg.h"
  27#include "hw/qdev-properties.h"
  28#include "pci.h"
  29#include "trace.h"
  30
  31/*
  32 * List of device ids/vendor ids for which to disable
   33 * option ROM loading. This avoids guest hangs during ROM
   34 * execution, as seen with the BCM 57810 card, for lack of a
   35 * better way to handle such issues.
   36 * The user can still override by specifying a romfile or
  37 * rombar=1.
  38 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
  39 * for an analysis of the 57810 card hang. When adding
  40 * a new vendor id/device id combination below, please also add
  41 * your card/environment details and information that could
  42 * help in debugging to the bug tracking this issue
  43 */
  44static const struct {
  45    uint32_t vendor;
  46    uint32_t device;
  47} rom_denylist[] = {
  48    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
  49};
  50
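/*
 * Returns true if the device's vendor/device ID pair appears in the
 * rom_denylist table above.
 */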
  51bool vfio_opt_rom_in_denylist(VFIOPCIDevice *vdev)
  52{
  53    int i;
  54
  55    for (i = 0 ; i < ARRAY_SIZE(rom_denylist); i++) {
  56        if (vfio_pci_is(vdev, rom_denylist[i].vendor, rom_denylist[i].device)) {
  57            trace_vfio_quirk_rom_in_denylist(vdev->vbasedev.name,
  58                                             rom_denylist[i].vendor,
  59                                             rom_denylist[i].device);
  60            return true;
  61        }
  62    }
  63    return false;
  64}
  65
  66/*
  67 * Device specific region quirks (mostly backdoors to PCI config space)
  68 */
  69
  70/*
  71 * The generic window quirks operate on an address and data register,
  72 * vfio_generic_window_address_quirk handles the address register and
  73 * vfio_generic_window_data_quirk handles the data register.  These ops
  74 * pass reads and writes through to hardware until a value matching the
  75 * stored address match/mask is written.  When this occurs, the data
   76 * register accesses emulated PCI config space for the device rather than
   77 * passing accesses through.  This enables devices where PCI config space
  78 * is accessible behind a window register to maintain the virtualization
  79 * provided through vfio.
  80 */
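/*
 * A hypothetical guest access through such a window therefore looks like:
 *
 *   write <match | config offset> to the address register
 *   read/write the data register  ->  served from emulated config space
 *
 * Writing an address that matches none of the match/mask entries leaves
 * the data register passing straight through to hardware.
 */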
  81typedef struct VFIOConfigWindowMatch {
  82    uint32_t match;
  83    uint32_t mask;
  84} VFIOConfigWindowMatch;
  85
  86typedef struct VFIOConfigWindowQuirk {
  87    struct VFIOPCIDevice *vdev;
  88
  89    uint32_t address_val;
  90
  91    uint32_t address_offset;
  92    uint32_t data_offset;
  93
  94    bool window_enabled;
  95    uint8_t bar;
  96
  97    MemoryRegion *addr_mem;
  98    MemoryRegion *data_mem;
  99
 100    uint32_t nr_matches;
 101    VFIOConfigWindowMatch matches[];
 102} VFIOConfigWindowQuirk;
 103
 104static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
 105                                                       hwaddr addr,
 106                                                       unsigned size)
 107{
 108    VFIOConfigWindowQuirk *window = opaque;
 109    VFIOPCIDevice *vdev = window->vdev;
 110
 111    return vfio_region_read(&vdev->bars[window->bar].region,
 112                            addr + window->address_offset, size);
 113}
 114
 115static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
 116                                                    uint64_t data,
 117                                                    unsigned size)
 118{
 119    VFIOConfigWindowQuirk *window = opaque;
 120    VFIOPCIDevice *vdev = window->vdev;
 121    int i;
 122
 123    window->window_enabled = false;
 124
 125    vfio_region_write(&vdev->bars[window->bar].region,
 126                      addr + window->address_offset, data, size);
 127
 128    for (i = 0; i < window->nr_matches; i++) {
 129        if ((data & ~window->matches[i].mask) == window->matches[i].match) {
 130            window->window_enabled = true;
 131            window->address_val = data & window->matches[i].mask;
 132            trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
 133                                    memory_region_name(window->addr_mem), data);
 134            break;
 135        }
 136    }
 137}
 138
 139static const MemoryRegionOps vfio_generic_window_address_quirk = {
 140    .read = vfio_generic_window_quirk_address_read,
 141    .write = vfio_generic_window_quirk_address_write,
 142    .endianness = DEVICE_LITTLE_ENDIAN,
 143};
 144
 145static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
 146                                                    hwaddr addr, unsigned size)
 147{
 148    VFIOConfigWindowQuirk *window = opaque;
 149    VFIOPCIDevice *vdev = window->vdev;
 150    uint64_t data;
 151
 152    /* Always read data reg, discard if window enabled */
 153    data = vfio_region_read(&vdev->bars[window->bar].region,
 154                            addr + window->data_offset, size);
 155
 156    if (window->window_enabled) {
 157        data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
 158        trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
 159                                    memory_region_name(window->data_mem), data);
 160    }
 161
 162    return data;
 163}
 164
 165static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
 166                                                 uint64_t data, unsigned size)
 167{
 168    VFIOConfigWindowQuirk *window = opaque;
 169    VFIOPCIDevice *vdev = window->vdev;
 170
 171    if (window->window_enabled) {
 172        vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
 173        trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
 174                                    memory_region_name(window->data_mem), data);
 175        return;
 176    }
 177
 178    vfio_region_write(&vdev->bars[window->bar].region,
 179                      addr + window->data_offset, data, size);
 180}
 181
 182static const MemoryRegionOps vfio_generic_window_data_quirk = {
 183    .read = vfio_generic_window_quirk_data_read,
 184    .write = vfio_generic_window_quirk_data_write,
 185    .endianness = DEVICE_LITTLE_ENDIAN,
 186};
 187
 188/*
 189 * The generic mirror quirk handles devices which expose PCI config space
 190 * through a region within a BAR.  When enabled, reads and writes are
 191 * redirected through to emulated PCI config space.  XXX if PCI config space
 192 * used memory regions, this could just be an alias.
 193 */
 194typedef struct VFIOConfigMirrorQuirk {
 195    struct VFIOPCIDevice *vdev;
 196    uint32_t offset;
 197    uint8_t bar;
 198    MemoryRegion *mem;
 199    uint8_t data[];
 200} VFIOConfigMirrorQuirk;
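/*
 * The trailing data[] area holds optional quirk-specific state; for
 * instance, the NVIDIA BAR0 mirror below stores a LastDataSet there.
 */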
 201
 202static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
 203                                               hwaddr addr, unsigned size)
 204{
 205    VFIOConfigMirrorQuirk *mirror = opaque;
 206    VFIOPCIDevice *vdev = mirror->vdev;
 207    uint64_t data;
 208
 209    /* Read and discard in case the hardware cares */
 210    (void)vfio_region_read(&vdev->bars[mirror->bar].region,
 211                           addr + mirror->offset, size);
 212
 213    data = vfio_pci_read_config(&vdev->pdev, addr, size);
 214    trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
 215                                         memory_region_name(mirror->mem),
 216                                         addr, data);
 217    return data;
 218}
 219
 220static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
 221                                            uint64_t data, unsigned size)
 222{
 223    VFIOConfigMirrorQuirk *mirror = opaque;
 224    VFIOPCIDevice *vdev = mirror->vdev;
 225
 226    vfio_pci_write_config(&vdev->pdev, addr, data, size);
 227    trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
 228                                          memory_region_name(mirror->mem),
 229                                          addr, data);
 230}
 231
 232static const MemoryRegionOps vfio_generic_mirror_quirk = {
 233    .read = vfio_generic_quirk_mirror_read,
 234    .write = vfio_generic_quirk_mirror_write,
 235    .endianness = DEVICE_LITTLE_ENDIAN,
 236};
 237
 238/* Is range1 fully contained within range2?  */
 239static bool vfio_range_contained(uint64_t first1, uint64_t len1,
 240                                 uint64_t first2, uint64_t len2) {
 241    return (first1 >= first2 && first1 + len1 <= first2 + len2);
 242}
 243
 244#define PCI_VENDOR_ID_ATI               0x1002
 245
 246/*
 247 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 248 * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
 249 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 250 * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
 251 * I/O port BAR address.  Originally this was coded to return the virtual BAR
 252 * address only if the physical register read returns the actual BAR address,
 253 * but users have reported greater success if we return the virtual address
 254 * unconditionally.
 255 */
 256static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
 257                                        hwaddr addr, unsigned size)
 258{
 259    VFIOPCIDevice *vdev = opaque;
 260    uint64_t data = vfio_pci_read_config(&vdev->pdev,
 261                                         PCI_BASE_ADDRESS_4 + 1, size);
 262
 263    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
 264
 265    return data;
 266}
 267
 268static void vfio_ati_3c3_quirk_write(void *opaque, hwaddr addr,
 269                                        uint64_t data, unsigned size)
 270{
 271    qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid access\n", __func__);
 272}
 273
 274static const MemoryRegionOps vfio_ati_3c3_quirk = {
 275    .read = vfio_ati_3c3_quirk_read,
 276    .write = vfio_ati_3c3_quirk_write,
 277    .endianness = DEVICE_LITTLE_ENDIAN,
 278};
 279
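/*
 * Allocate a VFIOQuirk with nr_mem zero-initialized MemoryRegions and an
 * empty ioeventfd list; callers fill in quirk->mem[] and quirk->data.
 */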
 280VFIOQuirk *vfio_quirk_alloc(int nr_mem)
 281{
 282    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
 283    QLIST_INIT(&quirk->ioeventfds);
 284    quirk->mem = g_new0(MemoryRegion, nr_mem);
 285    quirk->nr_mem = nr_mem;
 286
 287    return quirk;
 288}
 289
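/*
 * Tear down an ioeventfd: unlink it, detach the eventfd from the
 * MemoryRegion, disable any in-kernel vfio acceleration (fd = -1), or
 * remove the userspace handler if the vfio ioctl was never used, then
 * clean up the notifier and free the structure.
 */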
 290static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
 291{
 292    QLIST_REMOVE(ioeventfd, next);
 293    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 294                              true, ioeventfd->data, &ioeventfd->e);
 295
 296    if (ioeventfd->vfio) {
 297        struct vfio_device_ioeventfd vfio_ioeventfd;
 298
 299        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 300        vfio_ioeventfd.flags = ioeventfd->size;
 301        vfio_ioeventfd.data = ioeventfd->data;
 302        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 303                                ioeventfd->region_addr;
 304        vfio_ioeventfd.fd = -1;
 305
 306        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
 307            error_report("Failed to remove vfio ioeventfd for %s+0x%"
 308                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
 309                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
 310                         ioeventfd->size, ioeventfd->data);
 311        }
 312    } else {
 313        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 314                            NULL, NULL, NULL);
 315    }
 316
 317    event_notifier_cleanup(&ioeventfd->e);
 318    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
 319                              (uint64_t)ioeventfd->addr, ioeventfd->size,
 320                              ioeventfd->data);
 321    g_free(ioeventfd);
 322}
 323
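/*
 * Drop only the ioeventfds flagged as dynamically added (see the NVIDIA
 * BAR0 mirror quirk below), leaving any others in place.
 */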
 324static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 325{
 326    VFIOIOEventFD *ioeventfd, *tmp;
 327
 328    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
 329        if (ioeventfd->dynamic) {
 330            vfio_ioeventfd_exit(vdev, ioeventfd);
 331        }
 332    }
 333}
 334
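/*
 * Userspace fallback path: when the eventfd fires, replay the matched
 * write to the underlying device region.
 */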
 335static void vfio_ioeventfd_handler(void *opaque)
 336{
 337    VFIOIOEventFD *ioeventfd = opaque;
 338
 339    if (event_notifier_test_and_clear(&ioeventfd->e)) {
 340        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
 341                          ioeventfd->data, ioeventfd->size);
 342        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
 343                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
 344                                     ioeventfd->data);
 345    }
 346}
 347
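/*
 * Set up an ioeventfd matching writes of @data/@size at @addr within @mr.
 * Prefer the VFIO_DEVICE_IOEVENTFD ioctl so the write is serviced entirely
 * in the kernel; otherwise fall back to a userspace handler that replays
 * the write through @region at @region_addr.  Returns NULL when KVM
 * ioeventfds are disabled for the device or notifier setup fails.
 */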
 348static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
 349                                          MemoryRegion *mr, hwaddr addr,
 350                                          unsigned size, uint64_t data,
 351                                          VFIORegion *region,
 352                                          hwaddr region_addr, bool dynamic)
 353{
 354    VFIOIOEventFD *ioeventfd;
 355
 356    if (vdev->no_kvm_ioeventfd) {
 357        return NULL;
 358    }
 359
 360    ioeventfd = g_malloc0(sizeof(*ioeventfd));
 361
 362    if (event_notifier_init(&ioeventfd->e, 0)) {
 363        g_free(ioeventfd);
 364        return NULL;
 365    }
 366
 367    /*
 368     * MemoryRegion and relative offset, plus additional ioeventfd setup
 369     * parameters for configuring and later tearing down KVM ioeventfd.
 370     */
 371    ioeventfd->mr = mr;
 372    ioeventfd->addr = addr;
 373    ioeventfd->size = size;
 374    ioeventfd->data = data;
 375    ioeventfd->dynamic = dynamic;
 376    /*
 377     * VFIORegion and relative offset for implementing the userspace
 378     * handler.  data & size fields shared for both uses.
 379     */
 380    ioeventfd->region = region;
 381    ioeventfd->region_addr = region_addr;
 382
 383    if (!vdev->no_vfio_ioeventfd) {
 384        struct vfio_device_ioeventfd vfio_ioeventfd;
 385
 386        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 387        vfio_ioeventfd.flags = ioeventfd->size;
 388        vfio_ioeventfd.data = ioeventfd->data;
 389        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 390                                ioeventfd->region_addr;
 391        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
 392
 393        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
 394                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
 395    }
 396
 397    if (!ioeventfd->vfio) {
 398        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 399                            vfio_ioeventfd_handler, NULL, ioeventfd);
 400    }
 401
 402    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 403                              true, ioeventfd->data, &ioeventfd->e);
 404    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
 405                              size, data, ioeventfd->vfio);
 406
 407    return ioeventfd;
 408}
 409
 410static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
 411{
 412    VFIOQuirk *quirk;
 413
 414    /*
 415     * As long as the BAR is >= 256 bytes it will be aligned such that the
 416     * lower byte is always zero.  Filter out anything else, if it exists.
 417     */
 418    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 419        !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
 420        return;
 421    }
 422
 423    quirk = vfio_quirk_alloc(1);
 424
 425    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
 426                          "vfio-ati-3c3-quirk", 1);
 427    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 428                                3 /* offset 3 bytes from 0x3c0 */, quirk->mem);
 429
 430    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 431                      quirk, next);
 432
 433    trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
 434}
 435
 436/*
 437 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
 438 * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
 439 * the MMIO space directly, but a window to this space is provided through
 440 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 441 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 442 * PCI configuration space is available.  Experimentation seems to indicate
  443 * that the hardware may provide only read-only access.
 444 */
 445static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 446{
 447    VFIOQuirk *quirk;
 448    VFIOConfigWindowQuirk *window;
 449
  450    /* This window doesn't seem to be used except by legacy VGA code */
 451    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 452        !vdev->vga || nr != 4) {
 453        return;
 454    }
 455
 456    quirk = vfio_quirk_alloc(2);
 457    window = quirk->data = g_malloc0(sizeof(*window) +
 458                                     sizeof(VFIOConfigWindowMatch));
 459    window->vdev = vdev;
 460    window->address_offset = 0;
 461    window->data_offset = 4;
 462    window->nr_matches = 1;
 463    window->matches[0].match = 0x4000;
 464    window->matches[0].mask = vdev->config_size - 1;
 465    window->bar = nr;
 466    window->addr_mem = &quirk->mem[0];
 467    window->data_mem = &quirk->mem[1];
 468
 469    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 470                          &vfio_generic_window_address_quirk, window,
 471                          "vfio-ati-bar4-window-address-quirk", 4);
 472    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 473                                        window->address_offset,
 474                                        window->addr_mem, 1);
 475
 476    memory_region_init_io(window->data_mem, OBJECT(vdev),
 477                          &vfio_generic_window_data_quirk, window,
 478                          "vfio-ati-bar4-window-data-quirk", 4);
 479    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 480                                        window->data_offset,
 481                                        window->data_mem, 1);
 482
 483    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 484
 485    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
 486}
 487
 488/*
 489 * Trap the BAR2 MMIO mirror to config space as well.
 490 */
 491static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
 492{
 493    VFIOQuirk *quirk;
 494    VFIOConfigMirrorQuirk *mirror;
 495
 496    /* Only enable on newer devices where BAR2 is 64bit */
 497    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 498        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
 499        return;
 500    }
 501
 502    quirk = vfio_quirk_alloc(1);
 503    mirror = quirk->data = g_malloc0(sizeof(*mirror));
 504    mirror->mem = quirk->mem;
 505    mirror->vdev = vdev;
 506    mirror->offset = 0x4000;
 507    mirror->bar = nr;
 508
 509    memory_region_init_io(mirror->mem, OBJECT(vdev),
 510                          &vfio_generic_mirror_quirk, mirror,
 511                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
 512    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 513                                        mirror->offset, mirror->mem, 1);
 514
 515    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 516
 517    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
 518}
 519
 520/*
 521 * Older ATI/AMD cards like the X550 have a similar window to that above.
 522 * I/O port BAR1 provides a window to a mirror of PCI config space located
 523 * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
 524 * note it for future reference.
 525 */
 526
 527/*
 528 * Nvidia has several different methods to get to config space, the
  529 * nouveau project has several of these documented here:
 530 * https://github.com/pathscale/envytools/tree/master/hwdocs
 531 *
 532 * The first quirk is actually not documented in envytools and is found
 533 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 534 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 535 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 536 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 537 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 538 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 539 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
 540 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 541 */
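/*
 * A hypothetical read of config offset 0x04 through this backdoor would
 * therefore be:
 *
 *   write 0x338  to 0x3d4     NONE   -> SELECT
 *   write 0x1804 to 0x3d0     SELECT -> WINDOW (0x1800 + offset)
 *   write 0x538  to 0x3d4     WINDOW -> READ
 *   read from 0x3d0           returns emulated config data, back to NONE
 */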
 542typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
 543static const char *nv3d0_states[] = { "NONE", "SELECT",
 544                                      "WINDOW", "READ", "WRITE" };
 545
 546typedef struct VFIONvidia3d0Quirk {
 547    VFIOPCIDevice *vdev;
 548    VFIONvidia3d0State state;
 549    uint32_t offset;
 550} VFIONvidia3d0Quirk;
 551
 552static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
 553                                           hwaddr addr, unsigned size)
 554{
 555    VFIONvidia3d0Quirk *quirk = opaque;
 556    VFIOPCIDevice *vdev = quirk->vdev;
 557
 558    quirk->state = NONE;
 559
 560    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 561                         addr + 0x14, size);
 562}
 563
 564static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
 565                                        uint64_t data, unsigned size)
 566{
 567    VFIONvidia3d0Quirk *quirk = opaque;
 568    VFIOPCIDevice *vdev = quirk->vdev;
 569    VFIONvidia3d0State old_state = quirk->state;
 570
 571    quirk->state = NONE;
 572
 573    switch (data) {
 574    case 0x338:
 575        if (old_state == NONE) {
 576            quirk->state = SELECT;
 577            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 578                                              nv3d0_states[quirk->state]);
 579        }
 580        break;
 581    case 0x538:
 582        if (old_state == WINDOW) {
 583            quirk->state = READ;
 584            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 585                                              nv3d0_states[quirk->state]);
 586        }
 587        break;
 588    case 0x738:
 589        if (old_state == WINDOW) {
 590            quirk->state = WRITE;
 591            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 592                                              nv3d0_states[quirk->state]);
 593        }
 594        break;
 595    }
 596
 597    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 598                   addr + 0x14, data, size);
 599}
 600
 601static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
 602    .read = vfio_nvidia_3d4_quirk_read,
 603    .write = vfio_nvidia_3d4_quirk_write,
 604    .endianness = DEVICE_LITTLE_ENDIAN,
 605};
 606
 607static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
 608                                           hwaddr addr, unsigned size)
 609{
 610    VFIONvidia3d0Quirk *quirk = opaque;
 611    VFIOPCIDevice *vdev = quirk->vdev;
 612    VFIONvidia3d0State old_state = quirk->state;
 613    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 614                                  addr + 0x10, size);
 615
 616    quirk->state = NONE;
 617
 618    if (old_state == READ &&
 619        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 620        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 621
 622        data = vfio_pci_read_config(&vdev->pdev, offset, size);
 623        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
 624                                         offset, size, data);
 625    }
 626
 627    return data;
 628}
 629
 630static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
 631                                        uint64_t data, unsigned size)
 632{
 633    VFIONvidia3d0Quirk *quirk = opaque;
 634    VFIOPCIDevice *vdev = quirk->vdev;
 635    VFIONvidia3d0State old_state = quirk->state;
 636
 637    quirk->state = NONE;
 638
 639    if (old_state == SELECT) {
 640        quirk->offset = (uint32_t)data;
 641        quirk->state = WINDOW;
 642        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 643                                          nv3d0_states[quirk->state]);
 644    } else if (old_state == WRITE) {
 645        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 646            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 647
 648            vfio_pci_write_config(&vdev->pdev, offset, data, size);
 649            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
 650                                              offset, data, size);
 651            return;
 652        }
 653    }
 654
 655    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 656                   addr + 0x10, data, size);
 657}
 658
 659static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
 660    .read = vfio_nvidia_3d0_quirk_read,
 661    .write = vfio_nvidia_3d0_quirk_write,
 662    .endianness = DEVICE_LITTLE_ENDIAN,
 663};
 664
 665static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
 666{
 667    VFIOQuirk *quirk;
 668    VFIONvidia3d0Quirk *data;
 669
 670    if (vdev->no_geforce_quirks ||
 671        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 672        !vdev->bars[1].region.size) {
 673        return;
 674    }
 675
 676    quirk = vfio_quirk_alloc(2);
 677    quirk->data = data = g_malloc0(sizeof(*data));
 678    data->vdev = vdev;
 679
 680    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
 681                          data, "vfio-nvidia-3d4-quirk", 2);
 682    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 683                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);
 684
 685    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
 686                          data, "vfio-nvidia-3d0-quirk", 2);
 687    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 688                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);
 689
 690    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 691                      quirk, next);
 692
 693    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
 694}
 695
 696/*
 697 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 698 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
  699 * again BAR0.  This backdoor is apparently a bit newer than the one above,
  700 * so we need to trap not only the 256 bytes @0x1800, but all of PCI config
  701 * space, including extended space, which is available in the 4k @0x88000.
 702 */
 703typedef struct VFIONvidiaBAR5Quirk {
 704    uint32_t master;
 705    uint32_t enable;
 706    MemoryRegion *addr_mem;
 707    MemoryRegion *data_mem;
 708    bool enabled;
 709    VFIOConfigWindowQuirk window; /* last for match data */
 710} VFIONvidiaBAR5Quirk;
 711
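/*
 * The BAR5 window is only considered active while bit 0 is set in both the
 * master and enable registers; toggle the quirk sub-regions to match.
 */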
 712static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
 713{
 714    VFIOPCIDevice *vdev = bar5->window.vdev;
 715
 716    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
 717        return;
 718    }
 719
 720    bar5->enabled = !bar5->enabled;
 721    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
 722                                       bar5->enabled ?  "Enable" : "Disable");
 723    memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
 724    memory_region_set_enabled(bar5->data_mem, bar5->enabled);
 725}
 726
 727static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
 728                                                   hwaddr addr, unsigned size)
 729{
 730    VFIONvidiaBAR5Quirk *bar5 = opaque;
 731    VFIOPCIDevice *vdev = bar5->window.vdev;
 732
 733    return vfio_region_read(&vdev->bars[5].region, addr, size);
 734}
 735
 736static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
 737                                                uint64_t data, unsigned size)
 738{
 739    VFIONvidiaBAR5Quirk *bar5 = opaque;
 740    VFIOPCIDevice *vdev = bar5->window.vdev;
 741
 742    vfio_region_write(&vdev->bars[5].region, addr, data, size);
 743
 744    bar5->master = data;
 745    vfio_nvidia_bar5_enable(bar5);
 746}
 747
 748static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
 749    .read = vfio_nvidia_bar5_quirk_master_read,
 750    .write = vfio_nvidia_bar5_quirk_master_write,
 751    .endianness = DEVICE_LITTLE_ENDIAN,
 752};
 753
 754static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
 755                                                   hwaddr addr, unsigned size)
 756{
 757    VFIONvidiaBAR5Quirk *bar5 = opaque;
 758    VFIOPCIDevice *vdev = bar5->window.vdev;
 759
 760    return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
 761}
 762
 763static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
 764                                                uint64_t data, unsigned size)
 765{
 766    VFIONvidiaBAR5Quirk *bar5 = opaque;
 767    VFIOPCIDevice *vdev = bar5->window.vdev;
 768
 769    vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
 770
 771    bar5->enable = data;
 772    vfio_nvidia_bar5_enable(bar5);
 773}
 774
 775static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
 776    .read = vfio_nvidia_bar5_quirk_enable_read,
 777    .write = vfio_nvidia_bar5_quirk_enable_write,
 778    .endianness = DEVICE_LITTLE_ENDIAN,
 779};
 780
 781static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
 782{
 783    VFIOQuirk *quirk;
 784    VFIONvidiaBAR5Quirk *bar5;
 785    VFIOConfigWindowQuirk *window;
 786
 787    if (vdev->no_geforce_quirks ||
 788        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 789        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
 790        return;
 791    }
 792
 793    quirk = vfio_quirk_alloc(4);
 794    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
 795                                   (sizeof(VFIOConfigWindowMatch) * 2));
 796    window = &bar5->window;
 797
 798    window->vdev = vdev;
 799    window->address_offset = 0x8;
 800    window->data_offset = 0xc;
 801    window->nr_matches = 2;
 802    window->matches[0].match = 0x1800;
 803    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
 804    window->matches[1].match = 0x88000;
 805    window->matches[1].mask = vdev->config_size - 1;
 806    window->bar = nr;
 807    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
 808    window->data_mem = bar5->data_mem = &quirk->mem[1];
 809
 810    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 811                          &vfio_generic_window_address_quirk, window,
 812                          "vfio-nvidia-bar5-window-address-quirk", 4);
 813    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 814                                        window->address_offset,
 815                                        window->addr_mem, 1);
 816    memory_region_set_enabled(window->addr_mem, false);
 817
 818    memory_region_init_io(window->data_mem, OBJECT(vdev),
 819                          &vfio_generic_window_data_quirk, window,
 820                          "vfio-nvidia-bar5-window-data-quirk", 4);
 821    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 822                                        window->data_offset,
 823                                        window->data_mem, 1);
 824    memory_region_set_enabled(window->data_mem, false);
 825
 826    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
 827                          &vfio_nvidia_bar5_quirk_master, bar5,
 828                          "vfio-nvidia-bar5-master-quirk", 4);
 829    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 830                                        0, &quirk->mem[2], 1);
 831
 832    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
 833                          &vfio_nvidia_bar5_quirk_enable, bar5,
 834                          "vfio-nvidia-bar5-enable-quirk", 4);
 835    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 836                                        4, &quirk->mem[3], 1);
 837
 838    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 839
 840    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
 841}
 842
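/*
 * Track the most recent write seen through the NVIDIA BAR0 mirror so that
 * repeated identical writes can be promoted to a dynamic ioeventfd (see
 * vfio_nvidia_quirk_mirror_write below).
 */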
 843typedef struct LastDataSet {
 844    VFIOQuirk *quirk;
 845    hwaddr addr;
 846    uint64_t data;
 847    unsigned size;
 848    int hits;
 849    int added;
 850} LastDataSet;
 851
 852#define MAX_DYN_IOEVENTFD 10
 853#define HITS_FOR_IOEVENTFD 10
 854
 855/*
 856 * Finally, BAR0 itself.  We want to redirect any accesses to either
 857 * 0x1800 or 0x88000 through the PCI config space access functions.
 858 */
 859static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
 860                                           uint64_t data, unsigned size)
 861{
 862    VFIOConfigMirrorQuirk *mirror = opaque;
 863    VFIOPCIDevice *vdev = mirror->vdev;
 864    PCIDevice *pdev = &vdev->pdev;
 865    LastDataSet *last = (LastDataSet *)&mirror->data;
 866
 867    vfio_generic_quirk_mirror_write(opaque, addr, data, size);
 868
 869    /*
 870     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
 871     * MSI capability ID register.  Both the ID and next register are
 872     * read-only, so we allow writes covering either of those to real hw.
 873     */
 874    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
 875        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
 876        vfio_region_write(&vdev->bars[mirror->bar].region,
 877                          addr + mirror->offset, data, size);
 878        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
 879    }
 880
 881    /*
 882     * Automatically add an ioeventfd to handle any repeated write with the
 883     * same data and size above the standard PCI config space header.  This is
 884     * primarily expected to accelerate the MSI-ACK behavior, such as noted
 885     * above.  Current hardware/drivers should trigger an ioeventfd at config
 886     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
 887     *
  888 * The criterion of 10 successive hits is arbitrary but reliably adds the
 889     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
 890     * the remaining ones have a greater chance of being seen successively.
 891     * To avoid the pathological case of burning up all of QEMU's open file
  892 * handles, arbitrarily limit this algorithm to adding no more than 10
 893     * ioeventfds, print an error if we would have added an 11th, and then
 894     * stop counting.
 895     */
 896    if (!vdev->no_kvm_ioeventfd &&
 897        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
 898        if (addr != last->addr || data != last->data || size != last->size) {
 899            last->addr = addr;
 900            last->data = data;
 901            last->size = size;
 902            last->hits = 1;
 903        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
 904            if (last->added < MAX_DYN_IOEVENTFD) {
 905                VFIOIOEventFD *ioeventfd;
 906                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
 907                                        data, &vdev->bars[mirror->bar].region,
 908                                        mirror->offset + addr, true);
 909                if (ioeventfd) {
 910                    VFIOQuirk *quirk = last->quirk;
 911
 912                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
 913                    last->added++;
 914                }
 915            } else {
 916                last->added++;
 917                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
 918                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
 919                            "size %u", vdev->vbasedev.name, addr, data, size);
 920            }
 921        }
 922    }
 923}
 924
 925static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
 926    .read = vfio_generic_quirk_mirror_read,
 927    .write = vfio_nvidia_quirk_mirror_write,
 928    .endianness = DEVICE_LITTLE_ENDIAN,
 929};
 930
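/*
 * On device reset, clear the write-repetition tracking and drop any
 * dynamically added ioeventfds so detection can start from scratch.
 */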
 931static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 932{
 933    VFIOConfigMirrorQuirk *mirror = quirk->data;
 934    LastDataSet *last = (LastDataSet *)&mirror->data;
 935
 936    last->addr = last->data = last->size = last->hits = last->added = 0;
 937
 938    vfio_drop_dynamic_eventfds(vdev, quirk);
 939}
 940
 941static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
 942{
 943    VFIOQuirk *quirk;
 944    VFIOConfigMirrorQuirk *mirror;
 945    LastDataSet *last;
 946
 947    if (vdev->no_geforce_quirks ||
 948        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 949        !vfio_is_vga(vdev) || nr != 0) {
 950        return;
 951    }
 952
 953    quirk = vfio_quirk_alloc(1);
 954    quirk->reset = vfio_nvidia_bar0_quirk_reset;
 955    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 956    mirror->mem = quirk->mem;
 957    mirror->vdev = vdev;
 958    mirror->offset = 0x88000;
 959    mirror->bar = nr;
 960    last = (LastDataSet *)&mirror->data;
 961    last->quirk = quirk;
 962
 963    memory_region_init_io(mirror->mem, OBJECT(vdev),
 964                          &vfio_nvidia_mirror_quirk, mirror,
 965                          "vfio-nvidia-bar0-88000-mirror-quirk",
 966                          vdev->config_size);
 967    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 968                                        mirror->offset, mirror->mem, 1);
 969
 970    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 971
 972    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
 973    if (vdev->vga) {
 974        quirk = vfio_quirk_alloc(1);
 975        quirk->reset = vfio_nvidia_bar0_quirk_reset;
 976        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 977        mirror->mem = quirk->mem;
 978        mirror->vdev = vdev;
 979        mirror->offset = 0x1800;
 980        mirror->bar = nr;
 981        last = (LastDataSet *)&mirror->data;
 982        last->quirk = quirk;
 983
 984        memory_region_init_io(mirror->mem, OBJECT(vdev),
 985                              &vfio_nvidia_mirror_quirk, mirror,
 986                              "vfio-nvidia-bar0-1800-mirror-quirk",
 987                              PCI_CONFIG_SPACE_SIZE);
 988        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 989                                            mirror->offset, mirror->mem, 1);
 990
 991        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 992    }
 993
 994    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
 995}
 996
 997/*
 998 * TODO - Some Nvidia devices provide config access to their companion HDA
 999 * device and even to their parent bridge via these config space mirrors.
1000 * Add quirks for those regions.
1001 */
1002
1003#define PCI_VENDOR_ID_REALTEK 0x10ec
1004
1005/*
1006 * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
1007 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
1008 * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1009 * when the "type" portion of the address register is set to 0x1.  This appears
1010 * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1011 * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1012 * ignore because the MSI-X table should always be accessed as a dword (full
 1013 * mask).  Bits 0:11 are the offset within the type.
1014 *
1015 * Example trace:
1016 *
1017 * Read from MSI-X table offset 0
1018 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1019 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1020 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1021 *
1022 * Write 0xfee00000 to MSI-X table offset 0
1023 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1024 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1025 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1026 */
1027typedef struct VFIOrtl8168Quirk {
1028    VFIOPCIDevice *vdev;
1029    uint32_t addr;
1030    uint32_t data;
1031    bool enabled;
1032} VFIOrtl8168Quirk;
1033
1034static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
1035                                                hwaddr addr, unsigned size)
1036{
1037    VFIOrtl8168Quirk *rtl = opaque;
1038    VFIOPCIDevice *vdev = rtl->vdev;
1039    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
1040
1041    if (rtl->enabled) {
1042        data = rtl->addr ^ 0x80000000U; /* latch/complete */
1043        trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
1044    }
1045
1046    return data;
1047}
1048
1049static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
1050                                             uint64_t data, unsigned size)
1051{
1052    VFIOrtl8168Quirk *rtl = opaque;
1053    VFIOPCIDevice *vdev = rtl->vdev;
1054
1055    rtl->enabled = false;
1056
1057    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
1058        rtl->enabled = true;
1059        rtl->addr = (uint32_t)data;
1060
1061        if (data & 0x80000000U) { /* Do write */
1062            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
1063                hwaddr offset = data & 0xfff;
1064                uint64_t val = rtl->data;
1065
1066                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
1067                                                    (uint16_t)offset, val);
1068
1069                /* Write to the proper guest MSI-X table instead */
1070                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
1071                                             offset, val,
1072                                             size_memop(size) | MO_LE,
1073                                             MEMTXATTRS_UNSPECIFIED);
1074            }
1075            return; /* Do not write guest MSI-X data to hardware */
1076        }
1077    }
1078
1079    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
1080}
1081
1082static const MemoryRegionOps vfio_rtl_address_quirk = {
1083    .read = vfio_rtl8168_quirk_address_read,
1084    .write = vfio_rtl8168_quirk_address_write,
1085    .valid = {
1086        .min_access_size = 4,
1087        .max_access_size = 4,
1088        .unaligned = false,
1089    },
1090    .endianness = DEVICE_LITTLE_ENDIAN,
1091};
1092
1093static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
1094                                             hwaddr addr, unsigned size)
1095{
1096    VFIOrtl8168Quirk *rtl = opaque;
1097    VFIOPCIDevice *vdev = rtl->vdev;
1098    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);
1099
1100    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1101        hwaddr offset = rtl->addr & 0xfff;
1102        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
1103                                    &data, size_memop(size) | MO_LE,
1104                                    MEMTXATTRS_UNSPECIFIED);
1105        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
1106    }
1107
1108    return data;
1109}
1110
1111static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
1112                                          uint64_t data, unsigned size)
1113{
1114    VFIOrtl8168Quirk *rtl = opaque;
1115    VFIOPCIDevice *vdev = rtl->vdev;
1116
1117    rtl->data = (uint32_t)data;
1118
1119    vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
1120}
1121
1122static const MemoryRegionOps vfio_rtl_data_quirk = {
1123    .read = vfio_rtl8168_quirk_data_read,
1124    .write = vfio_rtl8168_quirk_data_write,
1125    .valid = {
1126        .min_access_size = 4,
1127        .max_access_size = 4,
1128        .unaligned = false,
1129    },
1130    .endianness = DEVICE_LITTLE_ENDIAN,
1131};
1132
1133static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
1134{
1135    VFIOQuirk *quirk;
1136    VFIOrtl8168Quirk *rtl;
1137
1138    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
1139        return;
1140    }
1141
1142    quirk = vfio_quirk_alloc(2);
1143    quirk->data = rtl = g_malloc0(sizeof(*rtl));
1144    rtl->vdev = vdev;
1145
1146    memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
1147                          &vfio_rtl_address_quirk, rtl,
1148                          "vfio-rtl8168-window-address-quirk", 4);
1149    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1150                                        0x74, &quirk->mem[0], 1);
1151
1152    memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
1153                          &vfio_rtl_data_quirk, rtl,
1154                          "vfio-rtl8168-window-data-quirk", 4);
1155    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1156                                        0x70, &quirk->mem[1], 1);
1157
1158    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1159
1160    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
1161}
1162
1163#define IGD_ASLS 0xfc /* ASL Storage Register */
1164
1165/*
1166 * The OpRegion includes the Video BIOS Table, which seems important for
1167 * telling the driver what sort of outputs it has.  Without this, the device
1168 * may work in the guest, but we may not get output.  This also requires BIOS
1169 * support to reserve and populate a section of guest memory sufficient for
1170 * the table and to write the base address of that memory to the ASLS register
1171 * of the IGD device.
1172 */
1173int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
1174                               struct vfio_region_info *info, Error **errp)
1175{
1176    int ret;
1177
1178    vdev->igd_opregion = g_malloc0(info->size);
1179    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
1180                info->size, info->offset);
1181    if (ret != info->size) {
1182        error_setg(errp, "failed to read IGD OpRegion");
1183        g_free(vdev->igd_opregion);
1184        vdev->igd_opregion = NULL;
1185        return -EINVAL;
1186    }
1187
1188    /*
1189     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
1190     * allocate 32bit reserved memory for, copy these contents into, and write
1191     * the reserved memory base address to the device ASLS register at 0xFC.
1192     * Alignment of this reserved region seems flexible, but using a 4k page
1193     * alignment seems to work well.  This interface assumes a single IGD
1194     * device, which may be at VM address 00:02.0 in legacy mode or another
1195     * address in UPT mode.
1196     *
1197     * NB, there may be future use cases discovered where the VM should have
1198     * direct interaction with the host OpRegion, in which case the write to
1199     * the ASLS register would trigger MemoryRegion setup to enable that.
1200     */
1201    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
1202                    vdev->igd_opregion, info->size);
1203
1204    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
1205
1206    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
1207    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
1208    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
1209
1210    return 0;
1211}
1212
1213/*
1214 * Common quirk probe entry points.
1215 */
1216void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1217{
1218    vfio_vga_probe_ati_3c3_quirk(vdev);
1219    vfio_vga_probe_nvidia_3d0_quirk(vdev);
1220}
1221
1222void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1223{
1224    VFIOQuirk *quirk;
1225    int i, j;
1226
1227    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1228        QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1229            for (j = 0; j < quirk->nr_mem; j++) {
1230                memory_region_del_subregion(&vdev->vga->region[i].mem,
1231                                            &quirk->mem[j]);
1232            }
1233        }
1234    }
1235}
1236
1237void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1238{
1239    int i, j;
1240
1241    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1242        while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1243            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1244            QLIST_REMOVE(quirk, next);
1245            for (j = 0; j < quirk->nr_mem; j++) {
1246                object_unparent(OBJECT(&quirk->mem[j]));
1247            }
1248            g_free(quirk->mem);
1249            g_free(quirk->data);
1250            g_free(quirk);
1251        }
1252    }
1253}
1254
1255void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1256{
1257    vfio_probe_ati_bar4_quirk(vdev, nr);
1258    vfio_probe_ati_bar2_quirk(vdev, nr);
1259    vfio_probe_nvidia_bar5_quirk(vdev, nr);
1260    vfio_probe_nvidia_bar0_quirk(vdev, nr);
1261    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
1262#ifdef CONFIG_VFIO_IGD
1263    vfio_probe_igd_bar4_quirk(vdev, nr);
1264#endif
1265}
1266
1267void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1268{
1269    VFIOBAR *bar = &vdev->bars[nr];
1270    VFIOQuirk *quirk;
1271    int i;
1272
1273    QLIST_FOREACH(quirk, &bar->quirks, next) {
1274        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
1275            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
1276        }
1277
1278        for (i = 0; i < quirk->nr_mem; i++) {
1279            memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1280        }
1281    }
1282}
1283
1284void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1285{
1286    VFIOBAR *bar = &vdev->bars[nr];
1287    int i;
1288
1289    while (!QLIST_EMPTY(&bar->quirks)) {
1290        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1291        QLIST_REMOVE(quirk, next);
1292        for (i = 0; i < quirk->nr_mem; i++) {
1293            object_unparent(OBJECT(&quirk->mem[i]));
1294        }
1295        g_free(quirk->mem);
1296        g_free(quirk->data);
1297        g_free(quirk);
1298    }
1299}
1300
1301/*
1302 * Reset quirks
1303 */
1304void vfio_quirk_reset(VFIOPCIDevice *vdev)
1305{
1306    int i;
1307
1308    for (i = 0; i < PCI_ROM_SLOT; i++) {
1309        VFIOQuirk *quirk;
1310        VFIOBAR *bar = &vdev->bars[i];
1311
1312        QLIST_FOREACH(quirk, &bar->quirks, next) {
1313            if (quirk->reset) {
1314                quirk->reset(vdev, quirk);
1315            }
1316        }
1317    }
1318}
1319
1320/*
1321 * AMD Radeon PCI config reset, based on Linux:
1322 *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
1323 *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
1324 *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
1325 *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
1326 * IDs: include/drm/drm_pciids.h
1327 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
1328 *
1329 * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
1330 * hardware that should be fixed on future ASICs.  The symptom of this is that
 1331 * once the accelerated driver loads, Windows guests will BSOD on subsequent
 1332 * attempts to load the driver, such as after VM reset or shutdown/restart.  To
1333 * work around this, we do an AMD specific PCI config reset, followed by an SMC
1334 * reset.  The PCI config reset only works if SMC firmware is running, so we
1335 * have a dependency on the state of the device as to whether this reset will
1336 * be effective.  There are still cases where we won't be able to kick the
1337 * device into working, but this greatly improves the usability overall.  The
1338 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
1339 * poking is largely ASIC specific.
1340 */
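/*
 * The SMC appears to be running when its clock is not gated (bit 0 of the
 * clock control register clear) and its program counter is at or beyond
 * 0x20100; register offsets mirror the Linux driver sources cited above.
 */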
1341static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
1342{
1343    uint32_t clk, pc_c;
1344
1345    /*
1346     * Registers 200h and 204h are index and data registers for accessing
1347     * indirect configuration registers within the device.
1348     */
1349    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1350    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1351    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
1352    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1353
1354    return (!(clk & 1) && (0x20100 <= pc_c));
1355}
1356
1357/*
1358 * The scope of a config reset is controlled by a mode bit in the misc register
1359 * and a fuse, exposed as a bit in another register.  The fuse is the default
 1360 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
1361 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
1362 * the fuse.  A truth table therefore tells us that if misc == fuse, we need
1363 * to flip the value of the bit in the misc register.
1364 */
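/*
 * Spelled out (scope 0 = GFX only, 1 = whole GPU):
 *
 *   misc  fuse  scope = !(misc ^ fuse)   action
 *    0     0              1              flip misc
 *    0     1              0              leave as is
 *    1     0              0              leave as is
 *    1     1              1              flip misc
 */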
1365static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
1366{
1367    uint32_t misc, fuse;
1368    bool a, b;
1369
1370    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
1371    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1372    b = fuse & 64;
1373
1374    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
1375    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1376    a = misc & 2;
1377
1378    if (a == b) {
1379        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
1380        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
1381    }
1382}
1383
1384static int vfio_radeon_reset(VFIOPCIDevice *vdev)
1385{
1386    PCIDevice *pdev = &vdev->pdev;
1387    int i, ret = 0;
1388    uint32_t data;
1389
1390    /* Defer to a kernel implemented reset */
1391    if (vdev->vbasedev.reset_works) {
1392        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
1393        return -ENODEV;
1394    }
1395
1396    /* Enable only memory BAR access */
1397    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
1398
1399    /* Reset only works if SMC firmware is loaded and running */
1400    if (!vfio_radeon_smc_is_running(vdev)) {
1401        ret = -EINVAL;
1402        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
1403        goto out;
1404    }
1405
1406    /* Make sure only the GFX function is reset */
1407    vfio_radeon_set_gfx_only_reset(vdev);
1408
1409    /* AMD PCI config reset */
1410    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
1411    usleep(100);
1412
1413    /* Read back the memory size to make sure we're out of reset */
1414    for (i = 0; i < 100000; i++) {
1415        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
1416            goto reset_smc;
1417        }
1418        usleep(1);
1419    }
1420
1421    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);
1422
1423reset_smc:
1424    /* Reset SMC */
1425    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
1426    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1427    data |= 1;
1428    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1429
1430    /* Disable SMC clock */
1431    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1432    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1433    data |= 1;
1434    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1435
1436    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);
1437
1438out:
1439    /* Restore PCI command register */
1440    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
1441
1442    return ret;
1443}
1444
1445void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
1446{
1447    switch (vdev->vendor_id) {
1448    case 0x1002: /* AMD/ATI */
1449        switch (vdev->device_id) {
1450        /* Bonaire */
1451        case 0x6649: /* Bonaire [FirePro W5100] */
1452        case 0x6650:
1453        case 0x6651:
1454        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
1455        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
1456        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
1457        /* Hawaii */
1458        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
1459        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
1460        case 0x67A2:
1461        case 0x67A8:
1462        case 0x67A9:
1463        case 0x67AA:
1464        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
1465        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
1466        case 0x67B8:
1467        case 0x67B9:
1468        case 0x67BA:
1469        case 0x67BE:
1470            vdev->resetfn = vfio_radeon_reset;
1471            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
1472            break;
1473        }
1474        break;
1475    }
1476}
1477
1478/*
1479 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
1480 * devices as a member of a clique.  Devices within the same clique ID
1481 * are capable of direct P2P.  It's the user's responsibility that this
1482 * is correct.  The spec says that this may reside at any unused config
1483 * offset, but it reserves C8h and recommends that hypervisors place it there.
1484 * The spec also states that the hypervisor should place this capability
1485 * at the end of the capability list, thus next is defined as 0h.
1486 *
1487 * +----------------+----------------+----------------+----------------+
1488 * | sig 7:0 ('P')  |  vndr len (8h) |    next (0h)   |   cap id (9h)  |
1489 * +----------------+----------------+----------------+----------------+
1490 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
1491 * +---------------------------------+---------------------------------+
1492 *
1493 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
1494 */
1495static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1496                                       const char *name, void *opaque,
1497                                       Error **errp)
1498{
1499    Property *prop = opaque;
1500    uint8_t *ptr = object_field_prop_ptr(obj, prop);
1501
1502    visit_type_uint8(v, name, ptr, errp);
1503}
1504
1505static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1506                                       const char *name, void *opaque,
1507                                       Error **errp)
1508{
1509    Property *prop = opaque;
1510    uint8_t value, *ptr = object_field_prop_ptr(obj, prop);
1511
1512    if (!visit_type_uint8(v, name, &value, errp)) {
1513        return;
1514    }
1515
1516    if (value & ~0xF) {
1517        error_setg(errp, "Property %s: valid range 0-15", name);
1518        return;
1519    }
1520
1521    *ptr = value;
1522}
1523
1524const PropertyInfo qdev_prop_nv_gpudirect_clique = {
1525    .name = "uint4",
1526    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
1527    .get = get_nv_gpudirect_clique_id,
1528    .set = set_nv_gpudirect_clique_id,
1529};
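
/*
 * A minimal usage sketch, assuming the property is wired up by the vfio-pci
 * device (outside this file) under the name "x-nv-gpudirect-clique":
 *
 *   -device vfio-pci,host=<bdf>,x-nv-gpudirect-clique=0
 *
 * Devices assigned the same clique ID are advertised to the guest as capable
 * of direct P2P with each other.
 */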
1530
1531static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
1532{
1533    PCIDevice *pdev = &vdev->pdev;
1534    int ret, pos = 0xC8;
1535
1536    if (vdev->nv_gpudirect_clique == 0xFF) {
1537        return 0;
1538    }
1539
1540    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
1541        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
1542        return -EINVAL;
1543    }
1544
1545    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
1546        PCI_BASE_CLASS_DISPLAY) {
1547        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
1548        return -EINVAL;
1549    }
1550
1551    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
1552    if (ret < 0) {
1553        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
1554        return ret;
1555    }
1556
1557    memset(vdev->emulated_config_bits + pos, 0xFF, 8); /* guest sees the emulated copy */
1558    pos += PCI_CAP_FLAGS; /* cap id and next are filled in by pci_add_capability() */
1559    pci_set_byte(pdev->config + pos++, 8); /* vndr len */
1560    pci_set_byte(pdev->config + pos++, 'P'); /* sig 7:0 */
1561    pci_set_byte(pdev->config + pos++, '2'); /* sig 15:8 */
1562    pci_set_byte(pdev->config + pos++, 'P'); /* sig 23:16 */
1563    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3); /* id 6:3, ver 2:0 */
1564    pci_set_byte(pdev->config + pos, 0); /* rsvd */
1565
1566    return 0;
1567}
1568
1569static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
1570                                     const char *name,
1571                                     void *opaque, Error **errp)
1572{
1573    uint64_t tgt = (uintptr_t) opaque;
1574    visit_type_uint64(v, name, &tgt, errp);
1575}
1576
1577static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
1578                                                 const char *name,
1579                                                 void *opaque, Error **errp)
1580{
1581    uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
1582    visit_type_uint32(v, name, &link_speed, errp);
1583}
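
/*
 * The getters above back the "nvlink2-tgt" and "nvlink2-link-speed" object
 * properties registered below; those properties are intended to be consumed
 * by the board code (the sPAPR PCI host bridge on POWER9 hosts) when it sets
 * up the guest-side NVLink2 memory windows.
 */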
1584
1585int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
1586{
1587    int ret;
1588    void *p;
1589    struct vfio_region_info *nv2reg = NULL;
1590    struct vfio_info_cap_header *hdr;
1591    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
1592    VFIOQuirk *quirk;
1593
1594    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1595                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1596                                   PCI_VENDOR_ID_NVIDIA,
1597                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
1598                                   &nv2reg);
1599    if (ret) {
1600        return ret;
1601    }
1602
1603    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1604    if (!hdr) {
1605        ret = -ENODEV;
1606        goto free_exit;
1607    }
1608    cap = (void *) hdr;
1609
1610    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE,
1611             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
1612    if (p == MAP_FAILED) {
1613        ret = -errno;
1614        goto free_exit;
1615    }
1616
1617    quirk = vfio_quirk_alloc(1);
1618    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
1619                               nv2reg->size, p);
1620    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1621
1622    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1623                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
1624                        (void *) (uintptr_t) cap->tgt);
1625    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
1626                                          nv2reg->size);
1627free_exit:
1628    g_free(nv2reg);
1629
1630    return ret;
1631}
1632
1633int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
1634{
1635    int ret;
1636    void *p;
1637    struct vfio_region_info *atsdreg = NULL;
1638    struct vfio_info_cap_header *hdr;
1639    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
1640    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
1641    VFIOQuirk *quirk;
1642
1643    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1644                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1645                                   PCI_VENDOR_ID_IBM,
1646                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
1647                                   &atsdreg);
1648    if (ret) {
1649        return ret;
1650    }
1651
1652    hdr = vfio_get_region_info_cap(atsdreg,
1653                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1654    if (!hdr) {
1655        ret = -ENODEV;
1656        goto free_exit;
1657    }
1658    captgt = (void *) hdr;
1659
1660    hdr = vfio_get_region_info_cap(atsdreg,
1661                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
1662    if (!hdr) {
1663        ret = -ENODEV;
1664        goto free_exit;
1665    }
1666    capspeed = (void *) hdr;
1667
1668    /* Some NVLink bridges may not have assigned ATSD */
1669    if (atsdreg->size) {
1670        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE,
1671                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
1672        if (p == MAP_FAILED) {
1673            ret = -errno;
1674            goto free_exit;
1675        }
1676
1677        quirk = vfio_quirk_alloc(1);
1678        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
1679                                          "nvlink2-atsd-mr", atsdreg->size, p);
1680        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1681    }
1682
1683    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1684                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
1685                        (void *) (uintptr_t) captgt->tgt);
1686    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
1687                                              atsdreg->size);
1688
1689    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
1690                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
1691                        (void *) (uintptr_t) capspeed->link_speed);
1692    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
1693                                              capspeed->link_speed);
1694free_exit:
1695    g_free(atsdreg);
1696
1697    return ret;
1698}
1699
1700/*
1701 * The VMD endpoint provides a real PCIe domain to the guest and the guest
1702 * kernel performs enumeration of the VMD sub-device domain. Guest transactions
1703 * to VMD sub-devices go through MMU translation from guest addresses to
1704 * physical addresses. When MMIO goes to an endpoint after being translated to
1705 * physical addresses, the bridge rejects the transaction because the window
1706 * has been programmed with guest addresses.
1707 *
1708 * VMD can use the Host Physical Address in order to correctly program the
1709 * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow registers
1710 * located at offset 0x2000 in MEMBAR2 (BAR 4). This quirk provides the HPA
1711 * shadow registers in a vendor-specific capability register for devices
1712 * without native support. The position of 0xE8-0xFF is in the reserved range
1713 * of the VMD device capability space following the Power Management
1714 * Capability.
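 *
 * The 24-byte capability written below ends up laid out as follows (derived
 * from the code here rather than from a published layout):
 *
 *   0xE8        cap id (9h)
 *   0xE9        next (filled in by pci_add_capability())
 *   0xEA        vndr len (18h)
 *   0xEB        version (1h)
 *   0xEC-0xEF   "SHDW" signature
 *   0xF0-0xFF   shadow of the host MEMBAR values (BARs 2-5)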
1715 */
1716#define VMD_SHADOW_CAP_VER 1
1717#define VMD_SHADOW_CAP_LEN 24
1718static int vfio_add_vmd_shadow_cap(VFIOPCIDevice *vdev, Error **errp)
1719{
1720    uint8_t membar_phys[16];
1721    int ret, pos = 0xE8;
1722
1723    if (!(vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x201D) ||
1724          vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x467F) ||
1725          vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x4C3D) ||
1726          vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, 0x9A0B))) {
1727        return 0;
1728    }
1729
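    /*
     * Read the host-physical values of BARs 2-5 straight from hardware;
     * these cover the two (presumably 64-bit) MEMBARs shadowed below.
     */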
1730    ret = pread(vdev->vbasedev.fd, membar_phys, 16,
1731                vdev->config_offset + PCI_BASE_ADDRESS_2);
1732    if (ret != 16) {
1733        error_report("VMD %s cannot read MEMBARs (%d)",
1734                     vdev->vbasedev.name, ret);
1735        return -EFAULT;
1736    }
1737
1738    ret = pci_add_capability(&vdev->pdev, PCI_CAP_ID_VNDR, pos,
1739                             VMD_SHADOW_CAP_LEN, errp);
1740    if (ret < 0) {
1741        error_prepend(errp, "Failed to add VMD MEMBAR Shadow cap: ");
1742        return ret;
1743    }
1744
1745    memset(vdev->emulated_config_bits + pos, 0xFF, VMD_SHADOW_CAP_LEN);
1746    pos += PCI_CAP_FLAGS; /* cap id and next are filled in by pci_add_capability() */
1747    pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_LEN); /* vndr len (18h) */
1748    pci_set_byte(vdev->pdev.config + pos++, VMD_SHADOW_CAP_VER); /* version */
1749    pci_set_long(vdev->pdev.config + pos, 0x53484457); /* SHDW */
1750    memcpy(vdev->pdev.config + pos + 4, membar_phys, 16); /* MEMBAR shadow values */
1751
1752    return 0;
1753}
1754
1755int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
1756{
1757    int ret;
1758
1759    ret = vfio_add_nv_gpudirect_cap(vdev, errp);
1760    if (ret) {
1761        return ret;
1762    }
1763
1764    ret = vfio_add_vmd_shadow_cap(vdev, errp);
1765    if (ret) {
1766        return ret;
1767    }
1768
1769    return 0;
1770}
1771