qemu/hw/vfio/pci-quirks.c
<<
>>
Prefs
   1/*
   2 * device quirks for PCI devices
   3 *
   4 * Copyright Red Hat, Inc. 2012-2015
   5 *
   6 * Authors:
   7 *  Alex Williamson <alex.williamson@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "exec/memop.h"
  15#include "qemu/units.h"
  16#include "qemu/error-report.h"
  17#include "qemu/main-loop.h"
  18#include "qemu/module.h"
  19#include "qemu/range.h"
  20#include "qapi/error.h"
  21#include "qapi/visitor.h"
  22#include <sys/ioctl.h>
  23#include "hw/hw.h"
  24#include "hw/nvram/fw_cfg.h"
  25#include "hw/qdev-properties.h"
  26#include "pci.h"
  27#include "trace.h"
  28
/*
 * List of device ids/vendor ids for which to disable
 * option rom loading. This avoids guest hangs during rom
 * execution, as noticed with the BCM 57810 card, for lack of a
 * better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang. When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue.
 */
  42static const struct {
  43    uint32_t vendor;
  44    uint32_t device;
  45} romblacklist[] = {
  46    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
  47};
  48
  49bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
  50{
  51    int i;
  52
  53    for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) {
  54        if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) {
  55            trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name,
  56                                             romblacklist[i].vendor,
  57                                             romblacklist[i].device);
  58            return true;
  59        }
  60    }
  61    return false;
  62}
  63
  64/*
  65 * Device specific region quirks (mostly backdoors to PCI config space)
  66 */
  67
/*
 * The generic window quirks operate on an address and data register,
 * vfio_generic_window_address_quirk handles the address register and
 * vfio_generic_window_data_quirk handles the data register.  These ops
 * pass reads and writes through to hardware until a value matching the
 * stored address match/mask is written.  When this occurs, the data
 * register accesses emulated PCI config space for the device rather than
 * passing accesses through to hardware.  This enables devices where PCI
 * config space is accessible behind a window register to maintain the
 * virtualization provided through vfio.
 */
  79typedef struct VFIOConfigWindowMatch {
  80    uint32_t match;
  81    uint32_t mask;
  82} VFIOConfigWindowMatch;
  83
  84typedef struct VFIOConfigWindowQuirk {
  85    struct VFIOPCIDevice *vdev;
  86
  87    uint32_t address_val;
  88
  89    uint32_t address_offset;
  90    uint32_t data_offset;
  91
  92    bool window_enabled;
  93    uint8_t bar;
  94
  95    MemoryRegion *addr_mem;
  96    MemoryRegion *data_mem;
  97
  98    uint32_t nr_matches;
  99    VFIOConfigWindowMatch matches[];
 100} VFIOConfigWindowQuirk;
 101
 102static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
 103                                                       hwaddr addr,
 104                                                       unsigned size)
 105{
 106    VFIOConfigWindowQuirk *window = opaque;
 107    VFIOPCIDevice *vdev = window->vdev;
 108
 109    return vfio_region_read(&vdev->bars[window->bar].region,
 110                            addr + window->address_offset, size);
 111}
 112
 113static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
 114                                                    uint64_t data,
 115                                                    unsigned size)
 116{
 117    VFIOConfigWindowQuirk *window = opaque;
 118    VFIOPCIDevice *vdev = window->vdev;
 119    int i;
 120
 121    window->window_enabled = false;
 122
 123    vfio_region_write(&vdev->bars[window->bar].region,
 124                      addr + window->address_offset, data, size);
 125
 126    for (i = 0; i < window->nr_matches; i++) {
 127        if ((data & ~window->matches[i].mask) == window->matches[i].match) {
 128            window->window_enabled = true;
 129            window->address_val = data & window->matches[i].mask;
 130            trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
 131                                    memory_region_name(window->addr_mem), data);
 132            break;
 133        }
 134    }
 135}
 136
 137static const MemoryRegionOps vfio_generic_window_address_quirk = {
 138    .read = vfio_generic_window_quirk_address_read,
 139    .write = vfio_generic_window_quirk_address_write,
 140    .endianness = DEVICE_LITTLE_ENDIAN,
 141};
 142
 143static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
 144                                                    hwaddr addr, unsigned size)
 145{
 146    VFIOConfigWindowQuirk *window = opaque;
 147    VFIOPCIDevice *vdev = window->vdev;
 148    uint64_t data;
 149
 150    /* Always read data reg, discard if window enabled */
 151    data = vfio_region_read(&vdev->bars[window->bar].region,
 152                            addr + window->data_offset, size);
 153
 154    if (window->window_enabled) {
 155        data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
 156        trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
 157                                    memory_region_name(window->data_mem), data);
 158    }
 159
 160    return data;
 161}
 162
 163static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
 164                                                 uint64_t data, unsigned size)
 165{
 166    VFIOConfigWindowQuirk *window = opaque;
 167    VFIOPCIDevice *vdev = window->vdev;
 168
 169    if (window->window_enabled) {
 170        vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
 171        trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
 172                                    memory_region_name(window->data_mem), data);
 173        return;
 174    }
 175
 176    vfio_region_write(&vdev->bars[window->bar].region,
 177                      addr + window->data_offset, data, size);
 178}
 179
 180static const MemoryRegionOps vfio_generic_window_data_quirk = {
 181    .read = vfio_generic_window_quirk_data_read,
 182    .write = vfio_generic_window_quirk_data_write,
 183    .endianness = DEVICE_LITTLE_ENDIAN,
 184};
 185
 186/*
 187 * The generic mirror quirk handles devices which expose PCI config space
 188 * through a region within a BAR.  When enabled, reads and writes are
 189 * redirected through to emulated PCI config space.  XXX if PCI config space
 190 * used memory regions, this could just be an alias.
 191 */
 192typedef struct VFIOConfigMirrorQuirk {
 193    struct VFIOPCIDevice *vdev;
 194    uint32_t offset;
 195    uint8_t bar;
 196    MemoryRegion *mem;
 197    uint8_t data[];
 198} VFIOConfigMirrorQuirk;
 199
 200static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
 201                                               hwaddr addr, unsigned size)
 202{
 203    VFIOConfigMirrorQuirk *mirror = opaque;
 204    VFIOPCIDevice *vdev = mirror->vdev;
 205    uint64_t data;
 206
 207    /* Read and discard in case the hardware cares */
 208    (void)vfio_region_read(&vdev->bars[mirror->bar].region,
 209                           addr + mirror->offset, size);
 210
 211    data = vfio_pci_read_config(&vdev->pdev, addr, size);
 212    trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
 213                                         memory_region_name(mirror->mem),
 214                                         addr, data);
 215    return data;
 216}
 217
 218static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
 219                                            uint64_t data, unsigned size)
 220{
 221    VFIOConfigMirrorQuirk *mirror = opaque;
 222    VFIOPCIDevice *vdev = mirror->vdev;
 223
 224    vfio_pci_write_config(&vdev->pdev, addr, data, size);
 225    trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
 226                                          memory_region_name(mirror->mem),
 227                                          addr, data);
 228}
 229
 230static const MemoryRegionOps vfio_generic_mirror_quirk = {
 231    .read = vfio_generic_quirk_mirror_read,
 232    .write = vfio_generic_quirk_mirror_write,
 233    .endianness = DEVICE_LITTLE_ENDIAN,
 234};
 235
 236/* Is range1 fully contained within range2?  */
 237static bool vfio_range_contained(uint64_t first1, uint64_t len1,
 238                                 uint64_t first2, uint64_t len2) {
 239    return (first1 >= first2 && first1 + len1 <= first2 + len2);
 240}
 241
 242#define PCI_VENDOR_ID_ATI               0x1002
 243
 244/*
 245 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 246 * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
 247 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 248 * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
 249 * I/O port BAR address.  Originally this was coded to return the virtual BAR
 250 * address only if the physical register read returns the actual BAR address,
 251 * but users have reported greater success if we return the virtual address
 252 * unconditionally.
 253 */
 254static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
 255                                        hwaddr addr, unsigned size)
 256{
 257    VFIOPCIDevice *vdev = opaque;
 258    uint64_t data = vfio_pci_read_config(&vdev->pdev,
 259                                         PCI_BASE_ADDRESS_4 + 1, size);
 260
 261    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
 262
 263    return data;
 264}
 265
 266static const MemoryRegionOps vfio_ati_3c3_quirk = {
 267    .read = vfio_ati_3c3_quirk_read,
 268    .endianness = DEVICE_LITTLE_ENDIAN,
 269};
 270
 271VFIOQuirk *vfio_quirk_alloc(int nr_mem)
 272{
 273    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
 274    QLIST_INIT(&quirk->ioeventfds);
 275    quirk->mem = g_new0(MemoryRegion, nr_mem);
 276    quirk->nr_mem = nr_mem;
 277
 278    return quirk;
 279}
 280
 281static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
 282{
 283    QLIST_REMOVE(ioeventfd, next);
 284    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 285                              true, ioeventfd->data, &ioeventfd->e);
 286
 287    if (ioeventfd->vfio) {
 288        struct vfio_device_ioeventfd vfio_ioeventfd;
 289
 290        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 291        vfio_ioeventfd.flags = ioeventfd->size;
 292        vfio_ioeventfd.data = ioeventfd->data;
 293        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 294                                ioeventfd->region_addr;
 295        vfio_ioeventfd.fd = -1;
 296
 297        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
 298            error_report("Failed to remove vfio ioeventfd for %s+0x%"
 299                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
 300                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
 301                         ioeventfd->size, ioeventfd->data);
 302        }
 303    } else {
 304        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 305                            NULL, NULL, NULL);
 306    }
 307
 308    event_notifier_cleanup(&ioeventfd->e);
 309    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
 310                              (uint64_t)ioeventfd->addr, ioeventfd->size,
 311                              ioeventfd->data);
 312    g_free(ioeventfd);
 313}
 314
 315static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 316{
 317    VFIOIOEventFD *ioeventfd, *tmp;
 318
 319    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
 320        if (ioeventfd->dynamic) {
 321            vfio_ioeventfd_exit(vdev, ioeventfd);
 322        }
 323    }
 324}
 325
 326static void vfio_ioeventfd_handler(void *opaque)
 327{
 328    VFIOIOEventFD *ioeventfd = opaque;
 329
 330    if (event_notifier_test_and_clear(&ioeventfd->e)) {
 331        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
 332                          ioeventfd->data, ioeventfd->size);
 333        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
 334                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
 335                                     ioeventfd->data);
 336    }
 337}
 338
 339static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
 340                                          MemoryRegion *mr, hwaddr addr,
 341                                          unsigned size, uint64_t data,
 342                                          VFIORegion *region,
 343                                          hwaddr region_addr, bool dynamic)
 344{
 345    VFIOIOEventFD *ioeventfd;
 346
 347    if (vdev->no_kvm_ioeventfd) {
 348        return NULL;
 349    }
 350
 351    ioeventfd = g_malloc0(sizeof(*ioeventfd));
 352
 353    if (event_notifier_init(&ioeventfd->e, 0)) {
 354        g_free(ioeventfd);
 355        return NULL;
 356    }
 357
 358    /*
 359     * MemoryRegion and relative offset, plus additional ioeventfd setup
 360     * parameters for configuring and later tearing down KVM ioeventfd.
 361     */
 362    ioeventfd->mr = mr;
 363    ioeventfd->addr = addr;
 364    ioeventfd->size = size;
 365    ioeventfd->data = data;
 366    ioeventfd->dynamic = dynamic;
 367    /*
 368     * VFIORegion and relative offset for implementing the userspace
 369     * handler.  data & size fields shared for both uses.
 370     */
 371    ioeventfd->region = region;
 372    ioeventfd->region_addr = region_addr;
 373
 374    if (!vdev->no_vfio_ioeventfd) {
 375        struct vfio_device_ioeventfd vfio_ioeventfd;
 376
 377        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 378        vfio_ioeventfd.flags = ioeventfd->size;
 379        vfio_ioeventfd.data = ioeventfd->data;
 380        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 381                                ioeventfd->region_addr;
 382        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
 383
 384        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
 385                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
 386    }
 387
 388    if (!ioeventfd->vfio) {
 389        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 390                            vfio_ioeventfd_handler, NULL, ioeventfd);
 391    }
 392
 393    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 394                              true, ioeventfd->data, &ioeventfd->e);
 395    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
 396                              size, data, ioeventfd->vfio);
 397
 398    return ioeventfd;
 399}
 400
 401static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
 402{
 403    VFIOQuirk *quirk;
 404
 405    /*
 406     * As long as the BAR is >= 256 bytes it will be aligned such that the
 407     * lower byte is always zero.  Filter out anything else, if it exists.
 408     */
 409    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 410        !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
 411        return;
 412    }
 413
 414    quirk = vfio_quirk_alloc(1);
 415
 416    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
 417                          "vfio-ati-3c3-quirk", 1);
 418    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 419                                3 /* offset 3 bytes from 0x3c0 */, quirk->mem);
 420
 421    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 422                      quirk, next);
 423
 424    trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
 425}
 426
 427/*
 428 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
 429 * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
 430 * the MMIO space directly, but a window to this space is provided through
 431 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 432 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 433 * PCI configuration space is available.  Experimentation seems to indicate
 434 * that read-only may be provided by hardware.
 435 */
 436static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 437{
 438    VFIOQuirk *quirk;
 439    VFIOConfigWindowQuirk *window;
 440
 441    /* This windows doesn't seem to be used except by legacy VGA code */
 442    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 443        !vdev->vga || nr != 4) {
 444        return;
 445    }
 446
 447    quirk = vfio_quirk_alloc(2);
 448    window = quirk->data = g_malloc0(sizeof(*window) +
 449                                     sizeof(VFIOConfigWindowMatch));
 450    window->vdev = vdev;
 451    window->address_offset = 0;
 452    window->data_offset = 4;
 453    window->nr_matches = 1;
 454    window->matches[0].match = 0x4000;
 455    window->matches[0].mask = vdev->config_size - 1;
 456    window->bar = nr;
 457    window->addr_mem = &quirk->mem[0];
 458    window->data_mem = &quirk->mem[1];
 459
 460    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 461                          &vfio_generic_window_address_quirk, window,
 462                          "vfio-ati-bar4-window-address-quirk", 4);
 463    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 464                                        window->address_offset,
 465                                        window->addr_mem, 1);
 466
 467    memory_region_init_io(window->data_mem, OBJECT(vdev),
 468                          &vfio_generic_window_data_quirk, window,
 469                          "vfio-ati-bar4-window-data-quirk", 4);
 470    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 471                                        window->data_offset,
 472                                        window->data_mem, 1);
 473
 474    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 475
 476    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
 477}
 478
 479/*
 480 * Trap the BAR2 MMIO mirror to config space as well.
 481 */
 482static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
 483{
 484    VFIOQuirk *quirk;
 485    VFIOConfigMirrorQuirk *mirror;
 486
 487    /* Only enable on newer devices where BAR2 is 64bit */
 488    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 489        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
 490        return;
 491    }
 492
 493    quirk = vfio_quirk_alloc(1);
 494    mirror = quirk->data = g_malloc0(sizeof(*mirror));
 495    mirror->mem = quirk->mem;
 496    mirror->vdev = vdev;
 497    mirror->offset = 0x4000;
 498    mirror->bar = nr;
 499
 500    memory_region_init_io(mirror->mem, OBJECT(vdev),
 501                          &vfio_generic_mirror_quirk, mirror,
 502                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
 503    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 504                                        mirror->offset, mirror->mem, 1);
 505
 506    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 507
 508    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
 509}
 510
 511/*
 512 * Older ATI/AMD cards like the X550 have a similar window to that above.
 513 * I/O port BAR1 provides a window to a mirror of PCI config space located
 514 * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
 515 * note it for future reference.
 516 */
 517
/*
 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 * https://github.com/pathscale/envytools/tree/master/hwdocs
 *
 * The first quirk is actually not documented in envytools and is found
 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 */
 533typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
 534static const char *nv3d0_states[] = { "NONE", "SELECT",
 535                                      "WINDOW", "READ", "WRITE" };
 536
 537typedef struct VFIONvidia3d0Quirk {
 538    VFIOPCIDevice *vdev;
 539    VFIONvidia3d0State state;
 540    uint32_t offset;
 541} VFIONvidia3d0Quirk;
 542
 543static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
 544                                           hwaddr addr, unsigned size)
 545{
 546    VFIONvidia3d0Quirk *quirk = opaque;
 547    VFIOPCIDevice *vdev = quirk->vdev;
 548
 549    quirk->state = NONE;
 550
 551    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 552                         addr + 0x14, size);
 553}
 554
 555static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
 556                                        uint64_t data, unsigned size)
 557{
 558    VFIONvidia3d0Quirk *quirk = opaque;
 559    VFIOPCIDevice *vdev = quirk->vdev;
 560    VFIONvidia3d0State old_state = quirk->state;
 561
 562    quirk->state = NONE;
 563
 564    switch (data) {
 565    case 0x338:
 566        if (old_state == NONE) {
 567            quirk->state = SELECT;
 568            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 569                                              nv3d0_states[quirk->state]);
 570        }
 571        break;
 572    case 0x538:
 573        if (old_state == WINDOW) {
 574            quirk->state = READ;
 575            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 576                                              nv3d0_states[quirk->state]);
 577        }
 578        break;
 579    case 0x738:
 580        if (old_state == WINDOW) {
 581            quirk->state = WRITE;
 582            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 583                                              nv3d0_states[quirk->state]);
 584        }
 585        break;
 586    }
 587
 588    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 589                   addr + 0x14, data, size);
 590}
 591
 592static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
 593    .read = vfio_nvidia_3d4_quirk_read,
 594    .write = vfio_nvidia_3d4_quirk_write,
 595    .endianness = DEVICE_LITTLE_ENDIAN,
 596};
 597
 598static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
 599                                           hwaddr addr, unsigned size)
 600{
 601    VFIONvidia3d0Quirk *quirk = opaque;
 602    VFIOPCIDevice *vdev = quirk->vdev;
 603    VFIONvidia3d0State old_state = quirk->state;
 604    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 605                                  addr + 0x10, size);
 606
 607    quirk->state = NONE;
 608
 609    if (old_state == READ &&
 610        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 611        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 612
 613        data = vfio_pci_read_config(&vdev->pdev, offset, size);
 614        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
 615                                         offset, size, data);
 616    }
 617
 618    return data;
 619}
 620
 621static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
 622                                        uint64_t data, unsigned size)
 623{
 624    VFIONvidia3d0Quirk *quirk = opaque;
 625    VFIOPCIDevice *vdev = quirk->vdev;
 626    VFIONvidia3d0State old_state = quirk->state;
 627
 628    quirk->state = NONE;
 629
 630    if (old_state == SELECT) {
 631        quirk->offset = (uint32_t)data;
 632        quirk->state = WINDOW;
 633        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 634                                          nv3d0_states[quirk->state]);
 635    } else if (old_state == WRITE) {
 636        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 637            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 638
 639            vfio_pci_write_config(&vdev->pdev, offset, data, size);
 640            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
 641                                              offset, data, size);
 642            return;
 643        }
 644    }
 645
 646    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 647                   addr + 0x10, data, size);
 648}
 649
 650static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
 651    .read = vfio_nvidia_3d0_quirk_read,
 652    .write = vfio_nvidia_3d0_quirk_write,
 653    .endianness = DEVICE_LITTLE_ENDIAN,
 654};
 655
 656static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
 657{
 658    VFIOQuirk *quirk;
 659    VFIONvidia3d0Quirk *data;
 660
 661    if (vdev->no_geforce_quirks ||
 662        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 663        !vdev->bars[1].region.size) {
 664        return;
 665    }
 666
 667    quirk = vfio_quirk_alloc(2);
 668    quirk->data = data = g_malloc0(sizeof(*data));
 669    data->vdev = vdev;
 670
 671    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
 672                          data, "vfio-nvidia-3d4-quirk", 2);
 673    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 674                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);
 675
 676    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
 677                          data, "vfio-nvidia-3d0-quirk", 2);
 678    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 679                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);
 680
 681    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 682                      quirk, next);
 683
 684    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
 685}
 686
/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800 but all of PCI config
 * space, including extended space, which is available in the 4k @0x88000.
 */
 694typedef struct VFIONvidiaBAR5Quirk {
 695    uint32_t master;
 696    uint32_t enable;
 697    MemoryRegion *addr_mem;
 698    MemoryRegion *data_mem;
 699    bool enabled;
 700    VFIOConfigWindowQuirk window; /* last for match data */
 701} VFIONvidiaBAR5Quirk;
 702
 703static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
 704{
 705    VFIOPCIDevice *vdev = bar5->window.vdev;
 706
 707    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
 708        return;
 709    }
 710
 711    bar5->enabled = !bar5->enabled;
 712    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
 713                                       bar5->enabled ?  "Enable" : "Disable");
 714    memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
 715    memory_region_set_enabled(bar5->data_mem, bar5->enabled);
 716}
 717
 718static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
 719                                                   hwaddr addr, unsigned size)
 720{
 721    VFIONvidiaBAR5Quirk *bar5 = opaque;
 722    VFIOPCIDevice *vdev = bar5->window.vdev;
 723
 724    return vfio_region_read(&vdev->bars[5].region, addr, size);
 725}
 726
 727static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
 728                                                uint64_t data, unsigned size)
 729{
 730    VFIONvidiaBAR5Quirk *bar5 = opaque;
 731    VFIOPCIDevice *vdev = bar5->window.vdev;
 732
 733    vfio_region_write(&vdev->bars[5].region, addr, data, size);
 734
 735    bar5->master = data;
 736    vfio_nvidia_bar5_enable(bar5);
 737}
 738
 739static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
 740    .read = vfio_nvidia_bar5_quirk_master_read,
 741    .write = vfio_nvidia_bar5_quirk_master_write,
 742    .endianness = DEVICE_LITTLE_ENDIAN,
 743};
 744
 745static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
 746                                                   hwaddr addr, unsigned size)
 747{
 748    VFIONvidiaBAR5Quirk *bar5 = opaque;
 749    VFIOPCIDevice *vdev = bar5->window.vdev;
 750
 751    return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
 752}
 753
 754static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
 755                                                uint64_t data, unsigned size)
 756{
 757    VFIONvidiaBAR5Quirk *bar5 = opaque;
 758    VFIOPCIDevice *vdev = bar5->window.vdev;
 759
 760    vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
 761
 762    bar5->enable = data;
 763    vfio_nvidia_bar5_enable(bar5);
 764}
 765
 766static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
 767    .read = vfio_nvidia_bar5_quirk_enable_read,
 768    .write = vfio_nvidia_bar5_quirk_enable_write,
 769    .endianness = DEVICE_LITTLE_ENDIAN,
 770};
 771
 772static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
 773{
 774    VFIOQuirk *quirk;
 775    VFIONvidiaBAR5Quirk *bar5;
 776    VFIOConfigWindowQuirk *window;
 777
 778    if (vdev->no_geforce_quirks ||
 779        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 780        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
 781        return;
 782    }
 783
 784    quirk = vfio_quirk_alloc(4);
 785    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
 786                                   (sizeof(VFIOConfigWindowMatch) * 2));
 787    window = &bar5->window;
 788
 789    window->vdev = vdev;
 790    window->address_offset = 0x8;
 791    window->data_offset = 0xc;
 792    window->nr_matches = 2;
 793    window->matches[0].match = 0x1800;
 794    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
 795    window->matches[1].match = 0x88000;
 796    window->matches[1].mask = vdev->config_size - 1;
 797    window->bar = nr;
 798    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
 799    window->data_mem = bar5->data_mem = &quirk->mem[1];
 800
 801    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 802                          &vfio_generic_window_address_quirk, window,
 803                          "vfio-nvidia-bar5-window-address-quirk", 4);
 804    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 805                                        window->address_offset,
 806                                        window->addr_mem, 1);
 807    memory_region_set_enabled(window->addr_mem, false);
 808
 809    memory_region_init_io(window->data_mem, OBJECT(vdev),
 810                          &vfio_generic_window_data_quirk, window,
 811                          "vfio-nvidia-bar5-window-data-quirk", 4);
 812    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 813                                        window->data_offset,
 814                                        window->data_mem, 1);
 815    memory_region_set_enabled(window->data_mem, false);
 816
 817    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
 818                          &vfio_nvidia_bar5_quirk_master, bar5,
 819                          "vfio-nvidia-bar5-master-quirk", 4);
 820    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 821                                        0, &quirk->mem[2], 1);
 822
 823    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
 824                          &vfio_nvidia_bar5_quirk_enable, bar5,
 825                          "vfio-nvidia-bar5-enable-quirk", 4);
 826    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 827                                        4, &quirk->mem[3], 1);
 828
 829    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 830
 831    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
 832}
 833
 834typedef struct LastDataSet {
 835    VFIOQuirk *quirk;
 836    hwaddr addr;
 837    uint64_t data;
 838    unsigned size;
 839    int hits;
 840    int added;
 841} LastDataSet;
 842
 843#define MAX_DYN_IOEVENTFD 10
 844#define HITS_FOR_IOEVENTFD 10
 845
 846/*
 847 * Finally, BAR0 itself.  We want to redirect any accesses to either
 848 * 0x1800 or 0x88000 through the PCI config space access functions.
 849 */
 850static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
 851                                           uint64_t data, unsigned size)
 852{
 853    VFIOConfigMirrorQuirk *mirror = opaque;
 854    VFIOPCIDevice *vdev = mirror->vdev;
 855    PCIDevice *pdev = &vdev->pdev;
 856    LastDataSet *last = (LastDataSet *)&mirror->data;
 857
 858    vfio_generic_quirk_mirror_write(opaque, addr, data, size);
 859
 860    /*
 861     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
 862     * MSI capability ID register.  Both the ID and next register are
 863     * read-only, so we allow writes covering either of those to real hw.
 864     */
 865    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
 866        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
 867        vfio_region_write(&vdev->bars[mirror->bar].region,
 868                          addr + mirror->offset, data, size);
 869        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
 870    }
 871
 872    /*
 873     * Automatically add an ioeventfd to handle any repeated write with the
 874     * same data and size above the standard PCI config space header.  This is
 875     * primarily expected to accelerate the MSI-ACK behavior, such as noted
 876     * above.  Current hardware/drivers should trigger an ioeventfd at config
 877     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
 878     *
 879     * The criteria of 10 successive hits is arbitrary but reliably adds the
 880     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
 881     * the remaining ones have a greater chance of being seen successively.
 882     * To avoid the pathological case of burning up all of QEMU's open file
 883     * handles, arbitrarily limit this algorithm from adding no more than 10
 884     * ioeventfds, print an error if we would have added an 11th, and then
 885     * stop counting.
 886     */
 887    if (!vdev->no_kvm_ioeventfd &&
 888        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
 889        if (addr != last->addr || data != last->data || size != last->size) {
 890            last->addr = addr;
 891            last->data = data;
 892            last->size = size;
 893            last->hits = 1;
 894        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
 895            if (last->added < MAX_DYN_IOEVENTFD) {
 896                VFIOIOEventFD *ioeventfd;
 897                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
 898                                        data, &vdev->bars[mirror->bar].region,
 899                                        mirror->offset + addr, true);
 900                if (ioeventfd) {
 901                    VFIOQuirk *quirk = last->quirk;
 902
 903                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
 904                    last->added++;
 905                }
 906            } else {
 907                last->added++;
 908                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
 909                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
 910                            "size %u", vdev->vbasedev.name, addr, data, size);
 911            }
 912        }
 913    }
 914}
 915
 916static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
 917    .read = vfio_generic_quirk_mirror_read,
 918    .write = vfio_nvidia_quirk_mirror_write,
 919    .endianness = DEVICE_LITTLE_ENDIAN,
 920};
 921
 922static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 923{
 924    VFIOConfigMirrorQuirk *mirror = quirk->data;
 925    LastDataSet *last = (LastDataSet *)&mirror->data;
 926
 927    last->addr = last->data = last->size = last->hits = last->added = 0;
 928
 929    vfio_drop_dynamic_eventfds(vdev, quirk);
 930}
 931
 932static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
 933{
 934    VFIOQuirk *quirk;
 935    VFIOConfigMirrorQuirk *mirror;
 936    LastDataSet *last;
 937
 938    if (vdev->no_geforce_quirks ||
 939        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 940        !vfio_is_vga(vdev) || nr != 0) {
 941        return;
 942    }
 943
 944    quirk = vfio_quirk_alloc(1);
 945    quirk->reset = vfio_nvidia_bar0_quirk_reset;
 946    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 947    mirror->mem = quirk->mem;
 948    mirror->vdev = vdev;
 949    mirror->offset = 0x88000;
 950    mirror->bar = nr;
 951    last = (LastDataSet *)&mirror->data;
 952    last->quirk = quirk;
 953
 954    memory_region_init_io(mirror->mem, OBJECT(vdev),
 955                          &vfio_nvidia_mirror_quirk, mirror,
 956                          "vfio-nvidia-bar0-88000-mirror-quirk",
 957                          vdev->config_size);
 958    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 959                                        mirror->offset, mirror->mem, 1);
 960
 961    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 962
 963    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
 964    if (vdev->vga) {
 965        quirk = vfio_quirk_alloc(1);
 966        quirk->reset = vfio_nvidia_bar0_quirk_reset;
 967        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 968        mirror->mem = quirk->mem;
 969        mirror->vdev = vdev;
 970        mirror->offset = 0x1800;
 971        mirror->bar = nr;
 972        last = (LastDataSet *)&mirror->data;
 973        last->quirk = quirk;
 974
 975        memory_region_init_io(mirror->mem, OBJECT(vdev),
 976                              &vfio_nvidia_mirror_quirk, mirror,
 977                              "vfio-nvidia-bar0-1800-mirror-quirk",
 978                              PCI_CONFIG_SPACE_SIZE);
 979        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 980                                            mirror->offset, mirror->mem, 1);
 981
 982        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 983    }
 984
 985    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
 986}
 987
/*
 * TODO - Some Nvidia devices provide config access to their companion HDA
 * device and even to their parent bridge via these config space mirrors.
 * Add quirks for those regions.
 */
 993
 994#define PCI_VENDOR_ID_REALTEK 0x10ec
 995
 996/*
 997 * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
 998 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
 999 * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1000 * when the "type" portion of the address register is set to 0x1.  This appears
1001 * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1002 * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1003 * ignore because the MSI-X table should always be accessed as a dword (full
1004 * mask).  Bits 0:11 is offset within the type.
1005 *
1006 * Example trace:
1007 *
1008 * Read from MSI-X table offset 0
1009 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1010 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1011 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1012 *
1013 * Write 0xfee00000 to MSI-X table offset 0
1014 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1015 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1016 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1017 */
1018typedef struct VFIOrtl8168Quirk {
1019    VFIOPCIDevice *vdev;
1020    uint32_t addr;
1021    uint32_t data;
1022    bool enabled;
1023} VFIOrtl8168Quirk;
1024
1025static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
1026                                                hwaddr addr, unsigned size)
1027{
1028    VFIOrtl8168Quirk *rtl = opaque;
1029    VFIOPCIDevice *vdev = rtl->vdev;
1030    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
1031
1032    if (rtl->enabled) {
1033        data = rtl->addr ^ 0x80000000U; /* latch/complete */
1034        trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
1035    }
1036
1037    return data;
1038}
1039
1040static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
1041                                             uint64_t data, unsigned size)
1042{
1043    VFIOrtl8168Quirk *rtl = opaque;
1044    VFIOPCIDevice *vdev = rtl->vdev;
1045
1046    rtl->enabled = false;
1047
1048    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
1049        rtl->enabled = true;
1050        rtl->addr = (uint32_t)data;
1051
1052        if (data & 0x80000000U) { /* Do write */
1053            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
1054                hwaddr offset = data & 0xfff;
1055                uint64_t val = rtl->data;
1056
1057                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
1058                                                    (uint16_t)offset, val);
1059
1060                /* Write to the proper guest MSI-X table instead */
1061                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
1062                                             offset, val,
1063                                             size_memop(size) | MO_LE,
1064                                             MEMTXATTRS_UNSPECIFIED);
1065            }
1066            return; /* Do not write guest MSI-X data to hardware */
1067        }
1068    }
1069
1070    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
1071}
1072
1073static const MemoryRegionOps vfio_rtl_address_quirk = {
1074    .read = vfio_rtl8168_quirk_address_read,
1075    .write = vfio_rtl8168_quirk_address_write,
1076    .valid = {
1077        .min_access_size = 4,
1078        .max_access_size = 4,
1079        .unaligned = false,
1080    },
1081    .endianness = DEVICE_LITTLE_ENDIAN,
1082};
1083
1084static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
1085                                             hwaddr addr, unsigned size)
1086{
1087    VFIOrtl8168Quirk *rtl = opaque;
1088    VFIOPCIDevice *vdev = rtl->vdev;
1089    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);
1090
1091    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1092        hwaddr offset = rtl->addr & 0xfff;
1093        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
1094                                    &data, size_memop(size) | MO_LE,
1095                                    MEMTXATTRS_UNSPECIFIED);
1096        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
1097    }
1098
1099    return data;
1100}
1101
1102static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
1103                                          uint64_t data, unsigned size)
1104{
1105    VFIOrtl8168Quirk *rtl = opaque;
1106    VFIOPCIDevice *vdev = rtl->vdev;
1107
1108    rtl->data = (uint32_t)data;
1109
1110    vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
1111}
1112
1113static const MemoryRegionOps vfio_rtl_data_quirk = {
1114    .read = vfio_rtl8168_quirk_data_read,
1115    .write = vfio_rtl8168_quirk_data_write,
1116    .valid = {
1117        .min_access_size = 4,
1118        .max_access_size = 4,
1119        .unaligned = false,
1120    },
1121    .endianness = DEVICE_LITTLE_ENDIAN,
1122};
1123
1124static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
1125{
1126    VFIOQuirk *quirk;
1127    VFIOrtl8168Quirk *rtl;
1128
1129    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
1130        return;
1131    }
1132
1133    quirk = vfio_quirk_alloc(2);
1134    quirk->data = rtl = g_malloc0(sizeof(*rtl));
1135    rtl->vdev = vdev;
1136
1137    memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
1138                          &vfio_rtl_address_quirk, rtl,
1139                          "vfio-rtl8168-window-address-quirk", 4);
1140    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1141                                        0x74, &quirk->mem[0], 1);
1142
1143    memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
1144                          &vfio_rtl_data_quirk, rtl,
1145                          "vfio-rtl8168-window-data-quirk", 4);
1146    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1147                                        0x70, &quirk->mem[1], 1);
1148
1149    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1150
1151    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
1152}
1153
1154#define IGD_ASLS 0xfc /* ASL Storage Register */
1155
1156/*
1157 * The OpRegion includes the Video BIOS Table, which seems important for
1158 * telling the driver what sort of outputs it has.  Without this, the device
1159 * may work in the guest, but we may not get output.  This also requires BIOS
1160 * support to reserve and populate a section of guest memory sufficient for
1161 * the table and to write the base address of that memory to the ASLS register
1162 * of the IGD device.
1163 */
1164int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
1165                               struct vfio_region_info *info, Error **errp)
1166{
1167    int ret;
1168
1169    vdev->igd_opregion = g_malloc0(info->size);
1170    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
1171                info->size, info->offset);
1172    if (ret != info->size) {
1173        error_setg(errp, "failed to read IGD OpRegion");
1174        g_free(vdev->igd_opregion);
1175        vdev->igd_opregion = NULL;
1176        return -EINVAL;
1177    }
1178
1179    /*
1180     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
1181     * allocate 32bit reserved memory for, copy these contents into, and write
1182     * the reserved memory base address to the device ASLS register at 0xFC.
1183     * Alignment of this reserved region seems flexible, but using a 4k page
1184     * alignment seems to work well.  This interface assumes a single IGD
1185     * device, which may be at VM address 00:02.0 in legacy mode or another
1186     * address in UPT mode.
1187     *
1188     * NB, there may be future use cases discovered where the VM should have
1189     * direct interaction with the host OpRegion, in which case the write to
1190     * the ASLS register would trigger MemoryRegion setup to enable that.
1191     */
1192    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
1193                    vdev->igd_opregion, info->size);
1194
1195    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
1196
1197    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
1198    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
1199    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
1200
1201    return 0;
1202}
1203
1204/*
1205 * Common quirk probe entry points.
1206 */
1207void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1208{
1209    vfio_vga_probe_ati_3c3_quirk(vdev);
1210    vfio_vga_probe_nvidia_3d0_quirk(vdev);
1211}
1212
1213void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1214{
1215    VFIOQuirk *quirk;
1216    int i, j;
1217
1218    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1219        QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1220            for (j = 0; j < quirk->nr_mem; j++) {
1221                memory_region_del_subregion(&vdev->vga->region[i].mem,
1222                                            &quirk->mem[j]);
1223            }
1224        }
1225    }
1226}
1227
1228void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1229{
1230    int i, j;
1231
1232    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1233        while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1234            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1235            QLIST_REMOVE(quirk, next);
1236            for (j = 0; j < quirk->nr_mem; j++) {
1237                object_unparent(OBJECT(&quirk->mem[j]));
1238            }
1239            g_free(quirk->mem);
1240            g_free(quirk->data);
1241            g_free(quirk);
1242        }
1243    }
1244}
1245
1246void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1247{
1248    vfio_probe_ati_bar4_quirk(vdev, nr);
1249    vfio_probe_ati_bar2_quirk(vdev, nr);
1250    vfio_probe_nvidia_bar5_quirk(vdev, nr);
1251    vfio_probe_nvidia_bar0_quirk(vdev, nr);
1252    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
1253#ifdef CONFIG_VFIO_IGD
1254    vfio_probe_igd_bar4_quirk(vdev, nr);
1255#endif
1256}
1257
1258void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1259{
1260    VFIOBAR *bar = &vdev->bars[nr];
1261    VFIOQuirk *quirk;
1262    int i;
1263
1264    QLIST_FOREACH(quirk, &bar->quirks, next) {
1265        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
1266            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
1267        }
1268
1269        for (i = 0; i < quirk->nr_mem; i++) {
1270            memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1271        }
1272    }
1273}
1274
1275void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1276{
1277    VFIOBAR *bar = &vdev->bars[nr];
1278    int i;
1279
1280    while (!QLIST_EMPTY(&bar->quirks)) {
1281        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1282        QLIST_REMOVE(quirk, next);
1283        for (i = 0; i < quirk->nr_mem; i++) {
1284            object_unparent(OBJECT(&quirk->mem[i]));
1285        }
1286        g_free(quirk->mem);
1287        g_free(quirk->data);
1288        g_free(quirk);
1289    }
1290}
1291
1292/*
1293 * Reset quirks
1294 */
1295void vfio_quirk_reset(VFIOPCIDevice *vdev)
1296{
1297    int i;
1298
1299    for (i = 0; i < PCI_ROM_SLOT; i++) {
1300        VFIOQuirk *quirk;
1301        VFIOBAR *bar = &vdev->bars[i];
1302
1303        QLIST_FOREACH(quirk, &bar->quirks, next) {
1304            if (quirk->reset) {
1305                quirk->reset(vdev, quirk);
1306            }
1307        }
1308    }
1309}
1310
/*
 * AMD Radeon PCI config reset, based on Linux:
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
 *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
 * IDs: include/drm/drm_pciids.h
 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
 *
 * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
 * hardware that should be fixed on future ASICs.  The symptom of this is that
 * once the accelerated driver loads, Windows guests will bsod on subsequent
 * attempts to load the driver, such as after VM reset or shutdown/restart.  To
 * work around this, we do an AMD specific PCI config reset, followed by an SMC
 * reset.  The PCI config reset only works if SMC firmware is running, so we
 * have a dependency on the state of the device as to whether this reset will
 * be effective.  There are still cases where we won't be able to kick the
 * device into working, but this greatly improves the usability overall.  The
 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
 * poking is largely ASIC specific.
 */
1332static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
1333{
1334    uint32_t clk, pc_c;
1335
1336    /*
1337     * Registers 200h and 204h are index and data registers for accessing
1338     * indirect configuration registers within the device.
1339     */
1340    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1341    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1342    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
1343    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1344
1345    return (!(clk & 1) && (0x20100 <= pc_c));
1346}
1347
/*
 * The scope of a config reset is controlled by a mode bit in the misc register
 * and a fuse, exposed as a bit in another register.  The fuse is the default
 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
 * the fuse.  A truth table therefore tells us that if misc == fuse, we need
 * to flip the value of the bit in the misc register.
 */
1356static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
1357{
1358    uint32_t misc, fuse;
1359    bool a, b;
1360
1361    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
1362    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1363    b = fuse & 64;
1364
1365    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
1366    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1367    a = misc & 2;
1368
1369    if (a == b) {
1370        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
1371        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
1372    }
1373}
1374
1375static int vfio_radeon_reset(VFIOPCIDevice *vdev)
1376{
1377    PCIDevice *pdev = &vdev->pdev;
1378    int i, ret = 0;
1379    uint32_t data;
1380
1381    /* Defer to a kernel implemented reset */
1382    if (vdev->vbasedev.reset_works) {
1383        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
1384        return -ENODEV;
1385    }
1386
1387    /* Enable only memory BAR access */
1388    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
1389
1390    /* Reset only works if SMC firmware is loaded and running */
1391    if (!vfio_radeon_smc_is_running(vdev)) {
1392        ret = -EINVAL;
1393        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
1394        goto out;
1395    }
1396
1397    /* Make sure only the GFX function is reset */
1398    vfio_radeon_set_gfx_only_reset(vdev);
1399
1400    /* AMD PCI config reset */
1401    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
1402    usleep(100);
1403
1404    /* Read back the memory size to make sure we're out of reset */
1405    for (i = 0; i < 100000; i++) {
1406        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
1407            goto reset_smc;
1408        }
1409        usleep(1);
1410    }
1411
1412    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);
1413
1414reset_smc:
1415    /* Reset SMC */
1416    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
1417    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1418    data |= 1;
1419    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1420
1421    /* Disable SMC clock */
1422    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1423    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1424    data |= 1;
1425    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
1426
1427    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);
1428
1429out:
1430    /* Restore PCI command register */
1431    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
1432
1433    return ret;
1434}
1435
1436void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
1437{
1438    switch (vdev->vendor_id) {
1439    case 0x1002:
1440        switch (vdev->device_id) {
1441        /* Bonaire */
1442        case 0x6649: /* Bonaire [FirePro W5100] */
1443        case 0x6650:
1444        case 0x6651:
1445        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
1446        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
1447        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
1448        /* Hawaii */
1449        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
1450        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
1451        case 0x67A2:
1452        case 0x67A8:
1453        case 0x67A9:
1454        case 0x67AA:
1455        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
1456        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
1457        case 0x67B8:
1458        case 0x67B9:
1459        case 0x67BA:
1460        case 0x67BE:
1461            vdev->resetfn = vfio_radeon_reset;
1462            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
1463            break;
1464        }
1465        break;
1466    }
1467}
1468
/*
 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
 * devices as a member of a clique.  Devices within the same clique ID
 * are capable of direct P2P.  It's the user's responsibility that this
 * is correct.  The spec says that this may reside at any unused config
 * offset, but reserves and recommends hypervisors place this at C8h.
 * The spec also states that the hypervisor should place this capability
 * at the end of the capability list, thus next is defined as 0h.
 *
 * +----------------+----------------+----------------+----------------+
 * | sig 7:0 ('P')  |  vndr len (8h) |    next (0h)   |   cap id (9h)  |
 * +----------------+----------------+----------------+----------------+
 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
 * +---------------------------------+---------------------------------+
 *
 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
 */
1486static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1487                                       const char *name, void *opaque,
1488                                       Error **errp)
1489{
1490    DeviceState *dev = DEVICE(obj);
1491    Property *prop = opaque;
1492    uint8_t *ptr = qdev_get_prop_ptr(dev, prop);
1493
1494    visit_type_uint8(v, name, ptr, errp);
1495}
1496
1497static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
1498                                       const char *name, void *opaque,
1499                                       Error **errp)
1500{
1501    DeviceState *dev = DEVICE(obj);
1502    Property *prop = opaque;
1503    uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
1504    Error *local_err = NULL;
1505
1506    if (dev->realized) {
1507        qdev_prop_set_after_realize(dev, name, errp);
1508        return;
1509    }
1510
1511    visit_type_uint8(v, name, &value, &local_err);
1512    if (local_err) {
1513        error_propagate(errp, local_err);
1514        return;
1515    }
1516
1517    if (value & ~0xF) {
1518        error_setg(errp, "Property %s: valid range 0-15", name);
1519        return;
1520    }
1521
1522    *ptr = value;
1523}
1524
1525const PropertyInfo qdev_prop_nv_gpudirect_clique = {
1526    .name = "uint4",
1527    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
1528    .get = get_nv_gpudirect_clique_id,
1529    .set = set_nv_gpudirect_clique_id,
1530};
1531
1532static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
1533{
1534    PCIDevice *pdev = &vdev->pdev;
1535    int ret, pos = 0xC8;
1536
1537    if (vdev->nv_gpudirect_clique == 0xFF) {
1538        return 0;
1539    }
1540
1541    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
1542        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
1543        return -EINVAL;
1544    }
1545
1546    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
1547        PCI_BASE_CLASS_DISPLAY) {
1548        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
1549        return -EINVAL;
1550    }
1551
1552    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
1553    if (ret < 0) {
1554        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
1555        return ret;
1556    }
1557
1558    memset(vdev->emulated_config_bits + pos, 0xFF, 8);
1559    pos += PCI_CAP_FLAGS;
1560    pci_set_byte(pdev->config + pos++, 8);
1561    pci_set_byte(pdev->config + pos++, 'P');
1562    pci_set_byte(pdev->config + pos++, '2');
1563    pci_set_byte(pdev->config + pos++, 'P');
1564    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
1565    pci_set_byte(pdev->config + pos, 0);
1566
1567    return 0;
1568}
1569
1570int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
1571{
1572    int ret;
1573
1574    ret = vfio_add_nv_gpudirect_cap(vdev, errp);
1575    if (ret) {
1576        return ret;
1577    }
1578
1579    return 0;
1580}
1581
1582static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
1583                                     const char *name,
1584                                     void *opaque, Error **errp)
1585{
1586    uint64_t tgt = (uintptr_t) opaque;
1587    visit_type_uint64(v, name, &tgt, errp);
1588}
1589
1590static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
1591                                                 const char *name,
1592                                                 void *opaque, Error **errp)
1593{
1594    uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
1595    visit_type_uint32(v, name, &link_speed, errp);
1596}
1597
1598int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
1599{
1600    int ret;
1601    void *p;
1602    struct vfio_region_info *nv2reg = NULL;
1603    struct vfio_info_cap_header *hdr;
1604    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
1605    VFIOQuirk *quirk;
1606
1607    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1608                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1609                                   PCI_VENDOR_ID_NVIDIA,
1610                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
1611                                   &nv2reg);
1612    if (ret) {
1613        return ret;
1614    }
1615
1616    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1617    if (!hdr) {
1618        ret = -ENODEV;
1619        goto free_exit;
1620    }
1621    cap = (void *) hdr;
1622
1623    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
1624             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
1625    if (p == MAP_FAILED) {
1626        ret = -errno;
1627        goto free_exit;
1628    }
1629
1630    quirk = vfio_quirk_alloc(1);
1631    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
1632                               nv2reg->size, p);
1633    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1634
1635    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1636                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
1637                        (void *) (uintptr_t) cap->tgt, NULL);
1638    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
1639                                          nv2reg->size);
1640free_exit:
1641    g_free(nv2reg);
1642
1643    return ret;
1644}
1645
1646int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
1647{
1648    int ret;
1649    void *p;
1650    struct vfio_region_info *atsdreg = NULL;
1651    struct vfio_info_cap_header *hdr;
1652    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
1653    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
1654    VFIOQuirk *quirk;
1655
1656    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1657                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
1658                                   PCI_VENDOR_ID_IBM,
1659                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
1660                                   &atsdreg);
1661    if (ret) {
1662        return ret;
1663    }
1664
1665    hdr = vfio_get_region_info_cap(atsdreg,
1666                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
1667    if (!hdr) {
1668        ret = -ENODEV;
1669        goto free_exit;
1670    }
1671    captgt = (void *) hdr;
1672
1673    hdr = vfio_get_region_info_cap(atsdreg,
1674                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
1675    if (!hdr) {
1676        ret = -ENODEV;
1677        goto free_exit;
1678    }
1679    capspeed = (void *) hdr;
1680
1681    /* Some NVLink bridges may not have assigned ATSD */
1682    if (atsdreg->size) {
1683        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
1684                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
1685        if (p == MAP_FAILED) {
1686            ret = -errno;
1687            goto free_exit;
1688        }
1689
1690        quirk = vfio_quirk_alloc(1);
1691        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
1692                                          "nvlink2-atsd-mr", atsdreg->size, p);
1693        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
1694    }
1695
1696    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
1697                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
1698                        (void *) (uintptr_t) captgt->tgt, NULL);
1699    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
1700                                              atsdreg->size);
1701
1702    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
1703                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
1704                        (void *) (uintptr_t) capspeed->link_speed, NULL);
1705    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
1706                                              capspeed->link_speed);
1707free_exit:
1708    g_free(atsdreg);
1709
1710    return ret;
1711}
1712