qemu/hw/vfio/pci-quirks.c
<<
>>
Prefs
   1/*
   2 * device quirks for PCI devices
   3 *
   4 * Copyright Red Hat, Inc. 2012-2015
   5 *
   6 * Authors:
   7 *  Alex Williamson <alex.williamson@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "exec/memop.h"
  15#include "qemu/units.h"
  16#include "qemu/error-report.h"
  17#include "qemu/main-loop.h"
  18#include "qemu/module.h"
  19#include "qemu/range.h"
  20#include "qapi/error.h"
  21#include "qapi/visitor.h"
  22#include <sys/ioctl.h>
  23#include "hw/hw.h"
  24#include "hw/nvram/fw_cfg.h"
  25#include "hw/qdev-properties.h"
  26#include "pci.h"
  27#include "trace.h"
  28
  29/* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
  30static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device)
  31{
  32    return (vendor == PCI_ANY_ID || vendor == vdev->vendor_id) &&
  33           (device == PCI_ANY_ID || device == vdev->device_id);
  34}
  35
  36static bool vfio_is_vga(VFIOPCIDevice *vdev)
  37{
  38    PCIDevice *pdev = &vdev->pdev;
  39    uint16_t class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
  40
  41    return class == PCI_CLASS_DISPLAY_VGA;
  42}
  43
/*
 * List of device ids/vendor ids for which to disable
 * option rom loading. This avoids the guest hangs during rom
 * execution as noticed with the BCM 57810 card, for lack of a
 * better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang. When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue.
 */
static const struct {
    uint32_t vendor;
    uint32_t device;
} romblacklist[] = {
    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
};
  63
  64bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
  65{
  66    int i;
  67
  68    for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) {
  69        if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) {
  70            trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name,
  71                                             romblacklist[i].vendor,
  72                                             romblacklist[i].device);
  73            return true;
  74        }
  75    }
  76    return false;
  77}
  78
  79/*
  80 * Device specific region quirks (mostly backdoors to PCI config space)
  81 */
  82
  83/*
  84 * The generic window quirks operate on an address and data register,
  85 * vfio_generic_window_address_quirk handles the address register and
  86 * vfio_generic_window_data_quirk handles the data register.  These ops
  87 * pass reads and writes through to hardware until a value matching the
  88 * stored address match/mask is written.  When this occurs, the data
 * register accesses emulated PCI config space for the device rather than
  90 * passing through accesses.  This enables devices where PCI config space
  91 * is accessible behind a window register to maintain the virtualization
  92 * provided through vfio.
  93 */
/*
 * One pattern for the window address register: a written value selects
 * config space when (value & ~mask) == match, and (value & mask) is then
 * kept as the config space offset.
 */
typedef struct VFIOConfigWindowMatch {
    uint32_t match;
    uint32_t mask;
} VFIOConfigWindowMatch;
  98
/* State shared by the address and data register ops of one window quirk */
typedef struct VFIOConfigWindowQuirk {
    struct VFIOPCIDevice *vdev;

    uint32_t address_val; /* masked value of the last matching address write */

    uint32_t address_offset; /* address register offset within the BAR */
    uint32_t data_offset; /* data register offset within the BAR */

    bool window_enabled; /* true while the last address write matched */
    uint8_t bar; /* index into vdev->bars[] */

    MemoryRegion *addr_mem;
    MemoryRegion *data_mem;

    uint32_t nr_matches; /* number of entries in matches[] */
    VFIOConfigWindowMatch matches[];
} VFIOConfigWindowQuirk;
 116
 117static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
 118                                                       hwaddr addr,
 119                                                       unsigned size)
 120{
 121    VFIOConfigWindowQuirk *window = opaque;
 122    VFIOPCIDevice *vdev = window->vdev;
 123
 124    return vfio_region_read(&vdev->bars[window->bar].region,
 125                            addr + window->address_offset, size);
 126}
 127
 128static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
 129                                                    uint64_t data,
 130                                                    unsigned size)
 131{
 132    VFIOConfigWindowQuirk *window = opaque;
 133    VFIOPCIDevice *vdev = window->vdev;
 134    int i;
 135
 136    window->window_enabled = false;
 137
 138    vfio_region_write(&vdev->bars[window->bar].region,
 139                      addr + window->address_offset, data, size);
 140
 141    for (i = 0; i < window->nr_matches; i++) {
 142        if ((data & ~window->matches[i].mask) == window->matches[i].match) {
 143            window->window_enabled = true;
 144            window->address_val = data & window->matches[i].mask;
 145            trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
 146                                    memory_region_name(window->addr_mem), data);
 147            break;
 148        }
 149    }
 150}
 151
 152static const MemoryRegionOps vfio_generic_window_address_quirk = {
 153    .read = vfio_generic_window_quirk_address_read,
 154    .write = vfio_generic_window_quirk_address_write,
 155    .endianness = DEVICE_LITTLE_ENDIAN,
 156};
 157
 158static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
 159                                                    hwaddr addr, unsigned size)
 160{
 161    VFIOConfigWindowQuirk *window = opaque;
 162    VFIOPCIDevice *vdev = window->vdev;
 163    uint64_t data;
 164
 165    /* Always read data reg, discard if window enabled */
 166    data = vfio_region_read(&vdev->bars[window->bar].region,
 167                            addr + window->data_offset, size);
 168
 169    if (window->window_enabled) {
 170        data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
 171        trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
 172                                    memory_region_name(window->data_mem), data);
 173    }
 174
 175    return data;
 176}
 177
 178static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
 179                                                 uint64_t data, unsigned size)
 180{
 181    VFIOConfigWindowQuirk *window = opaque;
 182    VFIOPCIDevice *vdev = window->vdev;
 183
 184    if (window->window_enabled) {
 185        vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
 186        trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
 187                                    memory_region_name(window->data_mem), data);
 188        return;
 189    }
 190
 191    vfio_region_write(&vdev->bars[window->bar].region,
 192                      addr + window->data_offset, data, size);
 193}
 194
 195static const MemoryRegionOps vfio_generic_window_data_quirk = {
 196    .read = vfio_generic_window_quirk_data_read,
 197    .write = vfio_generic_window_quirk_data_write,
 198    .endianness = DEVICE_LITTLE_ENDIAN,
 199};
 200
 201/*
 202 * The generic mirror quirk handles devices which expose PCI config space
 203 * through a region within a BAR.  When enabled, reads and writes are
 204 * redirected through to emulated PCI config space.  XXX if PCI config space
 205 * used memory regions, this could just be an alias.
 206 */
/* State for one config space mirror quirk */
typedef struct VFIOConfigMirrorQuirk {
    struct VFIOPCIDevice *vdev;
    uint32_t offset; /* offset of the mirror within the BAR */
    uint8_t bar; /* index into vdev->bars[] */
    MemoryRegion *mem;
    uint8_t data[];
} VFIOConfigMirrorQuirk;
 214
 215static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
 216                                               hwaddr addr, unsigned size)
 217{
 218    VFIOConfigMirrorQuirk *mirror = opaque;
 219    VFIOPCIDevice *vdev = mirror->vdev;
 220    uint64_t data;
 221
 222    /* Read and discard in case the hardware cares */
 223    (void)vfio_region_read(&vdev->bars[mirror->bar].region,
 224                           addr + mirror->offset, size);
 225
 226    data = vfio_pci_read_config(&vdev->pdev, addr, size);
 227    trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
 228                                         memory_region_name(mirror->mem),
 229                                         addr, data);
 230    return data;
 231}
 232
 233static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
 234                                            uint64_t data, unsigned size)
 235{
 236    VFIOConfigMirrorQuirk *mirror = opaque;
 237    VFIOPCIDevice *vdev = mirror->vdev;
 238
 239    vfio_pci_write_config(&vdev->pdev, addr, data, size);
 240    trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
 241                                          memory_region_name(mirror->mem),
 242                                          addr, data);
 243}
 244
 245static const MemoryRegionOps vfio_generic_mirror_quirk = {
 246    .read = vfio_generic_quirk_mirror_read,
 247    .write = vfio_generic_quirk_mirror_write,
 248    .endianness = DEVICE_LITTLE_ENDIAN,
 249};
 250
 251/* Is range1 fully contained within range2?  */
 252static bool vfio_range_contained(uint64_t first1, uint64_t len1,
 253                                 uint64_t first2, uint64_t len2) {
 254    return (first1 >= first2 && first1 + len1 <= first2 + len2);
 255}
 256
 257#define PCI_VENDOR_ID_ATI               0x1002
 258
 259/*
 260 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 261 * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
 262 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 263 * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
 264 * I/O port BAR address.  Originally this was coded to return the virtual BAR
 265 * address only if the physical register read returns the actual BAR address,
 266 * but users have reported greater success if we return the virtual address
 267 * unconditionally.
 268 */
 269static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
 270                                        hwaddr addr, unsigned size)
 271{
 272    VFIOPCIDevice *vdev = opaque;
 273    uint64_t data = vfio_pci_read_config(&vdev->pdev,
 274                                         PCI_BASE_ADDRESS_4 + 1, size);
 275
 276    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
 277
 278    return data;
 279}
 280
 281static const MemoryRegionOps vfio_ati_3c3_quirk = {
 282    .read = vfio_ati_3c3_quirk_read,
 283    .endianness = DEVICE_LITTLE_ENDIAN,
 284};
 285
 286static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
 287{
 288    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
 289    QLIST_INIT(&quirk->ioeventfds);
 290    quirk->mem = g_new0(MemoryRegion, nr_mem);
 291    quirk->nr_mem = nr_mem;
 292
 293    return quirk;
 294}
 295
 296static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
 297{
 298    QLIST_REMOVE(ioeventfd, next);
 299    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 300                              true, ioeventfd->data, &ioeventfd->e);
 301
 302    if (ioeventfd->vfio) {
 303        struct vfio_device_ioeventfd vfio_ioeventfd;
 304
 305        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 306        vfio_ioeventfd.flags = ioeventfd->size;
 307        vfio_ioeventfd.data = ioeventfd->data;
 308        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 309                                ioeventfd->region_addr;
 310        vfio_ioeventfd.fd = -1;
 311
 312        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
 313            error_report("Failed to remove vfio ioeventfd for %s+0x%"
 314                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
 315                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
 316                         ioeventfd->size, ioeventfd->data);
 317        }
 318    } else {
 319        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 320                            NULL, NULL, NULL);
 321    }
 322
 323    event_notifier_cleanup(&ioeventfd->e);
 324    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
 325                              (uint64_t)ioeventfd->addr, ioeventfd->size,
 326                              ioeventfd->data);
 327    g_free(ioeventfd);
 328}
 329
 330static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 331{
 332    VFIOIOEventFD *ioeventfd, *tmp;
 333
 334    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
 335        if (ioeventfd->dynamic) {
 336            vfio_ioeventfd_exit(vdev, ioeventfd);
 337        }
 338    }
 339}
 340
 341static void vfio_ioeventfd_handler(void *opaque)
 342{
 343    VFIOIOEventFD *ioeventfd = opaque;
 344
 345    if (event_notifier_test_and_clear(&ioeventfd->e)) {
 346        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
 347                          ioeventfd->data, ioeventfd->size);
 348        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
 349                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
 350                                     ioeventfd->data);
 351    }
 352}
 353
 354static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
 355                                          MemoryRegion *mr, hwaddr addr,
 356                                          unsigned size, uint64_t data,
 357                                          VFIORegion *region,
 358                                          hwaddr region_addr, bool dynamic)
 359{
 360    VFIOIOEventFD *ioeventfd;
 361
 362    if (vdev->no_kvm_ioeventfd) {
 363        return NULL;
 364    }
 365
 366    ioeventfd = g_malloc0(sizeof(*ioeventfd));
 367
 368    if (event_notifier_init(&ioeventfd->e, 0)) {
 369        g_free(ioeventfd);
 370        return NULL;
 371    }
 372
 373    /*
 374     * MemoryRegion and relative offset, plus additional ioeventfd setup
 375     * parameters for configuring and later tearing down KVM ioeventfd.
 376     */
 377    ioeventfd->mr = mr;
 378    ioeventfd->addr = addr;
 379    ioeventfd->size = size;
 380    ioeventfd->data = data;
 381    ioeventfd->dynamic = dynamic;
 382    /*
 383     * VFIORegion and relative offset for implementing the userspace
 384     * handler.  data & size fields shared for both uses.
 385     */
 386    ioeventfd->region = region;
 387    ioeventfd->region_addr = region_addr;
 388
 389    if (!vdev->no_vfio_ioeventfd) {
 390        struct vfio_device_ioeventfd vfio_ioeventfd;
 391
 392        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
 393        vfio_ioeventfd.flags = ioeventfd->size;
 394        vfio_ioeventfd.data = ioeventfd->data;
 395        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
 396                                ioeventfd->region_addr;
 397        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);
 398
 399        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
 400                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
 401    }
 402
 403    if (!ioeventfd->vfio) {
 404        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
 405                            vfio_ioeventfd_handler, NULL, ioeventfd);
 406    }
 407
 408    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
 409                              true, ioeventfd->data, &ioeventfd->e);
 410    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
 411                              size, data, ioeventfd->vfio);
 412
 413    return ioeventfd;
 414}
 415
 416static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
 417{
 418    VFIOQuirk *quirk;
 419
 420    /*
 421     * As long as the BAR is >= 256 bytes it will be aligned such that the
 422     * lower byte is always zero.  Filter out anything else, if it exists.
 423     */
 424    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 425        !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
 426        return;
 427    }
 428
 429    quirk = vfio_quirk_alloc(1);
 430
 431    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
 432                          "vfio-ati-3c3-quirk", 1);
 433    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 434                                3 /* offset 3 bytes from 0x3c0 */, quirk->mem);
 435
 436    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 437                      quirk, next);
 438
 439    trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
 440}
 441
 442/*
 443 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
 444 * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
 445 * the MMIO space directly, but a window to this space is provided through
 446 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 447 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 448 * PCI configuration space is available.  Experimentation seems to indicate
 449 * that read-only may be provided by hardware.
 450 */
 451static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
 452{
 453    VFIOQuirk *quirk;
 454    VFIOConfigWindowQuirk *window;
 455
 456    /* This windows doesn't seem to be used except by legacy VGA code */
 457    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 458        !vdev->vga || nr != 4) {
 459        return;
 460    }
 461
 462    quirk = vfio_quirk_alloc(2);
 463    window = quirk->data = g_malloc0(sizeof(*window) +
 464                                     sizeof(VFIOConfigWindowMatch));
 465    window->vdev = vdev;
 466    window->address_offset = 0;
 467    window->data_offset = 4;
 468    window->nr_matches = 1;
 469    window->matches[0].match = 0x4000;
 470    window->matches[0].mask = vdev->config_size - 1;
 471    window->bar = nr;
 472    window->addr_mem = &quirk->mem[0];
 473    window->data_mem = &quirk->mem[1];
 474
 475    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 476                          &vfio_generic_window_address_quirk, window,
 477                          "vfio-ati-bar4-window-address-quirk", 4);
 478    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 479                                        window->address_offset,
 480                                        window->addr_mem, 1);
 481
 482    memory_region_init_io(window->data_mem, OBJECT(vdev),
 483                          &vfio_generic_window_data_quirk, window,
 484                          "vfio-ati-bar4-window-data-quirk", 4);
 485    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 486                                        window->data_offset,
 487                                        window->data_mem, 1);
 488
 489    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 490
 491    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
 492}
 493
 494/*
 495 * Trap the BAR2 MMIO mirror to config space as well.
 496 */
 497static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
 498{
 499    VFIOQuirk *quirk;
 500    VFIOConfigMirrorQuirk *mirror;
 501
 502    /* Only enable on newer devices where BAR2 is 64bit */
 503    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
 504        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
 505        return;
 506    }
 507
 508    quirk = vfio_quirk_alloc(1);
 509    mirror = quirk->data = g_malloc0(sizeof(*mirror));
 510    mirror->mem = quirk->mem;
 511    mirror->vdev = vdev;
 512    mirror->offset = 0x4000;
 513    mirror->bar = nr;
 514
 515    memory_region_init_io(mirror->mem, OBJECT(vdev),
 516                          &vfio_generic_mirror_quirk, mirror,
 517                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
 518    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 519                                        mirror->offset, mirror->mem, 1);
 520
 521    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 522
 523    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
 524}
 525
 526/*
 527 * Older ATI/AMD cards like the X550 have a similar window to that above.
 528 * I/O port BAR1 provides a window to a mirror of PCI config space located
 529 * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
 530 * note it for future reference.
 531 */
 532
 533/*
 534 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 536 * https://github.com/pathscale/envytools/tree/master/hwdocs
 537 *
 538 * The first quirk is actually not documented in envytools and is found
 539 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 540 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 541 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 542 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 543 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 544 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 545 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
 546 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 547 */
/*
 * Progress through the 0x3d4/0x3d0 backdoor sequence: NONE -> SELECT (0x338
 * written to 0x3d4) -> WINDOW (offset written to 0x3d0) -> READ or WRITE
 * (0x538 or 0x738 written to 0x3d4).  nv3d0_states[] is indexed by the enum
 * value to get a name for tracing, so both must stay in the same order.
 */
typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
static const char *nv3d0_states[] = { "NONE", "SELECT",
                                      "WINDOW", "READ", "WRITE" };
 551
/* State shared by the 0x3d4 and 0x3d0 quirk ops of one device */
typedef struct VFIONvidia3d0Quirk {
    VFIOPCIDevice *vdev;
    VFIONvidia3d0State state; /* current position in the access sequence */
    uint32_t offset; /* value written to 0x3d0 while in SELECT state */
} VFIONvidia3d0Quirk;
 557
 558static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
 559                                           hwaddr addr, unsigned size)
 560{
 561    VFIONvidia3d0Quirk *quirk = opaque;
 562    VFIOPCIDevice *vdev = quirk->vdev;
 563
 564    quirk->state = NONE;
 565
 566    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 567                         addr + 0x14, size);
 568}
 569
 570static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
 571                                        uint64_t data, unsigned size)
 572{
 573    VFIONvidia3d0Quirk *quirk = opaque;
 574    VFIOPCIDevice *vdev = quirk->vdev;
 575    VFIONvidia3d0State old_state = quirk->state;
 576
 577    quirk->state = NONE;
 578
 579    switch (data) {
 580    case 0x338:
 581        if (old_state == NONE) {
 582            quirk->state = SELECT;
 583            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 584                                              nv3d0_states[quirk->state]);
 585        }
 586        break;
 587    case 0x538:
 588        if (old_state == WINDOW) {
 589            quirk->state = READ;
 590            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 591                                              nv3d0_states[quirk->state]);
 592        }
 593        break;
 594    case 0x738:
 595        if (old_state == WINDOW) {
 596            quirk->state = WRITE;
 597            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 598                                              nv3d0_states[quirk->state]);
 599        }
 600        break;
 601    }
 602
 603    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 604                   addr + 0x14, data, size);
 605}
 606
 607static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
 608    .read = vfio_nvidia_3d4_quirk_read,
 609    .write = vfio_nvidia_3d4_quirk_write,
 610    .endianness = DEVICE_LITTLE_ENDIAN,
 611};
 612
 613static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
 614                                           hwaddr addr, unsigned size)
 615{
 616    VFIONvidia3d0Quirk *quirk = opaque;
 617    VFIOPCIDevice *vdev = quirk->vdev;
 618    VFIONvidia3d0State old_state = quirk->state;
 619    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 620                                  addr + 0x10, size);
 621
 622    quirk->state = NONE;
 623
 624    if (old_state == READ &&
 625        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 626        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 627
 628        data = vfio_pci_read_config(&vdev->pdev, offset, size);
 629        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
 630                                         offset, size, data);
 631    }
 632
 633    return data;
 634}
 635
 636static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
 637                                        uint64_t data, unsigned size)
 638{
 639    VFIONvidia3d0Quirk *quirk = opaque;
 640    VFIOPCIDevice *vdev = quirk->vdev;
 641    VFIONvidia3d0State old_state = quirk->state;
 642
 643    quirk->state = NONE;
 644
 645    if (old_state == SELECT) {
 646        quirk->offset = (uint32_t)data;
 647        quirk->state = WINDOW;
 648        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 649                                          nv3d0_states[quirk->state]);
 650    } else if (old_state == WRITE) {
 651        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 652            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 653
 654            vfio_pci_write_config(&vdev->pdev, offset, data, size);
 655            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
 656                                              offset, data, size);
 657            return;
 658        }
 659    }
 660
 661    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 662                   addr + 0x10, data, size);
 663}
 664
 665static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
 666    .read = vfio_nvidia_3d0_quirk_read,
 667    .write = vfio_nvidia_3d0_quirk_write,
 668    .endianness = DEVICE_LITTLE_ENDIAN,
 669};
 670
 671static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
 672{
 673    VFIOQuirk *quirk;
 674    VFIONvidia3d0Quirk *data;
 675
 676    if (vdev->no_geforce_quirks ||
 677        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 678        !vdev->bars[1].region.size) {
 679        return;
 680    }
 681
 682    quirk = vfio_quirk_alloc(2);
 683    quirk->data = data = g_malloc0(sizeof(*data));
 684    data->vdev = vdev;
 685
 686    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
 687                          data, "vfio-nvidia-3d4-quirk", 2);
 688    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 689                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);
 690
 691    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
 692                          data, "vfio-nvidia-3d0-quirk", 2);
 693    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
 694                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);
 695
 696    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
 697                      quirk, next);
 698
 699    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
 700}
 701
/*
 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 * again BAR0.  This backdoor is apparently a bit newer than the one above,
 * so we need to trap not only the 256 bytes @0x1800 but also the 4k
 * @0x88000, where all of PCI config space, including extended space, is
 * available.
 */
/* Window quirk state for BAR5 plus the master/enable gating around it */
typedef struct VFIONvidiaBAR5Quirk {
    uint32_t master; /* last value written via the master quirk region */
    uint32_t enable; /* last value written via the enable quirk region */
    MemoryRegion *addr_mem;
    MemoryRegion *data_mem;
    bool enabled; /* whether addr_mem/data_mem are currently enabled */
    VFIOConfigWindowQuirk window; /* last for match data */
} VFIONvidiaBAR5Quirk;
 717
 718static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
 719{
 720    VFIOPCIDevice *vdev = bar5->window.vdev;
 721
 722    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
 723        return;
 724    }
 725
 726    bar5->enabled = !bar5->enabled;
 727    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
 728                                       bar5->enabled ?  "Enable" : "Disable");
 729    memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
 730    memory_region_set_enabled(bar5->data_mem, bar5->enabled);
 731}
 732
 733static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
 734                                                   hwaddr addr, unsigned size)
 735{
 736    VFIONvidiaBAR5Quirk *bar5 = opaque;
 737    VFIOPCIDevice *vdev = bar5->window.vdev;
 738
 739    return vfio_region_read(&vdev->bars[5].region, addr, size);
 740}
 741
 742static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
 743                                                uint64_t data, unsigned size)
 744{
 745    VFIONvidiaBAR5Quirk *bar5 = opaque;
 746    VFIOPCIDevice *vdev = bar5->window.vdev;
 747
 748    vfio_region_write(&vdev->bars[5].region, addr, data, size);
 749
 750    bar5->master = data;
 751    vfio_nvidia_bar5_enable(bar5);
 752}
 753
 754static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
 755    .read = vfio_nvidia_bar5_quirk_master_read,
 756    .write = vfio_nvidia_bar5_quirk_master_write,
 757    .endianness = DEVICE_LITTLE_ENDIAN,
 758};
 759
 760static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
 761                                                   hwaddr addr, unsigned size)
 762{
 763    VFIONvidiaBAR5Quirk *bar5 = opaque;
 764    VFIOPCIDevice *vdev = bar5->window.vdev;
 765
 766    return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
 767}
 768
 769static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
 770                                                uint64_t data, unsigned size)
 771{
 772    VFIONvidiaBAR5Quirk *bar5 = opaque;
 773    VFIOPCIDevice *vdev = bar5->window.vdev;
 774
 775    vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
 776
 777    bar5->enable = data;
 778    vfio_nvidia_bar5_enable(bar5);
 779}
 780
 781static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
 782    .read = vfio_nvidia_bar5_quirk_enable_read,
 783    .write = vfio_nvidia_bar5_quirk_enable_write,
 784    .endianness = DEVICE_LITTLE_ENDIAN,
 785};
 786
 787static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
 788{
 789    VFIOQuirk *quirk;
 790    VFIONvidiaBAR5Quirk *bar5;
 791    VFIOConfigWindowQuirk *window;
 792
 793    if (vdev->no_geforce_quirks ||
 794        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 795        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
 796        return;
 797    }
 798
 799    quirk = vfio_quirk_alloc(4);
 800    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
 801                                   (sizeof(VFIOConfigWindowMatch) * 2));
 802    window = &bar5->window;
 803
 804    window->vdev = vdev;
 805    window->address_offset = 0x8;
 806    window->data_offset = 0xc;
 807    window->nr_matches = 2;
 808    window->matches[0].match = 0x1800;
 809    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
 810    window->matches[1].match = 0x88000;
 811    window->matches[1].mask = vdev->config_size - 1;
 812    window->bar = nr;
 813    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
 814    window->data_mem = bar5->data_mem = &quirk->mem[1];
 815
 816    memory_region_init_io(window->addr_mem, OBJECT(vdev),
 817                          &vfio_generic_window_address_quirk, window,
 818                          "vfio-nvidia-bar5-window-address-quirk", 4);
 819    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 820                                        window->address_offset,
 821                                        window->addr_mem, 1);
 822    memory_region_set_enabled(window->addr_mem, false);
 823
 824    memory_region_init_io(window->data_mem, OBJECT(vdev),
 825                          &vfio_generic_window_data_quirk, window,
 826                          "vfio-nvidia-bar5-window-data-quirk", 4);
 827    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 828                                        window->data_offset,
 829                                        window->data_mem, 1);
 830    memory_region_set_enabled(window->data_mem, false);
 831
 832    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
 833                          &vfio_nvidia_bar5_quirk_master, bar5,
 834                          "vfio-nvidia-bar5-master-quirk", 4);
 835    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 836                                        0, &quirk->mem[2], 1);
 837
 838    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
 839                          &vfio_nvidia_bar5_quirk_enable, bar5,
 840                          "vfio-nvidia-bar5-enable-quirk", 4);
 841    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 842                                        4, &quirk->mem[3], 1);
 843
 844    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 845
 846    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
 847}
 848
/*
 * Bookkeeping used by the NVIDIA BAR0 mirror write handler to detect
 * repeated identical writes and turn them into ioeventfds.  It lives in the
 * trailing data area of a VFIOConfigMirrorQuirk.
 */
typedef struct LastDataSet {
    VFIOQuirk *quirk;   /* quirk whose ioeventfds list receives new entries */
    hwaddr addr;        /* address of the most recently tracked write */
    uint64_t data;      /* data of that write */
    unsigned size;      /* access size of that write */
    int hits;           /* successive writes matching addr/data/size */
    int added;          /* ioeventfds added; bumped once more on overflow */
} LastDataSet;

#define MAX_DYN_IOEVENTFD 10  /* cap on dynamically added ioeventfds */
#define HITS_FOR_IOEVENTFD 10 /* identical successive writes to trigger one */
 860
 861/*
 862 * Finally, BAR0 itself.  We want to redirect any accesses to either
 863 * 0x1800 or 0x88000 through the PCI config space access functions.
 864 */
 865static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
 866                                           uint64_t data, unsigned size)
 867{
 868    VFIOConfigMirrorQuirk *mirror = opaque;
 869    VFIOPCIDevice *vdev = mirror->vdev;
 870    PCIDevice *pdev = &vdev->pdev;
 871    LastDataSet *last = (LastDataSet *)&mirror->data;
 872
 873    vfio_generic_quirk_mirror_write(opaque, addr, data, size);
 874
 875    /*
 876     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
 877     * MSI capability ID register.  Both the ID and next register are
 878     * read-only, so we allow writes covering either of those to real hw.
 879     */
 880    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
 881        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
 882        vfio_region_write(&vdev->bars[mirror->bar].region,
 883                          addr + mirror->offset, data, size);
 884        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
 885    }
 886
 887    /*
 888     * Automatically add an ioeventfd to handle any repeated write with the
 889     * same data and size above the standard PCI config space header.  This is
 890     * primarily expected to accelerate the MSI-ACK behavior, such as noted
 891     * above.  Current hardware/drivers should trigger an ioeventfd at config
 892     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
 893     *
 894     * The criteria of 10 successive hits is arbitrary but reliably adds the
 895     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
 896     * the remaining ones have a greater chance of being seen successively.
 897     * To avoid the pathological case of burning up all of QEMU's open file
 898     * handles, arbitrarily limit this algorithm from adding no more than 10
 899     * ioeventfds, print an error if we would have added an 11th, and then
 900     * stop counting.
 901     */
 902    if (!vdev->no_kvm_ioeventfd &&
 903        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
 904        if (addr != last->addr || data != last->data || size != last->size) {
 905            last->addr = addr;
 906            last->data = data;
 907            last->size = size;
 908            last->hits = 1;
 909        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
 910            if (last->added < MAX_DYN_IOEVENTFD) {
 911                VFIOIOEventFD *ioeventfd;
 912                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
 913                                        data, &vdev->bars[mirror->bar].region,
 914                                        mirror->offset + addr, true);
 915                if (ioeventfd) {
 916                    VFIOQuirk *quirk = last->quirk;
 917
 918                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
 919                    last->added++;
 920                }
 921            } else {
 922                last->added++;
 923                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
 924                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
 925                            "size %u", vdev->vbasedev.name, addr, data, size);
 926            }
 927        }
 928    }
 929}
 930
/*
 * Access callbacks for the NVIDIA BAR0 config space mirrors: reads use the
 * generic mirror handler, writes go through the NVIDIA-specific handler.
 */
static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_nvidia_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 936
 937static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 938{
 939    VFIOConfigMirrorQuirk *mirror = quirk->data;
 940    LastDataSet *last = (LastDataSet *)&mirror->data;
 941
 942    last->addr = last->data = last->size = last->hits = last->added = 0;
 943
 944    vfio_drop_dynamic_eventfds(vdev, quirk);
 945}
 946
 947static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
 948{
 949    VFIOQuirk *quirk;
 950    VFIOConfigMirrorQuirk *mirror;
 951    LastDataSet *last;
 952
 953    if (vdev->no_geforce_quirks ||
 954        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
 955        !vfio_is_vga(vdev) || nr != 0) {
 956        return;
 957    }
 958
 959    quirk = vfio_quirk_alloc(1);
 960    quirk->reset = vfio_nvidia_bar0_quirk_reset;
 961    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 962    mirror->mem = quirk->mem;
 963    mirror->vdev = vdev;
 964    mirror->offset = 0x88000;
 965    mirror->bar = nr;
 966    last = (LastDataSet *)&mirror->data;
 967    last->quirk = quirk;
 968
 969    memory_region_init_io(mirror->mem, OBJECT(vdev),
 970                          &vfio_nvidia_mirror_quirk, mirror,
 971                          "vfio-nvidia-bar0-88000-mirror-quirk",
 972                          vdev->config_size);
 973    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 974                                        mirror->offset, mirror->mem, 1);
 975
 976    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 977
 978    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
 979    if (vdev->vga) {
 980        quirk = vfio_quirk_alloc(1);
 981        quirk->reset = vfio_nvidia_bar0_quirk_reset;
 982        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
 983        mirror->mem = quirk->mem;
 984        mirror->vdev = vdev;
 985        mirror->offset = 0x1800;
 986        mirror->bar = nr;
 987        last = (LastDataSet *)&mirror->data;
 988        last->quirk = quirk;
 989
 990        memory_region_init_io(mirror->mem, OBJECT(vdev),
 991                              &vfio_nvidia_mirror_quirk, mirror,
 992                              "vfio-nvidia-bar0-1800-mirror-quirk",
 993                              PCI_CONFIG_SPACE_SIZE);
 994        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
 995                                            mirror->offset, mirror->mem, 1);
 996
 997        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
 998    }
 999
1000    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
1001}
1002
1003/*
1004 * TODO - Some Nvidia devices provide config access to their companion HDA
1005 * device and even to their parent bridge via these config space mirrors.
1006 * Add quirks for those regions.
1007 */
1008
/* PCI vendor ID matched by the RTL8168 quirk below */
#define PCI_VENDOR_ID_REALTEK 0x10ec
1010
1011/*
1012 * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
1013 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
1014 * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1015 * when the "type" portion of the address register is set to 0x1.  This appears
1016 * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1017 * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1018 * ignore because the MSI-X table should always be accessed as a dword (full
 * mask).  Bits 0:11 are the offset within the type.
1020 *
1021 * Example trace:
1022 *
1023 * Read from MSI-X table offset 0
1024 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1025 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1026 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1027 *
1028 * Write 0xfee00000 to MSI-X table offset 0
1029 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1030 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1031 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1032 */
/* State shared by the RTL8168 address (0x74) and data (0x70) quirk regions */
typedef struct VFIOrtl8168Quirk {
    VFIOPCIDevice *vdev;    /* device the quirk is attached to */
    uint32_t addr;          /* last address value that targeted MSI-X */
    uint32_t data;          /* last value written to the data register */
    bool enabled;           /* last address write targeted the MSI-X table */
} VFIOrtl8168Quirk;
1039
1040static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
1041                                                hwaddr addr, unsigned size)
1042{
1043    VFIOrtl8168Quirk *rtl = opaque;
1044    VFIOPCIDevice *vdev = rtl->vdev;
1045    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
1046
1047    if (rtl->enabled) {
1048        data = rtl->addr ^ 0x80000000U; /* latch/complete */
1049        trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
1050    }
1051
1052    return data;
1053}
1054
1055static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
1056                                             uint64_t data, unsigned size)
1057{
1058    VFIOrtl8168Quirk *rtl = opaque;
1059    VFIOPCIDevice *vdev = rtl->vdev;
1060
1061    rtl->enabled = false;
1062
1063    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
1064        rtl->enabled = true;
1065        rtl->addr = (uint32_t)data;
1066
1067        if (data & 0x80000000U) { /* Do write */
1068            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
1069                hwaddr offset = data & 0xfff;
1070                uint64_t val = rtl->data;
1071
1072                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
1073                                                    (uint16_t)offset, val);
1074
1075                /* Write to the proper guest MSI-X table instead */
1076                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
1077                                             offset, val,
1078                                             size_memop(size) | MO_LE,
1079                                             MEMTXATTRS_UNSPECIFIED);
1080            }
1081            return; /* Do not write guest MSI-X data to hardware */
1082        }
1083    }
1084
1085    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
1086}
1087
/*
 * Access callbacks for the RTL8168 address register quirk.  Only aligned
 * 4-byte accesses are accepted.
 */
static const MemoryRegionOps vfio_rtl_address_quirk = {
    .read = vfio_rtl8168_quirk_address_read,
    .write = vfio_rtl8168_quirk_address_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1098
1099static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
1100                                             hwaddr addr, unsigned size)
1101{
1102    VFIOrtl8168Quirk *rtl = opaque;
1103    VFIOPCIDevice *vdev = rtl->vdev;
1104    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);
1105
1106    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1107        hwaddr offset = rtl->addr & 0xfff;
1108        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
1109                                    &data, size_memop(size) | MO_LE,
1110                                    MEMTXATTRS_UNSPECIFIED);
1111        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
1112    }
1113
1114    return data;
1115}
1116
1117static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
1118                                          uint64_t data, unsigned size)
1119{
1120    VFIOrtl8168Quirk *rtl = opaque;
1121    VFIOPCIDevice *vdev = rtl->vdev;
1122
1123    rtl->data = (uint32_t)data;
1124
1125    vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
1126}
1127
/*
 * Access callbacks for the RTL8168 data register quirk.  Only aligned
 * 4-byte accesses are accepted.
 */
static const MemoryRegionOps vfio_rtl_data_quirk = {
    .read = vfio_rtl8168_quirk_data_read,
    .write = vfio_rtl8168_quirk_data_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1138
1139static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
1140{
1141    VFIOQuirk *quirk;
1142    VFIOrtl8168Quirk *rtl;
1143
1144    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
1145        return;
1146    }
1147
1148    quirk = vfio_quirk_alloc(2);
1149    quirk->data = rtl = g_malloc0(sizeof(*rtl));
1150    rtl->vdev = vdev;
1151
1152    memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
1153                          &vfio_rtl_address_quirk, rtl,
1154                          "vfio-rtl8168-window-address-quirk", 4);
1155    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1156                                        0x74, &quirk->mem[0], 1);
1157
1158    memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
1159                          &vfio_rtl_data_quirk, rtl,
1160                          "vfio-rtl8168-window-data-quirk", 4);
1161    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1162                                        0x70, &quirk->mem[1], 1);
1163
1164    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1165
1166    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
1167}
1168
1169/*
1170 * Intel IGD support
1171 *
1172 * Obviously IGD is not a discrete device, this is evidenced not only by it
1173 * being integrated into the CPU, but by the various chipset and BIOS
1174 * dependencies that it brings along with it.  Intel is trying to move away
1175 * from this and Broadwell and newer devices can run in what Intel calls
1176 * "Universal Pass-Through" mode, or UPT.  Theoretically in UPT mode, nothing
1177 * more is required beyond assigning the IGD device to a VM.  There are
1178 * however support limitations to this mode.  It only supports IGD as a
1179 * secondary graphics device in the VM and it doesn't officially support any
1180 * physical outputs.
1181 *
1182 * The code here attempts to enable what we'll call legacy mode assignment,
 * where IGD retains most of the capabilities we expect for it to have on bare
1184 * metal.  To enable this mode, the IGD device must be assigned to the VM
1185 * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA
1186 * support, we must have VM BIOS support for reserving and populating some
1187 * of the required tables, and we need to tweak the chipset with revisions
1188 * and IDs and an LPC/ISA bridge device.  The intention is to make all of
1189 * this happen automatically by installing the device at the correct VM PCI
1190 * bus address.  If any of the conditions are not met, we cross our fingers
1191 * and hope the user knows better.
1192 *
1193 * NB - It is possible to enable physical outputs in UPT mode by supplying
1194 * an OpRegion table.  We don't do this by default because the guest driver
1195 * behaves differently if an OpRegion is provided and no monitor is attached
1196 * vs no OpRegion and a monitor being attached or not.  Effectively, if a
1197 * headless setup is desired, the OpRegion gets in the way of that.
1198 */
1199
1200/*
1201 * This presumes the device is already known to be an Intel VGA device, so we
1202 * take liberties in which device ID bits match which generation.  This should
1203 * not be taken as an indication that all the devices are supported, or even
1204 * supportable, some of them don't even support VT-d.
1205 * See linux:include/drm/i915_pciids.h for IDs.
1206 */
1207static int igd_gen(VFIOPCIDevice *vdev)
1208{
1209    if ((vdev->device_id & 0xfff) == 0xa84) {
1210        return 8; /* Broxton */
1211    }
1212
1213    switch (vdev->device_id & 0xff00) {
1214    /* Old, untested, unavailable, unknown */
1215    case 0x0000:
1216    case 0x2500:
1217    case 0x2700:
1218    case 0x2900:
1219    case 0x2a00:
1220    case 0x2e00:
1221    case 0x3500:
1222    case 0xa000:
1223        return -1;
1224    /* SandyBridge, IvyBridge, ValleyView, Haswell */
1225    case 0x0100:
1226    case 0x0400:
1227    case 0x0a00:
1228    case 0x0c00:
1229    case 0x0d00:
1230    case 0x0f00:
1231        return 6;
1232    /* BroadWell, CherryView, SkyLake, KabyLake */
1233    case 0x1600:
1234    case 0x1900:
1235    case 0x2200:
1236    case 0x5900:
1237        return 8;
1238    }
1239
1240    return 8; /* Assume newer is compatible */
1241}
1242
/* State shared by the IGD BAR4 index and data register quirk regions */
typedef struct VFIOIGDQuirk {
    struct VFIOPCIDevice *vdev; /* device the quirk is attached to */
    uint32_t index; /* last index register write; ~0 after any other access */
    uint32_t bdsm;  /* subtracted from GTT entries; presumably host BDSM */
} VFIOIGDQuirk;

/* Config space offsets of the IGD registers used by the quirks */
#define IGD_GMCH 0x50 /* Graphics Control Register */
#define IGD_BDSM 0x5c /* Base Data of Stolen Memory */
#define IGD_ASLS 0xfc /* ASL Storage Register */
1252
1253/*
1254 * The OpRegion includes the Video BIOS Table, which seems important for
1255 * telling the driver what sort of outputs it has.  Without this, the device
1256 * may work in the guest, but we may not get output.  This also requires BIOS
1257 * support to reserve and populate a section of guest memory sufficient for
1258 * the table and to write the base address of that memory to the ASLS register
1259 * of the IGD device.
1260 */
1261int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
1262                               struct vfio_region_info *info, Error **errp)
1263{
1264    int ret;
1265
1266    vdev->igd_opregion = g_malloc0(info->size);
1267    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
1268                info->size, info->offset);
1269    if (ret != info->size) {
1270        error_setg(errp, "failed to read IGD OpRegion");
1271        g_free(vdev->igd_opregion);
1272        vdev->igd_opregion = NULL;
1273        return -EINVAL;
1274    }
1275
1276    /*
1277     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
1278     * allocate 32bit reserved memory for, copy these contents into, and write
1279     * the reserved memory base address to the device ASLS register at 0xFC.
1280     * Alignment of this reserved region seems flexible, but using a 4k page
1281     * alignment seems to work well.  This interface assumes a single IGD
1282     * device, which may be at VM address 00:02.0 in legacy mode or another
1283     * address in UPT mode.
1284     *
1285     * NB, there may be future use cases discovered where the VM should have
1286     * direct interaction with the host OpRegion, in which case the write to
1287     * the ASLS register would trigger MemoryRegion setup to enable that.
1288     */
1289    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
1290                    vdev->igd_opregion, info->size);
1291
1292    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);
1293
1294    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
1295    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
1296    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);
1297
1298    return 0;
1299}
1300
1301/*
1302 * The rather short list of registers that we copy from the host devices.
1303 * The LPC/ISA bridge values are definitely needed to support the vBIOS, the
1304 * host bridge values may or may not be needed depending on the guest OS.
1305 * Since we're only munging revision and subsystem values on the host bridge,
1306 * we don't require our own device.  The LPC/ISA bridge needs to be our very
1307 * own though.
1308 */
/* One config space field to copy from a host device */
typedef struct {
    uint8_t offset; /* config space offset of the field */
    uint8_t len;    /* number of bytes to copy */
} IGDHostInfo;

/* Fields copied from the host bridge into the VM's host bridge */
static const IGDHostInfo igd_host_bridge_infos[] = {
    {PCI_REVISION_ID,         2},
    {PCI_SUBSYSTEM_VENDOR_ID, 2},
    {PCI_SUBSYSTEM_ID,        2},
};

/* Fields copied from the host LPC/ISA bridge into the VM's dummy bridge */
static const IGDHostInfo igd_lpc_bridge_infos[] = {
    {PCI_VENDOR_ID,           2},
    {PCI_DEVICE_ID,           2},
    {PCI_REVISION_ID,         2},
    {PCI_SUBSYSTEM_VENDOR_ID, 2},
    {PCI_SUBSYSTEM_ID,        2},
};
1327
1328static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev,
1329                             struct vfio_region_info *info,
1330                             const IGDHostInfo *list, int len)
1331{
1332    int i, ret;
1333
1334    for (i = 0; i < len; i++) {
1335        ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset,
1336                    list[i].len, info->offset + list[i].offset);
1337        if (ret != list[i].len) {
1338            error_report("IGD copy failed: %m");
1339            return -errno;
1340        }
1341    }
1342
1343    return 0;
1344}
1345
1346/*
1347 * Stuff a few values into the host bridge.
1348 */
1349static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev,
1350                                  struct vfio_region_info *info)
1351{
1352    PCIBus *bus;
1353    PCIDevice *host_bridge;
1354    int ret;
1355
1356    bus = pci_device_root_bus(&vdev->pdev);
1357    host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0));
1358
1359    if (!host_bridge) {
1360        error_report("Can't find host bridge");
1361        return -ENODEV;
1362    }
1363
1364    ret = vfio_pci_igd_copy(vdev, host_bridge, info, igd_host_bridge_infos,
1365                            ARRAY_SIZE(igd_host_bridge_infos));
1366    if (!ret) {
1367        trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name);
1368    }
1369
1370    return ret;
1371}
1372
1373/*
1374 * IGD LPC/ISA bridge support code.  The vBIOS needs this, but we can't write
1375 * arbitrary values into just any bridge, so we must create our own.  We try
1376 * to handle if the user has created it for us, which they might want to do
1377 * to enable multifunction so we don't occupy the whole PCI slot.
1378 */
1379static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp)
1380{
1381    if (pdev->devfn != PCI_DEVFN(0x1f, 0)) {
1382        error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0");
1383    }
1384}
1385
1386static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data)
1387{
1388    DeviceClass *dc = DEVICE_CLASS(klass);
1389    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1390
1391    set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
1392    dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment";
1393    dc->hotpluggable = false;
1394    k->realize = vfio_pci_igd_lpc_bridge_realize;
1395    k->class_id = PCI_CLASS_BRIDGE_ISA;
1396}
1397
1398static TypeInfo vfio_pci_igd_lpc_bridge_info = {
1399    .name = "vfio-pci-igd-lpc-bridge",
1400    .parent = TYPE_PCI_DEVICE,
1401    .class_init = vfio_pci_igd_lpc_bridge_class_init,
1402    .interfaces = (InterfaceInfo[]) {
1403        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
1404        { },
1405    },
1406};
1407
/* Register the dummy LPC/ISA bridge type; hooked up through type_init(). */
static void vfio_pci_igd_register_types(void)
{
    type_register_static(&vfio_pci_igd_lpc_bridge_info);
}

type_init(vfio_pci_igd_register_types)
1414
1415static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
1416                                 struct vfio_region_info *info)
1417{
1418    PCIDevice *lpc_bridge;
1419    int ret;
1420
1421    lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
1422                                 0, PCI_DEVFN(0x1f, 0));
1423    if (!lpc_bridge) {
1424        lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev),
1425                                 PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge");
1426    }
1427
1428    ret = vfio_pci_igd_copy(vdev, lpc_bridge, info, igd_lpc_bridge_infos,
1429                            ARRAY_SIZE(igd_lpc_bridge_infos));
1430    if (!ret) {
1431        trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name);
1432    }
1433
1434    return ret;
1435}
1436
1437/*
1438 * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE
1439 * entry, older IGDs use 2MB and 32bit.  Each PTE maps a 4k page.  Therefore
1440 * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index
1441 * for programming the GTT.
1442 *
1443 * See linux:include/drm/i915_drm.h for shift and mask values.
1444 */
1445static int vfio_igd_gtt_max(VFIOPCIDevice *vdev)
1446{
1447    uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
1448    int ggms, gen = igd_gen(vdev);
1449
1450    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
1451    ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
1452    if (gen > 6) {
1453        ggms = 1 << ggms;
1454    }
1455
1456    ggms *= MiB;
1457
1458    return (ggms / (4 * KiB)) * (gen < 8 ? 4 : 8);
1459}
1460
1461/*
1462 * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes.
1463 * Somehow the host stolen memory range is used for this, but how the ROM gets
1464 * it is a mystery, perhaps it's hardcoded into the ROM.  Thankfully though, it
1465 * reprograms the GTT through the IOBAR where we can trap it and transpose the
1466 * programming to the VM allocated buffer.  That buffer gets reserved by the VM
1467 * firmware via the fw_cfg entry added below.  Here we're just monitoring the
1468 * IOBAR address and data registers to detect a write sequence targeting the
1469 * GTTADR.  This code is developed by observed behavior and doesn't have a
1470 * direct spec reference, unfortunately.
1471 */
1472static uint64_t vfio_igd_quirk_data_read(void *opaque,
1473                                         hwaddr addr, unsigned size)
1474{
1475    VFIOIGDQuirk *igd = opaque;
1476    VFIOPCIDevice *vdev = igd->vdev;
1477
1478    igd->index = ~0;
1479
1480    return vfio_region_read(&vdev->bars[4].region, addr + 4, size);
1481}
1482
1483static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr,
1484                                      uint64_t data, unsigned size)
1485{
1486    VFIOIGDQuirk *igd = opaque;
1487    VFIOPCIDevice *vdev = igd->vdev;
1488    uint64_t val = data;
1489    int gen = igd_gen(vdev);
1490
1491    /*
1492     * Programming the GGMS starts at index 0x1 and uses every 4th index (ie.
1493     * 0x1, 0x5, 0x9, 0xd,...).  For pre-Gen8 each 4-byte write is a whole PTE
1494     * entry, with 0th bit enable set.  For Gen8 and up, PTEs are 64bit, so
1495     * entries 0x5 & 0xd are the high dword, in our case zero.  Each PTE points
1496     * to a 4k page, which we translate to a page from the VM allocated region,
1497     * pointed to by the BDSM register.  If this is not set, we fail.
1498     *
1499     * We trap writes to the full configured GTT size, but we typically only
1500     * see the vBIOS writing up to (nearly) the 1MB barrier.  In fact it often
1501     * seems to miss the last entry for an even 1MB GTT.  Doing a gratuitous
1502     * write of that last entry does work, but is hopefully unnecessary since
1503     * we clear the previous GTT on initialization.
1504     */
1505    if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) {
1506        if (gen < 8 || (igd->index % 8 == 1)) {
1507            uint32_t base;
1508
1509            base = pci_get_long(vdev->pdev.config + IGD_BDSM);
1510            if (!base) {
1511                hw_error("vfio-igd: Guest attempted to program IGD GTT before "
1512                         "BIOS reserved stolen memory.  Unsupported BIOS?");
1513            }
1514
1515            val = data - igd->bdsm + base;
1516        } else {
1517            val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */
1518        }
1519
1520        trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name,
1521                                      igd->index, data, val);
1522    }
1523
1524    vfio_region_write(&vdev->bars[4].region, addr + 4, val, size);
1525
1526    igd->index = ~0;
1527}
1528
/* Access callbacks for the IGD BAR4 data register quirk */
static const MemoryRegionOps vfio_igd_data_quirk = {
    .read = vfio_igd_quirk_data_read,
    .write = vfio_igd_quirk_data_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1534
1535static uint64_t vfio_igd_quirk_index_read(void *opaque,
1536                                          hwaddr addr, unsigned size)
1537{
1538    VFIOIGDQuirk *igd = opaque;
1539    VFIOPCIDevice *vdev = igd->vdev;
1540
1541    igd->index = ~0;
1542
1543    return vfio_region_read(&vdev->bars[4].region, addr, size);
1544}
1545
1546static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr,
1547                                       uint64_t data, unsigned size)
1548{
1549    VFIOIGDQuirk *igd = opaque;
1550    VFIOPCIDevice *vdev = igd->vdev;
1551
1552    igd->index = data;
1553
1554    vfio_region_write(&vdev->bars[4].region, addr, data, size);
1555}
1556
/* Access callbacks for the IGD BAR4 index register quirk */
static const MemoryRegionOps vfio_igd_index_quirk = {
    .read = vfio_igd_quirk_index_read,
    .write = vfio_igd_quirk_index_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1562
1563static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
1564{
1565    struct vfio_region_info *rom = NULL, *opregion = NULL,
1566                            *host = NULL, *lpc = NULL;
1567    VFIOQuirk *quirk;
1568    VFIOIGDQuirk *igd;
1569    PCIDevice *lpc_bridge;
1570    int i, ret, ggms_mb, gms_mb = 0, gen;
1571    uint64_t *bdsm_size;
1572    uint32_t gmch;
1573    uint16_t cmd_orig, cmd;
1574    Error *err = NULL;
1575
1576    /*
1577     * This must be an Intel VGA device at address 00:02.0 for us to even
1578     * consider enabling legacy mode.  The vBIOS has dependencies on the
1579     * PCI bus address.
1580     */
1581    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
1582        !vfio_is_vga(vdev) || nr != 4 ||
1583        &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev),
1584                                       0, PCI_DEVFN(0x2, 0))) {
1585        return;
1586    }
1587
1588    /*
1589     * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we
1590     * can stuff host values into, so if there's already one there and it's not
1591     * one we can hack on, legacy mode is no-go.  Sorry Q35.
1592     */
1593    lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
1594                                 0, PCI_DEVFN(0x1f, 0));
1595    if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge),
1596                                           "vfio-pci-igd-lpc-bridge")) {
1597        error_report("IGD device %s cannot support legacy mode due to existing "
1598                     "devices at address 1f.0", vdev->vbasedev.name);
1599        return;
1600    }
1601
1602    /*
1603     * IGD is not a standard, they like to change their specs often.  We
1604     * only attempt to support back to SandBridge and we hope that newer
1605     * devices maintain compatibility with generation 8.
1606     */
1607    gen = igd_gen(vdev);
1608    if (gen != 6 && gen != 8) {
1609        error_report("IGD device %s is unsupported in legacy mode, "
1610                     "try SandyBridge or newer", vdev->vbasedev.name);
1611        return;
1612    }
1613
1614    /*
1615     * Most of what we're doing here is to enable the ROM to run, so if
1616     * there's no ROM, there's no point in setting up this quirk.
1617     * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support.
1618     */
1619    ret = vfio_get_region_info(&vdev->vbasedev,
1620                               VFIO_PCI_ROM_REGION_INDEX, &rom);
1621    if ((ret || !rom->size) && !vdev->pdev.romfile) {
1622        error_report("IGD device %s has no ROM, legacy mode disabled",
1623                     vdev->vbasedev.name);
1624        goto out;
1625    }
1626
1627    /*
1628     * Ignore the hotplug corner case, mark the ROM failed, we can't
1629     * create the devices we need for legacy mode in the hotplug scenario.
1630     */
1631    if (vdev->pdev.qdev.hotplugged) {
1632        error_report("IGD device %s hotplugged, ROM disabled, "
1633                     "legacy mode disabled", vdev->vbasedev.name);
1634        vdev->rom_read_failed = true;
1635        goto out;
1636    }
1637
1638    /*
1639     * Check whether we have all the vfio device specific regions to
1640     * support legacy mode (added in Linux v4.6).  If not, bail.
1641     */
1642    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1643                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1644                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
1645    if (ret) {
1646        error_report("IGD device %s does not support OpRegion access,"
1647                     "legacy mode disabled", vdev->vbasedev.name);
1648        goto out;
1649    }
1650
1651    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1652                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1653                        VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host);
1654    if (ret) {
1655        error_report("IGD device %s does not support host bridge access,"
1656                     "legacy mode disabled", vdev->vbasedev.name);
1657        goto out;
1658    }
1659
1660    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1661                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1662                        VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc);
1663    if (ret) {
1664        error_report("IGD device %s does not support LPC bridge access,"
1665                     "legacy mode disabled", vdev->vbasedev.name);
1666        goto out;
1667    }
1668
1669    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
1670
1671    /*
1672     * If IGD VGA Disable is clear (expected) and VGA is not already enabled,
1673     * try to enable it.  Probably shouldn't be using legacy mode without VGA,
1674     * but also no point in us enabling VGA if disabled in hardware.
1675     */
1676    if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev, &err)) {
1677        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
1678        error_report("IGD device %s failed to enable VGA access, "
1679                     "legacy mode disabled", vdev->vbasedev.name);
1680        goto out;
1681    }
1682
1683    /* Create our LPC/ISA bridge */
1684    ret = vfio_pci_igd_lpc_init(vdev, lpc);
1685    if (ret) {
1686        error_report("IGD device %s failed to create LPC bridge, "
1687                     "legacy mode disabled", vdev->vbasedev.name);
1688        goto out;
1689    }
1690
1691    /* Stuff some host values into the VM PCI host bridge */
1692    ret = vfio_pci_igd_host_init(vdev, host);
1693    if (ret) {
1694        error_report("IGD device %s failed to modify host bridge, "
1695                     "legacy mode disabled", vdev->vbasedev.name);
1696        goto out;
1697    }
1698
1699    /* Setup OpRegion access */
1700    ret = vfio_pci_igd_opregion_init(vdev, opregion, &err);
1701    if (ret) {
1702        error_append_hint(&err, "IGD legacy mode disabled\n");
1703        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
1704        goto out;
1705    }
1706
1707    /* Setup our quirk to munge GTT addresses to the VM allocated buffer */
1708    quirk = vfio_quirk_alloc(2);
1709    igd = quirk->data = g_malloc0(sizeof(*igd));
1710    igd->vdev = vdev;
1711    igd->index = ~0;
1712    igd->bdsm = vfio_pci_read_config(&vdev->pdev, IGD_BDSM, 4);
1713    igd->bdsm &= ~((1 * MiB) - 1); /* 1MB aligned */
1714
1715    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk,
1716                          igd, "vfio-igd-index-quirk", 4);
1717    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1718                                        0, &quirk->mem[0], 1);
1719
1720    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk,
1721                          igd, "vfio-igd-data-quirk", 4);
1722    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1723                                        4, &quirk->mem[1], 1);
1724
1725    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1726
1727    /* Determine the size of stolen memory needed for GTT */
1728    ggms_mb = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
1729    if (gen > 6) {
1730        ggms_mb = 1 << ggms_mb;
1731    }
1732
1733    /*
1734     * Assume we have no GMS memory, but allow it to be overrided by device
1735     * option (experimental).  The spec doesn't actually allow zero GMS when
1736     * when IVD (IGD VGA Disable) is clear, but the claim is that it's unused,
1737     * so let's not waste VM memory for it.
1738     */
1739    gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8));
1740
1741    if (vdev->igd_gms) {
1742        if (vdev->igd_gms <= 0x10) {
1743            gms_mb = vdev->igd_gms * 32;
1744            gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8);
1745        } else {
1746            error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms);
1747            vdev->igd_gms = 0;
1748        }
1749    }
1750
1751    /*
1752     * Request reserved memory for stolen memory via fw_cfg.  VM firmware
1753     * must allocate a 1MB aligned reserved memory region below 4GB with
1754     * the requested size (in bytes) for use by the Intel PCI class VGA
1755     * device at VM address 00:02.0.  The base address of this reserved
1756     * memory region must be written to the device BDSM regsiter at PCI
1757     * config offset 0x5C.
1758     */
1759    bdsm_size = g_malloc(sizeof(*bdsm_size));
1760    *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * MiB);
1761    fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size",
1762                    bdsm_size, sizeof(*bdsm_size));
1763
1764    /* GMCH is read-only, emulated */
1765    pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
1766    pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
1767    pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
1768
1769    /* BDSM is read-write, emulated.  The BIOS needs to be able to write it */
1770    pci_set_long(vdev->pdev.config + IGD_BDSM, 0);
1771    pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0);
1772    pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0);
1773
1774    /*
1775     * This IOBAR gives us access to GTTADR, which allows us to write to
1776     * the GTT itself.  So let's go ahead and write zero to all the GTT
1777     * entries to avoid spurious DMA faults.  Be sure I/O access is enabled
1778     * before talking to the device.
1779     */
1780    if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
1781              vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
1782        error_report("IGD device %s - failed to read PCI command register",
1783                     vdev->vbasedev.name);
1784    }
1785
1786    cmd = cmd_orig | PCI_COMMAND_IO;
1787
1788    if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd),
1789               vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) {
1790        error_report("IGD device %s - failed to write PCI command register",
1791                     vdev->vbasedev.name);
1792    }
1793
1794    for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) {
1795        vfio_region_write(&vdev->bars[4].region, 0, i, 4);
1796        vfio_region_write(&vdev->bars[4].region, 4, 0, 4);
1797    }
1798
1799    if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
1800               vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
1801        error_report("IGD device %s - failed to restore PCI command register",
1802                     vdev->vbasedev.name);
1803    }
1804
1805    trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb);
1806
1807out:
1808    g_free(rom);
1809    g_free(opregion);
1810    g_free(host);
1811    g_free(lpc);
1812}
1813
1814/*
1815 * Common quirk probe entry points.
1816 */
1817void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
1818{
1819    vfio_vga_probe_ati_3c3_quirk(vdev);
1820    vfio_vga_probe_nvidia_3d0_quirk(vdev);
1821}
1822
1823void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1824{
1825    VFIOQuirk *quirk;
1826    int i, j;
1827
1828    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1829        QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1830            for (j = 0; j < quirk->nr_mem; j++) {
1831                memory_region_del_subregion(&vdev->vga->region[i].mem,
1832                                            &quirk->mem[j]);
1833            }
1834        }
1835    }
1836}
1837
1838void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1839{
1840    int i, j;
1841
1842    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1843        while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1844            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1845            QLIST_REMOVE(quirk, next);
1846            for (j = 0; j < quirk->nr_mem; j++) {
1847                object_unparent(OBJECT(&quirk->mem[j]));
1848            }
1849            g_free(quirk->mem);
1850            g_free(quirk->data);
1851            g_free(quirk);
1852        }
1853    }
1854}
1855
1856void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
1857{
1858    vfio_probe_ati_bar4_quirk(vdev, nr);
1859    vfio_probe_ati_bar2_quirk(vdev, nr);
1860    vfio_probe_nvidia_bar5_quirk(vdev, nr);
1861    vfio_probe_nvidia_bar0_quirk(vdev, nr);
1862    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
1863    vfio_probe_igd_bar4_quirk(vdev, nr);
1864}
1865
1866void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1867{
1868    VFIOBAR *bar = &vdev->bars[nr];
1869    VFIOQuirk *quirk;
1870    int i;
1871
1872    QLIST_FOREACH(quirk, &bar->quirks, next) {
1873        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
1874            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
1875        }
1876
1877        for (i = 0; i < quirk->nr_mem; i++) {
1878            memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1879        }
1880    }
1881}
1882
1883void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1884{
1885    VFIOBAR *bar = &vdev->bars[nr];
1886    int i;
1887
1888    while (!QLIST_EMPTY(&bar->quirks)) {
1889        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1890        QLIST_REMOVE(quirk, next);
1891        for (i = 0; i < quirk->nr_mem; i++) {
1892            object_unparent(OBJECT(&quirk->mem[i]));
1893        }
1894        g_free(quirk->mem);
1895        g_free(quirk->data);
1896        g_free(quirk);
1897    }
1898}
1899
1900/*
1901 * Reset quirks
1902 */
1903void vfio_quirk_reset(VFIOPCIDevice *vdev)
1904{
1905    int i;
1906
1907    for (i = 0; i < PCI_ROM_SLOT; i++) {
1908        VFIOQuirk *quirk;
1909        VFIOBAR *bar = &vdev->bars[i];
1910
1911        QLIST_FOREACH(quirk, &bar->quirks, next) {
1912            if (quirk->reset) {
1913                quirk->reset(vdev, quirk);
1914            }
1915        }
1916    }
1917}
1918
/*
 * AMD Radeon PCI config reset, based on Linux:
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
 *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
 *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
 * IDs: include/drm/drm_pciids.h
 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
 *
 * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
 * hardware that should be fixed on future ASICs.  The symptom of this is that
 * once the accelerated driver loads, Windows guests will bsod on subsequent
 * attempts to load the driver, such as after VM reset or shutdown/restart.  To
 * work around this, we do an AMD specific PCI config reset, followed by an SMC
 * reset.  The PCI config reset only works if SMC firmware is running, so we
 * have a dependency on the state of the device as to whether this reset will
 * be effective.  There are still cases where we won't be able to kick the
 * device into working, but this greatly improves the usability overall.  The
 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
 * poking is largely ASIC specific.
 */
1940static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
1941{
1942    uint32_t clk, pc_c;
1943
1944    /*
1945     * Registers 200h and 204h are index and data registers for accessing
1946     * indirect configuration registers within the device.
1947     */
1948    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
1949    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1950    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
1951    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1952
1953    return (!(clk & 1) && (0x20100 <= pc_c));
1954}
1955
/*
 * The scope of a config reset is controlled by a mode bit in the misc register
 * and a fuse, exposed as a bit in another register.  The fuse is the default
 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
 * the fuse.  A truth table therefore tells us that if misc == fuse, we need
 * to flip the value of the bit in the misc register.
 */
1964static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
1965{
1966    uint32_t misc, fuse;
1967    bool a, b;
1968
1969    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
1970    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1971    b = fuse & 64;
1972
1973    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
1974    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
1975    a = misc & 2;
1976
1977    if (a == b) {
1978        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
1979        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
1980    }
1981}
1982
1983static int vfio_radeon_reset(VFIOPCIDevice *vdev)
1984{
1985    PCIDevice *pdev = &vdev->pdev;
1986    int i, ret = 0;
1987    uint32_t data;
1988
1989    /* Defer to a kernel implemented reset */
1990    if (vdev->vbasedev.reset_works) {
1991        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
1992        return -ENODEV;
1993    }
1994
1995    /* Enable only memory BAR access */
1996    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);
1997
1998    /* Reset only works if SMC firmware is loaded and running */
1999    if (!vfio_radeon_smc_is_running(vdev)) {
2000        ret = -EINVAL;
2001        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
2002        goto out;
2003    }
2004
2005    /* Make sure only the GFX function is reset */
2006    vfio_radeon_set_gfx_only_reset(vdev);
2007
2008    /* AMD PCI config reset */
2009    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
2010    usleep(100);
2011
2012    /* Read back the memory size to make sure we're out of reset */
2013    for (i = 0; i < 100000; i++) {
2014        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
2015            goto reset_smc;
2016        }
2017        usleep(1);
2018    }
2019
2020    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);
2021
2022reset_smc:
2023    /* Reset SMC */
2024    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
2025    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
2026    data |= 1;
2027    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
2028
2029    /* Disable SMC clock */
2030    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
2031    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
2032    data |= 1;
2033    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);
2034
2035    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);
2036
2037out:
2038    /* Restore PCI command register */
2039    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);
2040
2041    return ret;
2042}
2043
2044void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
2045{
2046    switch (vdev->vendor_id) {
2047    case 0x1002:
2048        switch (vdev->device_id) {
2049        /* Bonaire */
2050        case 0x6649: /* Bonaire [FirePro W5100] */
2051        case 0x6650:
2052        case 0x6651:
2053        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
2054        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
2055        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
2056        /* Hawaii */
2057        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
2058        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
2059        case 0x67A2:
2060        case 0x67A8:
2061        case 0x67A9:
2062        case 0x67AA:
2063        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
2064        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
2065        case 0x67B8:
2066        case 0x67B9:
2067        case 0x67BA:
2068        case 0x67BE:
2069            vdev->resetfn = vfio_radeon_reset;
2070            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
2071            break;
2072        }
2073        break;
2074    }
2075}
2076
2077/*
2078 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
2079 * devices as a member of a clique.  Devices within the same clique ID
2080 * are capable of direct P2P.  It's the user's responsibility that this
2081 * is correct.  The spec says that this may reside at any unused config
2082 * offset, but reserves and recommends hypervisors place this at C8h.
2083 * The spec also states that the hypervisor should place this capability
2084 * at the end of the capability list, thus next is defined as 0h.
2085 *
2086 * +----------------+----------------+----------------+----------------+
2087 * | sig 7:0 ('P')  |  vndr len (8h) |    next (0h)   |   cap id (9h)  |
2088 * +----------------+----------------+----------------+----------------+
2089 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
2090 * +---------------------------------+---------------------------------+
2091 *
2092 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
2093 */
2094static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
2095                                       const char *name, void *opaque,
2096                                       Error **errp)
2097{
2098    DeviceState *dev = DEVICE(obj);
2099    Property *prop = opaque;
2100    uint8_t *ptr = qdev_get_prop_ptr(dev, prop);
2101
2102    visit_type_uint8(v, name, ptr, errp);
2103}
2104
2105static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
2106                                       const char *name, void *opaque,
2107                                       Error **errp)
2108{
2109    DeviceState *dev = DEVICE(obj);
2110    Property *prop = opaque;
2111    uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
2112    Error *local_err = NULL;
2113
2114    if (dev->realized) {
2115        qdev_prop_set_after_realize(dev, name, errp);
2116        return;
2117    }
2118
2119    visit_type_uint8(v, name, &value, &local_err);
2120    if (local_err) {
2121        error_propagate(errp, local_err);
2122        return;
2123    }
2124
2125    if (value & ~0xF) {
2126        error_setg(errp, "Property %s: valid range 0-15", name);
2127        return;
2128    }
2129
2130    *ptr = value;
2131}
2132
2133const PropertyInfo qdev_prop_nv_gpudirect_clique = {
2134    .name = "uint4",
2135    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
2136    .get = get_nv_gpudirect_clique_id,
2137    .set = set_nv_gpudirect_clique_id,
2138};
2139
2140static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
2141{
2142    PCIDevice *pdev = &vdev->pdev;
2143    int ret, pos = 0xC8;
2144
2145    if (vdev->nv_gpudirect_clique == 0xFF) {
2146        return 0;
2147    }
2148
2149    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
2150        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
2151        return -EINVAL;
2152    }
2153
2154    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
2155        PCI_BASE_CLASS_DISPLAY) {
2156        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
2157        return -EINVAL;
2158    }
2159
2160    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
2161    if (ret < 0) {
2162        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
2163        return ret;
2164    }
2165
2166    memset(vdev->emulated_config_bits + pos, 0xFF, 8);
2167    pos += PCI_CAP_FLAGS;
2168    pci_set_byte(pdev->config + pos++, 8);
2169    pci_set_byte(pdev->config + pos++, 'P');
2170    pci_set_byte(pdev->config + pos++, '2');
2171    pci_set_byte(pdev->config + pos++, 'P');
2172    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
2173    pci_set_byte(pdev->config + pos, 0);
2174
2175    return 0;
2176}
2177
2178int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
2179{
2180    int ret;
2181
2182    ret = vfio_add_nv_gpudirect_cap(vdev, errp);
2183    if (ret) {
2184        return ret;
2185    }
2186
2187    return 0;
2188}
2189
2190static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
2191                                     const char *name,
2192                                     void *opaque, Error **errp)
2193{
2194    uint64_t tgt = (uintptr_t) opaque;
2195    visit_type_uint64(v, name, &tgt, errp);
2196}
2197
2198static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
2199                                                 const char *name,
2200                                                 void *opaque, Error **errp)
2201{
2202    uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
2203    visit_type_uint32(v, name, &link_speed, errp);
2204}
2205
2206int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
2207{
2208    int ret;
2209    void *p;
2210    struct vfio_region_info *nv2reg = NULL;
2211    struct vfio_info_cap_header *hdr;
2212    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
2213    VFIOQuirk *quirk;
2214
2215    ret = vfio_get_dev_region_info(&vdev->vbasedev,
2216                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
2217                                   PCI_VENDOR_ID_NVIDIA,
2218                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
2219                                   &nv2reg);
2220    if (ret) {
2221        return ret;
2222    }
2223
2224    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
2225    if (!hdr) {
2226        ret = -ENODEV;
2227        goto free_exit;
2228    }
2229    cap = (void *) hdr;
2230
2231    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
2232             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
2233    if (p == MAP_FAILED) {
2234        ret = -errno;
2235        goto free_exit;
2236    }
2237
2238    quirk = vfio_quirk_alloc(1);
2239    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
2240                               nv2reg->size, p);
2241    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
2242
2243    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
2244                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
2245                        (void *) (uintptr_t) cap->tgt, NULL);
2246    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
2247                                          nv2reg->size);
2248free_exit:
2249    g_free(nv2reg);
2250
2251    return ret;
2252}
2253
2254int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
2255{
2256    int ret;
2257    void *p;
2258    struct vfio_region_info *atsdreg = NULL;
2259    struct vfio_info_cap_header *hdr;
2260    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
2261    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
2262    VFIOQuirk *quirk;
2263
2264    ret = vfio_get_dev_region_info(&vdev->vbasedev,
2265                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
2266                                   PCI_VENDOR_ID_IBM,
2267                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
2268                                   &atsdreg);
2269    if (ret) {
2270        return ret;
2271    }
2272
2273    hdr = vfio_get_region_info_cap(atsdreg,
2274                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
2275    if (!hdr) {
2276        ret = -ENODEV;
2277        goto free_exit;
2278    }
2279    captgt = (void *) hdr;
2280
2281    hdr = vfio_get_region_info_cap(atsdreg,
2282                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
2283    if (!hdr) {
2284        ret = -ENODEV;
2285        goto free_exit;
2286    }
2287    capspeed = (void *) hdr;
2288
2289    /* Some NVLink bridges may not have assigned ATSD */
2290    if (atsdreg->size) {
2291        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
2292                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
2293        if (p == MAP_FAILED) {
2294            ret = -errno;
2295            goto free_exit;
2296        }
2297
2298        quirk = vfio_quirk_alloc(1);
2299        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
2300                                          "nvlink2-atsd-mr", atsdreg->size, p);
2301        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
2302    }
2303
2304    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
2305                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
2306                        (void *) (uintptr_t) captgt->tgt, NULL);
2307    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
2308                                              atsdreg->size);
2309
2310    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
2311                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
2312                        (void *) (uintptr_t) capspeed->link_speed, NULL);
2313    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
2314                                              capspeed->link_speed);
2315free_exit:
2316    g_free(atsdreg);
2317
2318    return ret;
2319}
2320