/* qemu/hw/vfio/pci-quirks.c */
   1/*
   2 * device quirks for PCI devices
   3 *
   4 * Copyright Red Hat, Inc. 2012-2015
   5 *
   6 * Authors:
   7 *  Alex Williamson <alex.williamson@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "qemu/units.h"
  15#include "qemu/error-report.h"
  16#include "qemu/main-loop.h"
  17#include "qemu/module.h"
  18#include "qemu/range.h"
  19#include "qapi/error.h"
  20#include "qapi/visitor.h"
  21#include <sys/ioctl.h>
  22#include "hw/nvram/fw_cfg.h"
  23#include "pci.h"
  24#include "trace.h"
  25
  26/* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot match hw */
  27static bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t device)
  28{
  29    return (vendor == PCI_ANY_ID || vendor == vdev->vendor_id) &&
  30           (device == PCI_ANY_ID || device == vdev->device_id);
  31}
  32
  33static bool vfio_is_vga(VFIOPCIDevice *vdev)
  34{
  35    PCIDevice *pdev = &vdev->pdev;
  36    uint16_t class = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
  37
  38    return class == PCI_CLASS_DISPLAY_VGA;
  39}
  40
/*
 * List of device ids/vendor ids for which to disable
 * option rom loading. This avoids the guest hangs during rom
 * execution as noticed with the BCM 57810 card for lack of a
 * better way to handle such issues.
 * The user can still override by specifying a romfile or
 * rombar=1.
 * Please see https://bugs.launchpad.net/qemu/+bug/1284874
 * for an analysis of the 57810 card hang. When adding
 * a new vendor id/device id combination below, please also add
 * your card/environment details and information that could
 * help in debugging to the bug tracking this issue
 */
static const struct {
    uint32_t vendor;
    uint32_t device;
} romblacklist[] = {
    { 0x14e4, 0x168e }, /* Broadcom BCM 57810 */
};
  60
  61bool vfio_blacklist_opt_rom(VFIOPCIDevice *vdev)
  62{
  63    int i;
  64
  65    for (i = 0 ; i < ARRAY_SIZE(romblacklist); i++) {
  66        if (vfio_pci_is(vdev, romblacklist[i].vendor, romblacklist[i].device)) {
  67            trace_vfio_quirk_rom_blacklisted(vdev->vbasedev.name,
  68                                             romblacklist[i].vendor,
  69                                             romblacklist[i].device);
  70            return true;
  71        }
  72    }
  73    return false;
  74}
  75
  76/*
  77 * Device specific region quirks (mostly backdoors to PCI config space)
  78 */
  79
  80/*
  81 * The generic window quirks operate on an address and data register,
  82 * vfio_generic_window_address_quirk handles the address register and
  83 * vfio_generic_window_data_quirk handles the data register.  These ops
  84 * pass reads and writes through to hardware until a value matching the
  85 * stored address match/mask is written.  When this occurs, the data
  86 * register access emulated PCI config space for the device rather than
  87 * passing through accesses.  This enables devices where PCI config space
  88 * is accessible behind a window register to maintain the virtualization
  89 * provided through vfio.
  90 */
typedef struct VFIOConfigWindowMatch {
    uint32_t match;     /* value that arms the window when written */
    uint32_t mask;      /* bits of the written value forming the config offset */
} VFIOConfigWindowMatch;

typedef struct VFIOConfigWindowQuirk {
    struct VFIOPCIDevice *vdev;

    uint32_t address_val;       /* config offset latched while window armed */

    uint32_t address_offset;    /* BAR offset of the address register */
    uint32_t data_offset;       /* BAR offset of the data register */

    bool window_enabled;        /* true after an address write matched */
    uint8_t bar;                /* BAR index the quirk overlays */

    MemoryRegion *addr_mem;     /* quirk subregion over the address register */
    MemoryRegion *data_mem;     /* quirk subregion over the data register */

    uint32_t nr_matches;
    VFIOConfigWindowMatch matches[];    /* flexible array of match entries */
} VFIOConfigWindowQuirk;
 113
 114static uint64_t vfio_generic_window_quirk_address_read(void *opaque,
 115                                                       hwaddr addr,
 116                                                       unsigned size)
 117{
 118    VFIOConfigWindowQuirk *window = opaque;
 119    VFIOPCIDevice *vdev = window->vdev;
 120
 121    return vfio_region_read(&vdev->bars[window->bar].region,
 122                            addr + window->address_offset, size);
 123}
 124
 125static void vfio_generic_window_quirk_address_write(void *opaque, hwaddr addr,
 126                                                    uint64_t data,
 127                                                    unsigned size)
 128{
 129    VFIOConfigWindowQuirk *window = opaque;
 130    VFIOPCIDevice *vdev = window->vdev;
 131    int i;
 132
 133    window->window_enabled = false;
 134
 135    vfio_region_write(&vdev->bars[window->bar].region,
 136                      addr + window->address_offset, data, size);
 137
 138    for (i = 0; i < window->nr_matches; i++) {
 139        if ((data & ~window->matches[i].mask) == window->matches[i].match) {
 140            window->window_enabled = true;
 141            window->address_val = data & window->matches[i].mask;
 142            trace_vfio_quirk_generic_window_address_write(vdev->vbasedev.name,
 143                                    memory_region_name(window->addr_mem), data);
 144            break;
 145        }
 146    }
 147}
 148
/* Little-endian MMIO ops backing the generic window address register */
static const MemoryRegionOps vfio_generic_window_address_quirk = {
    .read = vfio_generic_window_quirk_address_read,
    .write = vfio_generic_window_quirk_address_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 154
 155static uint64_t vfio_generic_window_quirk_data_read(void *opaque,
 156                                                    hwaddr addr, unsigned size)
 157{
 158    VFIOConfigWindowQuirk *window = opaque;
 159    VFIOPCIDevice *vdev = window->vdev;
 160    uint64_t data;
 161
 162    /* Always read data reg, discard if window enabled */
 163    data = vfio_region_read(&vdev->bars[window->bar].region,
 164                            addr + window->data_offset, size);
 165
 166    if (window->window_enabled) {
 167        data = vfio_pci_read_config(&vdev->pdev, window->address_val, size);
 168        trace_vfio_quirk_generic_window_data_read(vdev->vbasedev.name,
 169                                    memory_region_name(window->data_mem), data);
 170    }
 171
 172    return data;
 173}
 174
 175static void vfio_generic_window_quirk_data_write(void *opaque, hwaddr addr,
 176                                                 uint64_t data, unsigned size)
 177{
 178    VFIOConfigWindowQuirk *window = opaque;
 179    VFIOPCIDevice *vdev = window->vdev;
 180
 181    if (window->window_enabled) {
 182        vfio_pci_write_config(&vdev->pdev, window->address_val, data, size);
 183        trace_vfio_quirk_generic_window_data_write(vdev->vbasedev.name,
 184                                    memory_region_name(window->data_mem), data);
 185        return;
 186    }
 187
 188    vfio_region_write(&vdev->bars[window->bar].region,
 189                      addr + window->data_offset, data, size);
 190}
 191
/* Little-endian MMIO ops backing the generic window data register */
static const MemoryRegionOps vfio_generic_window_data_quirk = {
    .read = vfio_generic_window_quirk_data_read,
    .write = vfio_generic_window_quirk_data_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 197
/*
 * The generic mirror quirk handles devices which expose PCI config space
 * through a region within a BAR.  When enabled, reads and writes are
 * redirected through to emulated PCI config space.  XXX if PCI config space
 * used memory regions, this could just be an alias.
 */
typedef struct VFIOConfigMirrorQuirk {
    struct VFIOPCIDevice *vdev;
    uint32_t offset;    /* offset of the config mirror within the BAR */
    uint8_t bar;        /* BAR index hosting the mirror */
    MemoryRegion *mem;  /* quirk subregion overlaying the mirror */
    uint8_t data[];     /* optional per-user trailing data */
} VFIOConfigMirrorQuirk;
 211
 212static uint64_t vfio_generic_quirk_mirror_read(void *opaque,
 213                                               hwaddr addr, unsigned size)
 214{
 215    VFIOConfigMirrorQuirk *mirror = opaque;
 216    VFIOPCIDevice *vdev = mirror->vdev;
 217    uint64_t data;
 218
 219    /* Read and discard in case the hardware cares */
 220    (void)vfio_region_read(&vdev->bars[mirror->bar].region,
 221                           addr + mirror->offset, size);
 222
 223    data = vfio_pci_read_config(&vdev->pdev, addr, size);
 224    trace_vfio_quirk_generic_mirror_read(vdev->vbasedev.name,
 225                                         memory_region_name(mirror->mem),
 226                                         addr, data);
 227    return data;
 228}
 229
 230static void vfio_generic_quirk_mirror_write(void *opaque, hwaddr addr,
 231                                            uint64_t data, unsigned size)
 232{
 233    VFIOConfigMirrorQuirk *mirror = opaque;
 234    VFIOPCIDevice *vdev = mirror->vdev;
 235
 236    vfio_pci_write_config(&vdev->pdev, addr, data, size);
 237    trace_vfio_quirk_generic_mirror_write(vdev->vbasedev.name,
 238                                          memory_region_name(mirror->mem),
 239                                          addr, data);
 240}
 241
/* Little-endian MMIO ops backing the generic config space mirror */
static const MemoryRegionOps vfio_generic_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_generic_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 247
 248/* Is range1 fully contained within range2?  */
 249static bool vfio_range_contained(uint64_t first1, uint64_t len1,
 250                                 uint64_t first2, uint64_t len2) {
 251    return (first1 >= first2 && first1 + len1 <= first2 + len2);
 252}
 253
 254#define PCI_VENDOR_ID_ATI               0x1002
 255
 256/*
 257 * Radeon HD cards (HD5450 & HD7850) report the upper byte of the I/O port BAR
 258 * through VGA register 0x3c3.  On newer cards, the I/O port BAR is always
 259 * BAR4 (older cards like the X550 used BAR1, but we don't care to support
 260 * those).  Note that on bare metal, a read of 0x3c3 doesn't always return the
 261 * I/O port BAR address.  Originally this was coded to return the virtual BAR
 262 * address only if the physical register read returns the actual BAR address,
 263 * but users have reported greater success if we return the virtual address
 264 * unconditionally.
 265 */
 266static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
 267                                        hwaddr addr, unsigned size)
 268{
 269    VFIOPCIDevice *vdev = opaque;
 270    uint64_t data = vfio_pci_read_config(&vdev->pdev,
 271                                         PCI_BASE_ADDRESS_4 + 1, size);
 272
 273    trace_vfio_quirk_ati_3c3_read(vdev->vbasedev.name, data);
 274
 275    return data;
 276}
 277
/* Read-only ops for VGA register 0x3c3; writes fall through to hardware */
static const MemoryRegionOps vfio_ati_3c3_quirk = {
    .read = vfio_ati_3c3_quirk_read,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 282
 283static VFIOQuirk *vfio_quirk_alloc(int nr_mem)
 284{
 285    VFIOQuirk *quirk = g_new0(VFIOQuirk, 1);
 286    QLIST_INIT(&quirk->ioeventfds);
 287    quirk->mem = g_new0(MemoryRegion, nr_mem);
 288    quirk->nr_mem = nr_mem;
 289
 290    return quirk;
 291}
 292
/*
 * Tear down a single ioeventfd: unlink it from its quirk list, remove the
 * KVM eventfd binding from the MemoryRegion, and release whichever side
 * (vfio kernel driver or userspace handler) was consuming notifications.
 */
static void vfio_ioeventfd_exit(VFIOPCIDevice *vdev, VFIOIOEventFD *ioeventfd)
{
    QLIST_REMOVE(ioeventfd, next);
    memory_region_del_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);

    if (ioeventfd->vfio) {
        /* Kernel-side handler: ask vfio to drop the eventfd (fd = -1) */
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = -1;

        if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd)) {
            error_report("Failed to remove vfio ioeventfd for %s+0x%"
                         HWADDR_PRIx"[%d]:0x%"PRIx64" (%m)",
                         memory_region_name(ioeventfd->mr), ioeventfd->addr,
                         ioeventfd->size, ioeventfd->data);
        }
    } else {
        /* Userspace handler: stop polling the notifier fd */
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            NULL, NULL, NULL);
    }

    event_notifier_cleanup(&ioeventfd->e);
    trace_vfio_ioeventfd_exit(memory_region_name(ioeventfd->mr),
                              (uint64_t)ioeventfd->addr, ioeventfd->size,
                              ioeventfd->data);
    g_free(ioeventfd);
}
 326
 327static void vfio_drop_dynamic_eventfds(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 328{
 329    VFIOIOEventFD *ioeventfd, *tmp;
 330
 331    QLIST_FOREACH_SAFE(ioeventfd, &quirk->ioeventfds, next, tmp) {
 332        if (ioeventfd->dynamic) {
 333            vfio_ioeventfd_exit(vdev, ioeventfd);
 334        }
 335    }
 336}
 337
 338static void vfio_ioeventfd_handler(void *opaque)
 339{
 340    VFIOIOEventFD *ioeventfd = opaque;
 341
 342    if (event_notifier_test_and_clear(&ioeventfd->e)) {
 343        vfio_region_write(ioeventfd->region, ioeventfd->region_addr,
 344                          ioeventfd->data, ioeventfd->size);
 345        trace_vfio_ioeventfd_handler(memory_region_name(ioeventfd->mr),
 346                                     (uint64_t)ioeventfd->addr, ioeventfd->size,
 347                                     ioeventfd->data);
 348    }
 349}
 350
/*
 * Create an ioeventfd matching writes of @data at @addr/@size within @mr and
 * wire its consumer: preferably the vfio kernel driver (writes the hardware
 * register in-kernel), otherwise a userspace handler that replays the write
 * through vfio_region_write().  Returns NULL if ioeventfds are disabled for
 * the device or the notifier cannot be created; caller owns the result and
 * releases it with vfio_ioeventfd_exit().
 */
static VFIOIOEventFD *vfio_ioeventfd_init(VFIOPCIDevice *vdev,
                                          MemoryRegion *mr, hwaddr addr,
                                          unsigned size, uint64_t data,
                                          VFIORegion *region,
                                          hwaddr region_addr, bool dynamic)
{
    VFIOIOEventFD *ioeventfd;

    if (vdev->no_kvm_ioeventfd) {
        return NULL;
    }

    ioeventfd = g_malloc0(sizeof(*ioeventfd));

    if (event_notifier_init(&ioeventfd->e, 0)) {
        g_free(ioeventfd);
        return NULL;
    }

    /*
     * MemoryRegion and relative offset, plus additional ioeventfd setup
     * parameters for configuring and later tearing down KVM ioeventfd.
     */
    ioeventfd->mr = mr;
    ioeventfd->addr = addr;
    ioeventfd->size = size;
    ioeventfd->data = data;
    ioeventfd->dynamic = dynamic;
    /*
     * VFIORegion and relative offset for implementing the userspace
     * handler.  data & size fields shared for both uses.
     */
    ioeventfd->region = region;
    ioeventfd->region_addr = region_addr;

    if (!vdev->no_vfio_ioeventfd) {
        /* Try the fast path: let the vfio kernel driver consume the eventfd */
        struct vfio_device_ioeventfd vfio_ioeventfd;

        vfio_ioeventfd.argsz = sizeof(vfio_ioeventfd);
        vfio_ioeventfd.flags = ioeventfd->size;
        vfio_ioeventfd.data = ioeventfd->data;
        vfio_ioeventfd.offset = ioeventfd->region->fd_offset +
                                ioeventfd->region_addr;
        vfio_ioeventfd.fd = event_notifier_get_fd(&ioeventfd->e);

        /* vfio handling is active only if the ioctl succeeds */
        ioeventfd->vfio = !ioctl(vdev->vbasedev.fd,
                                 VFIO_DEVICE_IOEVENTFD, &vfio_ioeventfd);
    }

    if (!ioeventfd->vfio) {
        /* Fall back to polling the notifier fd in userspace */
        qemu_set_fd_handler(event_notifier_get_fd(&ioeventfd->e),
                            vfio_ioeventfd_handler, NULL, ioeventfd);
    }

    memory_region_add_eventfd(ioeventfd->mr, ioeventfd->addr, ioeventfd->size,
                              true, ioeventfd->data, &ioeventfd->e);
    trace_vfio_ioeventfd_init(memory_region_name(mr), (uint64_t)addr,
                              size, data, ioeventfd->vfio);

    return ioeventfd;
}
 412
/*
 * Install the ATI 0x3c3 quirk (see comment above vfio_ati_3c3_quirk_read)
 * for ATI devices with an I/O port BAR4 of at least 256 bytes.
 */
static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;

    /*
     * As long as the BAR is >= 256 bytes it will be aligned such that the
     * lower byte is always zero.  Filter out anything else, if it exists.
     */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->bars[4].ioport || vdev->bars[4].region.size < 256) {
        return;
    }

    quirk = vfio_quirk_alloc(1);

    /* Single-byte region overlaying VGA I/O port 0x3c3 */
    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
                          "vfio-ati-3c3-quirk", 1);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                3 /* offset 3 bytes from 0x3c0 */, quirk->mem);

    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_quirk_ati_3c3_probe(vdev->vbasedev.name);
}
 438
 439/*
 440 * Newer ATI/AMD devices, including HD5450 and HD7850, have a mirror to PCI
 441 * config space through MMIO BAR2 at offset 0x4000.  Nothing seems to access
 442 * the MMIO space directly, but a window to this space is provided through
 443 * I/O port BAR4.  Offset 0x0 is the address register and offset 0x4 is the
 444 * data register.  When the address is programmed to a range of 0x4000-0x4fff
 445 * PCI configuration space is available.  Experimentation seems to indicate
 446 * that read-only may be provided by hardware.
 447 */
/*
 * Install the BAR4 address/data window quirk (see comment above) for ATI
 * devices with VGA.  Address register at offset 0x0, data at 0x4; a match
 * on 0x4000 redirects data accesses into emulated PCI config space.
 */
static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigWindowQuirk *window;

    /* This window doesn't seem to be used except by legacy VGA code */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->vga || nr != 4) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    /* One trailing match entry for the flexible array member */
    window = quirk->data = g_malloc0(sizeof(*window) +
                                     sizeof(VFIOConfigWindowMatch));
    window->vdev = vdev;
    window->address_offset = 0;
    window->data_offset = 4;
    window->nr_matches = 1;
    window->matches[0].match = 0x4000;
    window->matches[0].mask = vdev->config_size - 1;
    window->bar = nr;
    window->addr_mem = &quirk->mem[0];
    window->data_mem = &quirk->mem[1];

    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-ati-bar4-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->address_offset,
                                        window->addr_mem, 1);

    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-ati-bar4-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->data_offset,
                                        window->data_mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar4_probe(vdev->vbasedev.name);
}
 490
 491/*
 492 * Trap the BAR2 MMIO mirror to config space as well.
 493 */
/*
 * Install the BAR2 config space mirror quirk: trap the PCI_CONFIG_SPACE_SIZE
 * byte mirror at BAR2 offset 0x4000 on newer (64-bit BAR2) ATI devices.
 */
static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;

    /* Only enable on newer devices where BAR2 is 64bit */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) ||
        !vdev->vga || nr != 2 || !vdev->bars[2].mem64) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    mirror = quirk->data = g_malloc0(sizeof(*mirror));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x4000;    /* config mirror location within BAR2 */
    mirror->bar = nr;

    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_generic_mirror_quirk, mirror,
                          "vfio-ati-bar2-4000-quirk", PCI_CONFIG_SPACE_SIZE);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_ati_bar2_probe(vdev->vbasedev.name);
}
 522
 523/*
 524 * Older ATI/AMD cards like the X550 have a similar window to that above.
 525 * I/O port BAR1 provides a window to a mirror of PCI config space located
 526 * in BAR2 at offset 0xf00.  We don't care to support such older cards, but
 527 * note it for future reference.
 528 */
 529
 530/*
 531 * Nvidia has several different methods to get to config space, the
 * nouveau project has several of these documented here:
 533 * https://github.com/pathscale/envytools/tree/master/hwdocs
 534 *
 535 * The first quirk is actually not documented in envytools and is found
 536 * on 10de:01d1 (NVIDIA Corporation G72 [GeForce 7300 LE]).  This is an
 537 * NV46 chipset.  The backdoor uses the legacy VGA I/O ports to access
 538 * the mirror of PCI config space found at BAR0 offset 0x1800.  The access
 539 * sequence first writes 0x338 to I/O port 0x3d4.  The target offset is
 540 * then written to 0x3d0.  Finally 0x538 is written for a read and 0x738
 541 * is written for a write to 0x3d4.  The BAR0 offset is then accessible
 542 * through 0x3d0.  This quirk doesn't seem to be necessary on newer cards
 543 * that use the I/O port BAR5 window but it doesn't hurt to leave it.
 544 */
/* State machine for the NVIDIA 0x3d4/0x3d0 config space backdoor */
typedef enum {NONE = 0, SELECT, WINDOW, READ, WRITE} VFIONvidia3d0State;
/* Names indexed by VFIONvidia3d0State, for tracing only */
static const char *nv3d0_states[] = { "NONE", "SELECT",
                                      "WINDOW", "READ", "WRITE" };

typedef struct VFIONvidia3d0Quirk {
    VFIOPCIDevice *vdev;
    VFIONvidia3d0State state;   /* current backdoor state */
    uint32_t offset;            /* BAR0 offset latched during SELECT */
} VFIONvidia3d0Quirk;
 554
 555static uint64_t vfio_nvidia_3d4_quirk_read(void *opaque,
 556                                           hwaddr addr, unsigned size)
 557{
 558    VFIONvidia3d0Quirk *quirk = opaque;
 559    VFIOPCIDevice *vdev = quirk->vdev;
 560
 561    quirk->state = NONE;
 562
 563    return vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 564                         addr + 0x14, size);
 565}
 566
 567static void vfio_nvidia_3d4_quirk_write(void *opaque, hwaddr addr,
 568                                        uint64_t data, unsigned size)
 569{
 570    VFIONvidia3d0Quirk *quirk = opaque;
 571    VFIOPCIDevice *vdev = quirk->vdev;
 572    VFIONvidia3d0State old_state = quirk->state;
 573
 574    quirk->state = NONE;
 575
 576    switch (data) {
 577    case 0x338:
 578        if (old_state == NONE) {
 579            quirk->state = SELECT;
 580            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 581                                              nv3d0_states[quirk->state]);
 582        }
 583        break;
 584    case 0x538:
 585        if (old_state == WINDOW) {
 586            quirk->state = READ;
 587            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 588                                              nv3d0_states[quirk->state]);
 589        }
 590        break;
 591    case 0x738:
 592        if (old_state == WINDOW) {
 593            quirk->state = WRITE;
 594            trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 595                                              nv3d0_states[quirk->state]);
 596        }
 597        break;
 598    }
 599
 600    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 601                   addr + 0x14, data, size);
 602}
 603
/* Little-endian ops overlaying VGA register 0x3d4 */
static const MemoryRegionOps vfio_nvidia_3d4_quirk = {
    .read = vfio_nvidia_3d4_quirk_read,
    .write = vfio_nvidia_3d4_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 609
 610static uint64_t vfio_nvidia_3d0_quirk_read(void *opaque,
 611                                           hwaddr addr, unsigned size)
 612{
 613    VFIONvidia3d0Quirk *quirk = opaque;
 614    VFIOPCIDevice *vdev = quirk->vdev;
 615    VFIONvidia3d0State old_state = quirk->state;
 616    uint64_t data = vfio_vga_read(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 617                                  addr + 0x10, size);
 618
 619    quirk->state = NONE;
 620
 621    if (old_state == READ &&
 622        (quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 623        uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 624
 625        data = vfio_pci_read_config(&vdev->pdev, offset, size);
 626        trace_vfio_quirk_nvidia_3d0_read(vdev->vbasedev.name,
 627                                         offset, size, data);
 628    }
 629
 630    return data;
 631}
 632
 633static void vfio_nvidia_3d0_quirk_write(void *opaque, hwaddr addr,
 634                                        uint64_t data, unsigned size)
 635{
 636    VFIONvidia3d0Quirk *quirk = opaque;
 637    VFIOPCIDevice *vdev = quirk->vdev;
 638    VFIONvidia3d0State old_state = quirk->state;
 639
 640    quirk->state = NONE;
 641
 642    if (old_state == SELECT) {
 643        quirk->offset = (uint32_t)data;
 644        quirk->state = WINDOW;
 645        trace_vfio_quirk_nvidia_3d0_state(vdev->vbasedev.name,
 646                                          nv3d0_states[quirk->state]);
 647    } else if (old_state == WRITE) {
 648        if ((quirk->offset & ~(PCI_CONFIG_SPACE_SIZE - 1)) == 0x1800) {
 649            uint8_t offset = quirk->offset & (PCI_CONFIG_SPACE_SIZE - 1);
 650
 651            vfio_pci_write_config(&vdev->pdev, offset, data, size);
 652            trace_vfio_quirk_nvidia_3d0_write(vdev->vbasedev.name,
 653                                              offset, data, size);
 654            return;
 655        }
 656    }
 657
 658    vfio_vga_write(&vdev->vga->region[QEMU_PCI_VGA_IO_HI],
 659                   addr + 0x10, data, size);
 660}
 661
/* Little-endian ops overlaying VGA register 0x3d0 */
static const MemoryRegionOps vfio_nvidia_3d0_quirk = {
    .read = vfio_nvidia_3d0_quirk_read,
    .write = vfio_nvidia_3d0_quirk_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 667
/*
 * Install the NVIDIA 0x3d4/0x3d0 VGA backdoor quirk (see comment above) on
 * NVIDIA devices, unless GeForce quirks are disabled for the device.
 */
static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
{
    VFIOQuirk *quirk;
    VFIONvidia3d0Quirk *data;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->bars[1].region.size) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    quirk->data = data = g_malloc0(sizeof(*data));
    data->vdev = vdev;

    /* 2-byte regions overlaying 0x3d4 (state) and 0x3d0 (window data) */
    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_nvidia_3d4_quirk,
                          data, "vfio-nvidia-3d4-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x14 /* 0x3c0 + 0x14 */, &quirk->mem[0]);

    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_nvidia_3d0_quirk,
                          data, "vfio-nvidia-3d0-quirk", 2);
    memory_region_add_subregion(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem,
                                0x10 /* 0x3c0 + 0x10 */, &quirk->mem[1]);

    QLIST_INSERT_HEAD(&vdev->vga->region[QEMU_PCI_VGA_IO_HI].quirks,
                      quirk, next);

    trace_vfio_quirk_nvidia_3d0_probe(vdev->vbasedev.name);
}
 698
 699/*
 700 * The second quirk is documented in envytools.  The I/O port BAR5 is just
 701 * a set of address/data ports to the MMIO BARs.  The BAR we care about is
 702 * again BAR0.  This backdoor is apparently a bit newer than the one above
 703 * so we need to not only trap 256 bytes @0x1800, but all of PCI config
 704 * space, including extended space is available at the 4k @0x88000.
 705 */
typedef struct VFIONvidiaBAR5Quirk {
    uint32_t master;        /* last value written to the master register */
    uint32_t enable;        /* last value written to the enable register */
    MemoryRegion *addr_mem; /* window address register subregion */
    MemoryRegion *data_mem; /* window data register subregion */
    bool enabled;           /* whether the window quirk is currently active */
    VFIOConfigWindowQuirk window; /* last for match data */
} VFIONvidiaBAR5Quirk;
 714
 715static void vfio_nvidia_bar5_enable(VFIONvidiaBAR5Quirk *bar5)
 716{
 717    VFIOPCIDevice *vdev = bar5->window.vdev;
 718
 719    if (((bar5->master & bar5->enable) & 0x1) == bar5->enabled) {
 720        return;
 721    }
 722
 723    bar5->enabled = !bar5->enabled;
 724    trace_vfio_quirk_nvidia_bar5_state(vdev->vbasedev.name,
 725                                       bar5->enabled ?  "Enable" : "Disable");
 726    memory_region_set_enabled(bar5->addr_mem, bar5->enabled);
 727    memory_region_set_enabled(bar5->data_mem, bar5->enabled);
 728}
 729
 730static uint64_t vfio_nvidia_bar5_quirk_master_read(void *opaque,
 731                                                   hwaddr addr, unsigned size)
 732{
 733    VFIONvidiaBAR5Quirk *bar5 = opaque;
 734    VFIOPCIDevice *vdev = bar5->window.vdev;
 735
 736    return vfio_region_read(&vdev->bars[5].region, addr, size);
 737}
 738
 739static void vfio_nvidia_bar5_quirk_master_write(void *opaque, hwaddr addr,
 740                                                uint64_t data, unsigned size)
 741{
 742    VFIONvidiaBAR5Quirk *bar5 = opaque;
 743    VFIOPCIDevice *vdev = bar5->window.vdev;
 744
 745    vfio_region_write(&vdev->bars[5].region, addr, data, size);
 746
 747    bar5->master = data;
 748    vfio_nvidia_bar5_enable(bar5);
 749}
 750
/* Little-endian ops for the BAR5 master register (offset 0) */
static const MemoryRegionOps vfio_nvidia_bar5_quirk_master = {
    .read = vfio_nvidia_bar5_quirk_master_read,
    .write = vfio_nvidia_bar5_quirk_master_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 756
 757static uint64_t vfio_nvidia_bar5_quirk_enable_read(void *opaque,
 758                                                   hwaddr addr, unsigned size)
 759{
 760    VFIONvidiaBAR5Quirk *bar5 = opaque;
 761    VFIOPCIDevice *vdev = bar5->window.vdev;
 762
 763    return vfio_region_read(&vdev->bars[5].region, addr + 4, size);
 764}
 765
 766static void vfio_nvidia_bar5_quirk_enable_write(void *opaque, hwaddr addr,
 767                                                uint64_t data, unsigned size)
 768{
 769    VFIONvidiaBAR5Quirk *bar5 = opaque;
 770    VFIOPCIDevice *vdev = bar5->window.vdev;
 771
 772    vfio_region_write(&vdev->bars[5].region, addr + 4, data, size);
 773
 774    bar5->enable = data;
 775    vfio_nvidia_bar5_enable(bar5);
 776}
 777
/* Little-endian ops for the BAR5 enable register (offset 4) */
static const MemoryRegionOps vfio_nvidia_bar5_quirk_enable = {
    .read = vfio_nvidia_bar5_quirk_enable_read,
    .write = vfio_nvidia_bar5_quirk_enable_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 783
/*
 * Install the NVIDIA BAR5 window quirk (see comment above): address/data
 * registers at BAR5 offsets 0x8/0xc, with matches for the 256-byte config
 * mirror at 0x1800 and the full (extended) config mirror at 0x88000.  The
 * window is gated by the master (offset 0) and enable (offset 4) registers.
 */
static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIONvidiaBAR5Quirk *bar5;
    VFIOConfigWindowQuirk *window;

    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vdev->vga || nr != 5 || !vdev->bars[5].ioport) {
        return;
    }

    quirk = vfio_quirk_alloc(4);
    /* Two trailing match entries for the window's flexible array member */
    bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
                                   (sizeof(VFIOConfigWindowMatch) * 2));
    window = &bar5->window;

    window->vdev = vdev;
    window->address_offset = 0x8;
    window->data_offset = 0xc;
    window->nr_matches = 2;
    window->matches[0].match = 0x1800;
    window->matches[0].mask = PCI_CONFIG_SPACE_SIZE - 1;
    window->matches[1].match = 0x88000;
    window->matches[1].mask = vdev->config_size - 1;
    window->bar = nr;
    window->addr_mem = bar5->addr_mem = &quirk->mem[0];
    window->data_mem = bar5->data_mem = &quirk->mem[1];

    memory_region_init_io(window->addr_mem, OBJECT(vdev),
                          &vfio_generic_window_address_quirk, window,
                          "vfio-nvidia-bar5-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->address_offset,
                                        window->addr_mem, 1);
    /* Disabled until the master/enable registers activate the window */
    memory_region_set_enabled(window->addr_mem, false);

    memory_region_init_io(window->data_mem, OBJECT(vdev),
                          &vfio_generic_window_data_quirk, window,
                          "vfio-nvidia-bar5-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        window->data_offset,
                                        window->data_mem, 1);
    memory_region_set_enabled(window->data_mem, false);

    memory_region_init_io(&quirk->mem[2], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_master, bar5,
                          "vfio-nvidia-bar5-master-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0, &quirk->mem[2], 1);

    memory_region_init_io(&quirk->mem[3], OBJECT(vdev),
                          &vfio_nvidia_bar5_quirk_enable, bar5,
                          "vfio-nvidia-bar5-enable-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        4, &quirk->mem[3], 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_nvidia_bar5_probe(vdev->vbasedev.name);
}
 845
/*
 * Per-mirror state used to detect repeated identical writes and promote
 * them to KVM ioeventfds (see vfio_nvidia_quirk_mirror_write).
 */
typedef struct LastDataSet {
    VFIOQuirk *quirk;  /* owning quirk; dynamic ioeventfds chain onto it */
    hwaddr addr;       /* address of the last write observed */
    uint64_t data;     /* data of the last write observed */
    unsigned size;     /* size of the last write observed */
    int hits;          /* consecutive identical writes seen so far */
    int added;         /* ioeventfds added; counts to limit + 1 then stops */
} LastDataSet;

#define MAX_DYN_IOEVENTFD 10  /* max dynamically added ioeventfds per mirror */
#define HITS_FOR_IOEVENTFD 10 /* identical writes needed before adding one */
 857
/*
 * Finally, BAR0 itself.  We want to redirect any accesses to either
 * 0x1800 or 0x88000 through the PCI config space access functions.
 */
static void vfio_nvidia_quirk_mirror_write(void *opaque, hwaddr addr,
                                           uint64_t data, unsigned size)
{
    VFIOConfigMirrorQuirk *mirror = opaque;
    VFIOPCIDevice *vdev = mirror->vdev;
    PCIDevice *pdev = &vdev->pdev;
    /* Tracking state lives in the flexible tail of the mirror quirk data */
    LastDataSet *last = (LastDataSet *)&mirror->data;

    /* Route the access through emulated config space first */
    vfio_generic_quirk_mirror_write(opaque, addr, data, size);

    /*
     * Nvidia seems to acknowledge MSI interrupts by writing 0xff to the
     * MSI capability ID register.  Both the ID and next register are
     * read-only, so we allow writes covering either of those to real hw.
     */
    if ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
        vfio_range_contained(addr, size, pdev->msi_cap, PCI_MSI_FLAGS)) {
        vfio_region_write(&vdev->bars[mirror->bar].region,
                          addr + mirror->offset, data, size);
        trace_vfio_quirk_nvidia_bar0_msi_ack(vdev->vbasedev.name);
    }

    /*
     * Automatically add an ioeventfd to handle any repeated write with the
     * same data and size above the standard PCI config space header.  This is
     * primarily expected to accelerate the MSI-ACK behavior, such as noted
     * above.  Current hardware/drivers should trigger an ioeventfd at config
     * offset 0x704 (region offset 0x88704), with data 0x0, size 4.
     *
     * The criteria of 10 successive hits is arbitrary but reliably adds the
     * MSI-ACK region.  Note that as some writes are bypassed via the ioeventfd,
     * the remaining ones have a greater chance of being seen successively.
     * To avoid the pathological case of burning up all of QEMU's open file
     * handles, arbitrarily limit this algorithm from adding no more than 10
     * ioeventfds, print an error if we would have added an 11th, and then
     * stop counting.
     */
    /* '<=' allows the warn_report below to fire once before counting stops */
    if (!vdev->no_kvm_ioeventfd &&
        addr >= PCI_STD_HEADER_SIZEOF && last->added <= MAX_DYN_IOEVENTFD) {
        if (addr != last->addr || data != last->data || size != last->size) {
            /* Different write: restart the hit counter for this tuple */
            last->addr = addr;
            last->data = data;
            last->size = size;
            last->hits = 1;
        } else if (++last->hits >= HITS_FOR_IOEVENTFD) {
            if (last->added < MAX_DYN_IOEVENTFD) {
                VFIOIOEventFD *ioeventfd;
                ioeventfd = vfio_ioeventfd_init(vdev, mirror->mem, addr, size,
                                        data, &vdev->bars[mirror->bar].region,
                                        mirror->offset + addr, true);
                if (ioeventfd) {
                    VFIOQuirk *quirk = last->quirk;

                    QLIST_INSERT_HEAD(&quirk->ioeventfds, ioeventfd, next);
                    last->added++;
                }
            } else {
                /* Bump past the limit so the warning is only printed once */
                last->added++;
                warn_report("NVIDIA ioeventfd queue full for %s, unable to "
                            "accelerate 0x%"HWADDR_PRIx", data 0x%"PRIx64", "
                            "size %u", vdev->vbasedev.name, addr, data, size);
            }
        }
    }
}
 927
/* MMIO ops for the NVIDIA BAR0 config-space mirror windows */
static const MemoryRegionOps vfio_nvidia_mirror_quirk = {
    .read = vfio_generic_quirk_mirror_read,
    .write = vfio_nvidia_quirk_mirror_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
 933
 934static void vfio_nvidia_bar0_quirk_reset(VFIOPCIDevice *vdev, VFIOQuirk *quirk)
 935{
 936    VFIOConfigMirrorQuirk *mirror = quirk->data;
 937    LastDataSet *last = (LastDataSet *)&mirror->data;
 938
 939    last->addr = last->data = last->size = last->hits = last->added = 0;
 940
 941    vfio_drop_dynamic_eventfds(vdev, quirk);
 942}
 943
/*
 * Install the NVIDIA BAR0 config-space mirror quirks.  A mirror of the full
 * config space is placed at BAR0 + 0x88000; for devices with a VGA region a
 * second mirror of standard config space is placed at BAR0 + 0x1800.  Both
 * quirks share the reset hook that drops dynamically added ioeventfds.
 */
static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOConfigMirrorQuirk *mirror;
    LastDataSet *last;

    /* Only NVIDIA VGA-class devices, and only when probing BAR0 */
    if (vdev->no_geforce_quirks ||
        !vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) ||
        !vfio_is_vga(vdev) || nr != 0) {
        return;
    }

    quirk = vfio_quirk_alloc(1);
    quirk->reset = vfio_nvidia_bar0_quirk_reset;
    /* LastDataSet rides in the flexible tail of the mirror quirk data */
    mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
    mirror->mem = quirk->mem;
    mirror->vdev = vdev;
    mirror->offset = 0x88000;
    mirror->bar = nr;
    last = (LastDataSet *)&mirror->data;
    last->quirk = quirk;

    memory_region_init_io(mirror->mem, OBJECT(vdev),
                          &vfio_nvidia_mirror_quirk, mirror,
                          "vfio-nvidia-bar0-88000-mirror-quirk",
                          vdev->config_size);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        mirror->offset, mirror->mem, 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    /* The 0x1800 offset mirror only seems to get used by legacy VGA */
    if (vdev->vga) {
        quirk = vfio_quirk_alloc(1);
        quirk->reset = vfio_nvidia_bar0_quirk_reset;
        mirror = quirk->data = g_malloc0(sizeof(*mirror) + sizeof(LastDataSet));
        mirror->mem = quirk->mem;
        mirror->vdev = vdev;
        mirror->offset = 0x1800;
        mirror->bar = nr;
        last = (LastDataSet *)&mirror->data;
        last->quirk = quirk;

        /* This mirror only spans standard config space, not extended */
        memory_region_init_io(mirror->mem, OBJECT(vdev),
                              &vfio_nvidia_mirror_quirk, mirror,
                              "vfio-nvidia-bar0-1800-mirror-quirk",
                              PCI_CONFIG_SPACE_SIZE);
        memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                            mirror->offset, mirror->mem, 1);

        QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
    }

    trace_vfio_quirk_nvidia_bar0_probe(vdev->vbasedev.name);
}
 999
1000/*
1001 * TODO - Some Nvidia devices provide config access to their companion HDA
1002 * device and even to their parent bridge via these config space mirrors.
1003 * Add quirks for those regions.
1004 */
1005
#define PCI_VENDOR_ID_REALTEK 0x10ec /* Realtek PCI vendor ID */
1007
1008/*
1009 * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
1010 * offset 0x70 there is a dword data register, offset 0x74 is a dword address
1011 * register.  According to the Linux r8169 driver, the MSI-X table is addressed
1012 * when the "type" portion of the address register is set to 0x1.  This appears
1013 * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
1014 * "address latched" indicator.  Bits 12:15 are a mask field, which we can
1015 * ignore because the MSI-X table should always be accessed as a dword (full
1016 * mask).  Bits 0:11 is offset within the type.
1017 *
1018 * Example trace:
1019 *
1020 * Read from MSI-X table offset 0
1021 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
1022 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
1023 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
1024 *
1025 * Write 0xfee00000 to MSI-X table offset 0
1026 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
1027 * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
1028 * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
1029 */
/* State for the RTL8168 BAR2 MSI-X backdoor quirk described above */
typedef struct VFIOrtl8168Quirk {
    VFIOPCIDevice *vdev;
    uint32_t addr;    /* last address-register value written by the guest */
    uint32_t data;    /* last data-register value written by the guest */
    bool enabled;     /* last address write targeted the MSI-X table */
} VFIOrtl8168Quirk;
1036
1037static uint64_t vfio_rtl8168_quirk_address_read(void *opaque,
1038                                                hwaddr addr, unsigned size)
1039{
1040    VFIOrtl8168Quirk *rtl = opaque;
1041    VFIOPCIDevice *vdev = rtl->vdev;
1042    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x74, size);
1043
1044    if (rtl->enabled) {
1045        data = rtl->addr ^ 0x80000000U; /* latch/complete */
1046        trace_vfio_quirk_rtl8168_fake_latch(vdev->vbasedev.name, data);
1047    }
1048
1049    return data;
1050}
1051
/*
 * Write handler for the RTL8168 address register at BAR2 + 0x74.  Per the
 * protocol described above, a "type" of 0x1 (bits 16:30) selects the MSI-X
 * table and bit 31 triggers the write; such writes are redirected to the
 * emulated guest MSI-X table rather than hitting hardware.
 */
static void vfio_rtl8168_quirk_address_write(void *opaque, hwaddr addr,
                                             uint64_t data, unsigned size)
{
    VFIOrtl8168Quirk *rtl = opaque;
    VFIOPCIDevice *vdev = rtl->vdev;

    /* Any non-MSI-X address disables the backdoor emulation */
    rtl->enabled = false;

    if ((data & 0x7fff0000) == 0x10000) { /* MSI-X table */
        rtl->enabled = true;
        rtl->addr = (uint32_t)data;

        if (data & 0x80000000U) { /* Do write */
            if (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
                hwaddr offset = data & 0xfff;  /* offset within the table */
                uint64_t val = rtl->data;      /* previously latched data */

                trace_vfio_quirk_rtl8168_msix_write(vdev->vbasedev.name,
                                                    (uint16_t)offset, val);

                /* Write to the proper guest MSI-X table instead */
                memory_region_dispatch_write(&vdev->pdev.msix_table_mmio,
                                             offset, val, size,
                                             MEMTXATTRS_UNSPECIFIED);
            }
            return; /* Do not write guest MSI-X data to hardware */
        }
    }

    /* Everything else passes straight through to the device */
    vfio_region_write(&vdev->bars[2].region, addr + 0x74, data, size);
}
1083
/* MMIO ops for the RTL8168 address register; dword access only */
static const MemoryRegionOps vfio_rtl_address_quirk = {
    .read = vfio_rtl8168_quirk_address_read,
    .write = vfio_rtl8168_quirk_address_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1094
1095static uint64_t vfio_rtl8168_quirk_data_read(void *opaque,
1096                                             hwaddr addr, unsigned size)
1097{
1098    VFIOrtl8168Quirk *rtl = opaque;
1099    VFIOPCIDevice *vdev = rtl->vdev;
1100    uint64_t data = vfio_region_read(&vdev->bars[2].region, addr + 0x70, size);
1101
1102    if (rtl->enabled && (vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
1103        hwaddr offset = rtl->addr & 0xfff;
1104        memory_region_dispatch_read(&vdev->pdev.msix_table_mmio, offset,
1105                                    &data, size, MEMTXATTRS_UNSPECIFIED);
1106        trace_vfio_quirk_rtl8168_msix_read(vdev->vbasedev.name, offset, data);
1107    }
1108
1109    return data;
1110}
1111
1112static void vfio_rtl8168_quirk_data_write(void *opaque, hwaddr addr,
1113                                          uint64_t data, unsigned size)
1114{
1115    VFIOrtl8168Quirk *rtl = opaque;
1116    VFIOPCIDevice *vdev = rtl->vdev;
1117
1118    rtl->data = (uint32_t)data;
1119
1120    vfio_region_write(&vdev->bars[2].region, addr + 0x70, data, size);
1121}
1122
/* MMIO ops for the RTL8168 data register; dword access only */
static const MemoryRegionOps vfio_rtl_data_quirk = {
    .read = vfio_rtl8168_quirk_data_read,
    .write = vfio_rtl8168_quirk_data_write,
    .valid = {
        .min_access_size = 4,
        .max_access_size = 4,
        .unaligned = false,
    },
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1133
/*
 * Install the RTL8168 BAR2 quirk: overlay the backdoor address (0x74) and
 * data (0x70) registers with handlers that redirect MSI-X table accesses
 * to the emulated guest table.
 */
static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
{
    VFIOQuirk *quirk;
    VFIOrtl8168Quirk *rtl;

    /* Only the Realtek 8168, and only when probing BAR2 */
    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_REALTEK, 0x8168) || nr != 2) {
        return;
    }

    quirk = vfio_quirk_alloc(2);
    quirk->data = rtl = g_malloc0(sizeof(*rtl));
    rtl->vdev = vdev;

    memory_region_init_io(&quirk->mem[0], OBJECT(vdev),
                          &vfio_rtl_address_quirk, rtl,
                          "vfio-rtl8168-window-address-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0x74, &quirk->mem[0], 1);

    memory_region_init_io(&quirk->mem[1], OBJECT(vdev),
                          &vfio_rtl_data_quirk, rtl,
                          "vfio-rtl8168-window-data-quirk", 4);
    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
                                        0x70, &quirk->mem[1], 1);

    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);

    trace_vfio_quirk_rtl8168_probe(vdev->vbasedev.name);
}
1163
1164/*
1165 * Intel IGD support
1166 *
1167 * Obviously IGD is not a discrete device, this is evidenced not only by it
1168 * being integrated into the CPU, but by the various chipset and BIOS
1169 * dependencies that it brings along with it.  Intel is trying to move away
1170 * from this and Broadwell and newer devices can run in what Intel calls
1171 * "Universal Pass-Through" mode, or UPT.  Theoretically in UPT mode, nothing
1172 * more is required beyond assigning the IGD device to a VM.  There are
1173 * however support limitations to this mode.  It only supports IGD as a
1174 * secondary graphics device in the VM and it doesn't officially support any
1175 * physical outputs.
1176 *
1177 * The code here attempts to enable what we'll call legacy mode assignment,
1178 * IGD retains most of the capabilities we expect for it to have on bare
1179 * metal.  To enable this mode, the IGD device must be assigned to the VM
1180 * at PCI address 00:02.0, it must have a ROM, it very likely needs VGA
1181 * support, we must have VM BIOS support for reserving and populating some
1182 * of the required tables, and we need to tweak the chipset with revisions
1183 * and IDs and an LPC/ISA bridge device.  The intention is to make all of
1184 * this happen automatically by installing the device at the correct VM PCI
1185 * bus address.  If any of the conditions are not met, we cross our fingers
1186 * and hope the user knows better.
1187 *
1188 * NB - It is possible to enable physical outputs in UPT mode by supplying
1189 * an OpRegion table.  We don't do this by default because the guest driver
1190 * behaves differently if an OpRegion is provided and no monitor is attached
1191 * vs no OpRegion and a monitor being attached or not.  Effectively, if a
1192 * headless setup is desired, the OpRegion gets in the way of that.
1193 */
1194
1195/*
1196 * This presumes the device is already known to be an Intel VGA device, so we
1197 * take liberties in which device ID bits match which generation.  This should
1198 * not be taken as an indication that all the devices are supported, or even
1199 * supportable, some of them don't even support VT-d.
1200 * See linux:include/drm/i915_pciids.h for IDs.
1201 */
1202static int igd_gen(VFIOPCIDevice *vdev)
1203{
1204    if ((vdev->device_id & 0xfff) == 0xa84) {
1205        return 8; /* Broxton */
1206    }
1207
1208    switch (vdev->device_id & 0xff00) {
1209    /* Old, untested, unavailable, unknown */
1210    case 0x0000:
1211    case 0x2500:
1212    case 0x2700:
1213    case 0x2900:
1214    case 0x2a00:
1215    case 0x2e00:
1216    case 0x3500:
1217    case 0xa000:
1218        return -1;
1219    /* SandyBridge, IvyBridge, ValleyView, Haswell */
1220    case 0x0100:
1221    case 0x0400:
1222    case 0x0a00:
1223    case 0x0c00:
1224    case 0x0d00:
1225    case 0x0f00:
1226        return 6;
1227    /* BroadWell, CherryView, SkyLake, KabyLake */
1228    case 0x1600:
1229    case 0x1900:
1230    case 0x2200:
1231    case 0x5900:
1232        return 8;
1233    }
1234
1235    return 8; /* Assume newer is compatible */
1236}
1237
/* State for the IGD BAR4 index/data quirk */
typedef struct VFIOIGDQuirk {
    struct VFIOPCIDevice *vdev;
    uint32_t index;  /* latched index-register value; ~0 when invalidated */
    uint32_t bdsm;   /* host BDSM base; guest PTEs are rebased from this */
} VFIOIGDQuirk;

#define IGD_GMCH 0x50 /* Graphics Control Register */
#define IGD_BDSM 0x5c /* Base Data of Stolen Memory */
#define IGD_ASLS 0xfc /* ASL Storage Register */
1247
1248/*
1249 * The OpRegion includes the Video BIOS Table, which seems important for
1250 * telling the driver what sort of outputs it has.  Without this, the device
1251 * may work in the guest, but we may not get output.  This also requires BIOS
1252 * support to reserve and populate a section of guest memory sufficient for
1253 * the table and to write the base address of that memory to the ASLS register
1254 * of the IGD device.
1255 */
/*
 * Copy the host IGD OpRegion (from the vfio device-specific region described
 * by @info) into guest-visible fw_cfg and prepare the ASLS register for the
 * guest firmware to program.  Returns 0 on success, -EINVAL and sets @errp
 * if the OpRegion cannot be read.
 */
int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
                               struct vfio_region_info *info, Error **errp)
{
    int ret;

    /* Snapshot the host OpRegion; a short read is treated as failure */
    vdev->igd_opregion = g_malloc0(info->size);
    ret = pread(vdev->vbasedev.fd, vdev->igd_opregion,
                info->size, info->offset);
    if (ret != info->size) {
        error_setg(errp, "failed to read IGD OpRegion");
        g_free(vdev->igd_opregion);
        vdev->igd_opregion = NULL;
        return -EINVAL;
    }

    /*
     * Provide fw_cfg with a copy of the OpRegion which the VM firmware is to
     * allocate 32bit reserved memory for, copy these contents into, and write
     * the reserved memory base address to the device ASLS register at 0xFC.
     * Alignment of this reserved region seems flexible, but using a 4k page
     * alignment seems to work well.  This interface assumes a single IGD
     * device, which may be at VM address 00:02.0 in legacy mode or another
     * address in UPT mode.
     *
     * NB, there may be future use cases discovered where the VM should have
     * direct interaction with the host OpRegion, in which case the write to
     * the ASLS register would trigger MemoryRegion setup to enable that.
     */
    fw_cfg_add_file(fw_cfg_find(), "etc/igd-opregion",
                    vdev->igd_opregion, info->size);

    trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name);

    /* Zero ASLS and make it fully guest-writable and emulated */
    pci_set_long(vdev->pdev.config + IGD_ASLS, 0);
    pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0);
    pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0);

    return 0;
}
1295
1296/*
1297 * The rather short list of registers that we copy from the host devices.
1298 * The LPC/ISA bridge values are definitely needed to support the vBIOS, the
1299 * host bridge values may or may not be needed depending on the guest OS.
1300 * Since we're only munging revision and subsystem values on the host bridge,
1301 * we don't require our own device.  The LPC/ISA bridge needs to be our very
1302 * own though.
1303 */
/* An (offset, length) pair within config space to copy from the host */
typedef struct {
    uint8_t offset;
    uint8_t len;
} IGDHostInfo;

/* Registers copied from the host's host bridge into the VM's */
static const IGDHostInfo igd_host_bridge_infos[] = {
    {PCI_REVISION_ID,         2},
    {PCI_SUBSYSTEM_VENDOR_ID, 2},
    {PCI_SUBSYSTEM_ID,        2},
};

/* Registers copied from the host's LPC/ISA bridge into our dummy bridge */
static const IGDHostInfo igd_lpc_bridge_infos[] = {
    {PCI_VENDOR_ID,           2},
    {PCI_DEVICE_ID,           2},
    {PCI_REVISION_ID,         2},
    {PCI_SUBSYSTEM_VENDOR_ID, 2},
    {PCI_SUBSYSTEM_ID,        2},
};
1322
1323static int vfio_pci_igd_copy(VFIOPCIDevice *vdev, PCIDevice *pdev,
1324                             struct vfio_region_info *info,
1325                             const IGDHostInfo *list, int len)
1326{
1327    int i, ret;
1328
1329    for (i = 0; i < len; i++) {
1330        ret = pread(vdev->vbasedev.fd, pdev->config + list[i].offset,
1331                    list[i].len, info->offset + list[i].offset);
1332        if (ret != list[i].len) {
1333            error_report("IGD copy failed: %m");
1334            return -errno;
1335        }
1336    }
1337
1338    return 0;
1339}
1340
1341/*
1342 * Stuff a few values into the host bridge.
1343 */
1344static int vfio_pci_igd_host_init(VFIOPCIDevice *vdev,
1345                                  struct vfio_region_info *info)
1346{
1347    PCIBus *bus;
1348    PCIDevice *host_bridge;
1349    int ret;
1350
1351    bus = pci_device_root_bus(&vdev->pdev);
1352    host_bridge = pci_find_device(bus, 0, PCI_DEVFN(0, 0));
1353
1354    if (!host_bridge) {
1355        error_report("Can't find host bridge");
1356        return -ENODEV;
1357    }
1358
1359    ret = vfio_pci_igd_copy(vdev, host_bridge, info, igd_host_bridge_infos,
1360                            ARRAY_SIZE(igd_host_bridge_infos));
1361    if (!ret) {
1362        trace_vfio_pci_igd_host_bridge_enabled(vdev->vbasedev.name);
1363    }
1364
1365    return ret;
1366}
1367
1368/*
1369 * IGD LPC/ISA bridge support code.  The vBIOS needs this, but we can't write
1370 * arbitrary values into just any bridge, so we must create our own.  We try
1371 * to handle if the user has created it for us, which they might want to do
1372 * to enable multifunction so we don't occupy the whole PCI slot.
1373 */
1374static void vfio_pci_igd_lpc_bridge_realize(PCIDevice *pdev, Error **errp)
1375{
1376    if (pdev->devfn != PCI_DEVFN(0x1f, 0)) {
1377        error_setg(errp, "VFIO dummy ISA/LPC bridge must have address 1f.0");
1378    }
1379}
1380
1381static void vfio_pci_igd_lpc_bridge_class_init(ObjectClass *klass, void *data)
1382{
1383    DeviceClass *dc = DEVICE_CLASS(klass);
1384    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1385
1386    set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
1387    dc->desc = "VFIO dummy ISA/LPC bridge for IGD assignment";
1388    dc->hotpluggable = false;
1389    k->realize = vfio_pci_igd_lpc_bridge_realize;
1390    k->class_id = PCI_CLASS_BRIDGE_ISA;
1391}
1392
/* QOM type for the dummy LPC/ISA bridge (conventional PCI device) */
static TypeInfo vfio_pci_igd_lpc_bridge_info = {
    .name = "vfio-pci-igd-lpc-bridge",
    .parent = TYPE_PCI_DEVICE,
    .class_init = vfio_pci_igd_lpc_bridge_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { },
    },
};
1402
/* Register the dummy LPC/ISA bridge type with QOM at module init */
static void vfio_pci_igd_register_types(void)
{
    type_register_static(&vfio_pci_igd_lpc_bridge_info);
}

type_init(vfio_pci_igd_register_types)
1409
1410static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev,
1411                                 struct vfio_region_info *info)
1412{
1413    PCIDevice *lpc_bridge;
1414    int ret;
1415
1416    lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
1417                                 0, PCI_DEVFN(0x1f, 0));
1418    if (!lpc_bridge) {
1419        lpc_bridge = pci_create_simple(pci_device_root_bus(&vdev->pdev),
1420                                 PCI_DEVFN(0x1f, 0), "vfio-pci-igd-lpc-bridge");
1421    }
1422
1423    ret = vfio_pci_igd_copy(vdev, lpc_bridge, info, igd_lpc_bridge_infos,
1424                            ARRAY_SIZE(igd_lpc_bridge_infos));
1425    if (!ret) {
1426        trace_vfio_pci_igd_lpc_bridge_enabled(vdev->vbasedev.name);
1427    }
1428
1429    return ret;
1430}
1431
1432/*
1433 * IGD Gen8 and newer support up to 8MB for the GTT and use a 64bit PTE
1434 * entry, older IGDs use 2MB and 32bit.  Each PTE maps a 4k page.  Therefore
1435 * we either have 2M/4k * 4 = 2k or 8M/4k * 8 = 16k as the maximum iobar index
1436 * for programming the GTT.
1437 *
1438 * See linux:include/drm/i915_drm.h for shift and mask values.
1439 */
1440static int vfio_igd_gtt_max(VFIOPCIDevice *vdev)
1441{
1442    uint32_t gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
1443    int ggms, gen = igd_gen(vdev);
1444
1445    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, sizeof(gmch));
1446    ggms = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
1447    if (gen > 6) {
1448        ggms = 1 << ggms;
1449    }
1450
1451    ggms *= MiB;
1452
1453    return (ggms / (4 * KiB)) * (gen < 8 ? 4 : 8);
1454}
1455
1456/*
1457 * The IGD ROM will make use of stolen memory (GGMS) for support of VESA modes.
1458 * Somehow the host stolen memory range is used for this, but how the ROM gets
1459 * it is a mystery, perhaps it's hardcoded into the ROM.  Thankfully though, it
1460 * reprograms the GTT through the IOBAR where we can trap it and transpose the
1461 * programming to the VM allocated buffer.  That buffer gets reserved by the VM
1462 * firmware via the fw_cfg entry added below.  Here we're just monitoring the
1463 * IOBAR address and data registers to detect a write sequence targeting the
1464 * GTTADR.  This code is developed by observed behavior and doesn't have a
1465 * direct spec reference, unfortunately.
1466 */
1467static uint64_t vfio_igd_quirk_data_read(void *opaque,
1468                                         hwaddr addr, unsigned size)
1469{
1470    VFIOIGDQuirk *igd = opaque;
1471    VFIOPCIDevice *vdev = igd->vdev;
1472
1473    igd->index = ~0;
1474
1475    return vfio_region_read(&vdev->bars[4].region, addr + 4, size);
1476}
1477
/*
 * Write handler for the IGD BAR4 data register (offset 4).  Detects GTT PTE
 * programming (per the latched index) and rebases the PTE's page address
 * from the host stolen memory range to the VM-allocated buffer at the guest
 * BDSM base before forwarding the write to hardware.
 */
static void vfio_igd_quirk_data_write(void *opaque, hwaddr addr,
                                      uint64_t data, unsigned size)
{
    VFIOIGDQuirk *igd = opaque;
    VFIOPCIDevice *vdev = igd->vdev;
    uint64_t val = data;
    int gen = igd_gen(vdev);

    /*
     * Programming the GGMS starts at index 0x1 and uses every 4th index (ie.
     * 0x1, 0x5, 0x9, 0xd,...).  For pre-Gen8 each 4-byte write is a whole PTE
     * entry, with 0th bit enable set.  For Gen8 and up, PTEs are 64bit, so
     * entries 0x5 & 0xd are the high dword, in our case zero.  Each PTE points
     * to a 4k page, which we translate to a page from the VM allocated region,
     * pointed to by the BDSM register.  If this is not set, we fail.
     *
     * We trap writes to the full configured GTT size, but we typically only
     * see the vBIOS writing up to (nearly) the 1MB barrier.  In fact it often
     * seems to miss the last entry for an even 1MB GTT.  Doing a gratuitous
     * write of that last entry does work, but is hopefully unnecessary since
     * we clear the previous GTT on initialization.
     */
    if ((igd->index % 4 == 1) && igd->index < vfio_igd_gtt_max(vdev)) {
        if (gen < 8 || (igd->index % 8 == 1)) {
            uint32_t base;

            /* Guest BDSM base, as programmed by the VM firmware */
            base = pci_get_long(vdev->pdev.config + IGD_BDSM);
            if (!base) {
                hw_error("vfio-igd: Guest attempted to program IGD GTT before "
                         "BIOS reserved stolen memory.  Unsupported BIOS?");
            }

            /* Rebase the PTE from the host stolen range to the guest's */
            val = data - igd->bdsm + base;
        } else {
            val = 0; /* upper 32bits of pte, we only enable below 4G PTEs */
        }

        trace_vfio_pci_igd_bar4_write(vdev->vbasedev.name,
                                      igd->index, data, val);
    }

    vfio_region_write(&vdev->bars[4].region, addr + 4, val, size);

    /* Index is consumed by this write; require a fresh index next time */
    igd->index = ~0;
}
1523
/* MMIO ops for the IGD BAR4 data register (offset 4) */
static const MemoryRegionOps vfio_igd_data_quirk = {
    .read = vfio_igd_quirk_data_read,
    .write = vfio_igd_quirk_data_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1529
1530static uint64_t vfio_igd_quirk_index_read(void *opaque,
1531                                          hwaddr addr, unsigned size)
1532{
1533    VFIOIGDQuirk *igd = opaque;
1534    VFIOPCIDevice *vdev = igd->vdev;
1535
1536    igd->index = ~0;
1537
1538    return vfio_region_read(&vdev->bars[4].region, addr, size);
1539}
1540
1541static void vfio_igd_quirk_index_write(void *opaque, hwaddr addr,
1542                                       uint64_t data, unsigned size)
1543{
1544    VFIOIGDQuirk *igd = opaque;
1545    VFIOPCIDevice *vdev = igd->vdev;
1546
1547    igd->index = data;
1548
1549    vfio_region_write(&vdev->bars[4].region, addr, data, size);
1550}
1551
/* MMIO ops for the IGD BAR4 index register (offset 0) */
static const MemoryRegionOps vfio_igd_index_quirk = {
    .read = vfio_igd_quirk_index_read,
    .write = vfio_igd_quirk_index_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
1557
1558static void vfio_probe_igd_bar4_quirk(VFIOPCIDevice *vdev, int nr)
1559{
1560    struct vfio_region_info *rom = NULL, *opregion = NULL,
1561                            *host = NULL, *lpc = NULL;
1562    VFIOQuirk *quirk;
1563    VFIOIGDQuirk *igd;
1564    PCIDevice *lpc_bridge;
1565    int i, ret, ggms_mb, gms_mb = 0, gen;
1566    uint64_t *bdsm_size;
1567    uint32_t gmch;
1568    uint16_t cmd_orig, cmd;
1569    Error *err = NULL;
1570
1571    /*
1572     * This must be an Intel VGA device at address 00:02.0 for us to even
1573     * consider enabling legacy mode.  The vBIOS has dependencies on the
1574     * PCI bus address.
1575     */
1576    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) ||
1577        !vfio_is_vga(vdev) || nr != 4 ||
1578        &vdev->pdev != pci_find_device(pci_device_root_bus(&vdev->pdev),
1579                                       0, PCI_DEVFN(0x2, 0))) {
1580        return;
1581    }
1582
1583    /*
1584     * We need to create an LPC/ISA bridge at PCI bus address 00:1f.0 that we
1585     * can stuff host values into, so if there's already one there and it's not
1586     * one we can hack on, legacy mode is no-go.  Sorry Q35.
1587     */
1588    lpc_bridge = pci_find_device(pci_device_root_bus(&vdev->pdev),
1589                                 0, PCI_DEVFN(0x1f, 0));
1590    if (lpc_bridge && !object_dynamic_cast(OBJECT(lpc_bridge),
1591                                           "vfio-pci-igd-lpc-bridge")) {
1592        error_report("IGD device %s cannot support legacy mode due to existing "
1593                     "devices at address 1f.0", vdev->vbasedev.name);
1594        return;
1595    }
1596
1597    /*
1598     * IGD is not a standard, they like to change their specs often.  We
 * only attempt to support back to SandyBridge and we hope that newer
1600     * devices maintain compatibility with generation 8.
1601     */
1602    gen = igd_gen(vdev);
1603    if (gen != 6 && gen != 8) {
1604        error_report("IGD device %s is unsupported in legacy mode, "
1605                     "try SandyBridge or newer", vdev->vbasedev.name);
1606        return;
1607    }
1608
1609    /*
1610     * Most of what we're doing here is to enable the ROM to run, so if
1611     * there's no ROM, there's no point in setting up this quirk.
1612     * NB. We only seem to get BIOS ROMs, so a UEFI VM would need CSM support.
1613     */
1614    ret = vfio_get_region_info(&vdev->vbasedev,
1615                               VFIO_PCI_ROM_REGION_INDEX, &rom);
1616    if ((ret || !rom->size) && !vdev->pdev.romfile) {
1617        error_report("IGD device %s has no ROM, legacy mode disabled",
1618                     vdev->vbasedev.name);
1619        goto out;
1620    }
1621
1622    /*
1623     * Ignore the hotplug corner case, mark the ROM failed, we can't
1624     * create the devices we need for legacy mode in the hotplug scenario.
1625     */
1626    if (vdev->pdev.qdev.hotplugged) {
1627        error_report("IGD device %s hotplugged, ROM disabled, "
1628                     "legacy mode disabled", vdev->vbasedev.name);
1629        vdev->rom_read_failed = true;
1630        goto out;
1631    }
1632
1633    /*
1634     * Check whether we have all the vfio device specific regions to
1635     * support legacy mode (added in Linux v4.6).  If not, bail.
1636     */
1637    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1638                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1639                        VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
1640    if (ret) {
1641        error_report("IGD device %s does not support OpRegion access,"
1642                     "legacy mode disabled", vdev->vbasedev.name);
1643        goto out;
1644    }
1645
1646    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1647                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1648                        VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &host);
1649    if (ret) {
1650        error_report("IGD device %s does not support host bridge access,"
1651                     "legacy mode disabled", vdev->vbasedev.name);
1652        goto out;
1653    }
1654
1655    ret = vfio_get_dev_region_info(&vdev->vbasedev,
1656                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
1657                        VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &lpc);
1658    if (ret) {
1659        error_report("IGD device %s does not support LPC bridge access,"
1660                     "legacy mode disabled", vdev->vbasedev.name);
1661        goto out;
1662    }
1663
1664    gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4);
1665
1666    /*
1667     * If IGD VGA Disable is clear (expected) and VGA is not already enabled,
1668     * try to enable it.  Probably shouldn't be using legacy mode without VGA,
1669     * but also no point in us enabling VGA if disabled in hardware.
1670     */
1671    if (!(gmch & 0x2) && !vdev->vga && vfio_populate_vga(vdev, &err)) {
1672        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
1673        error_report("IGD device %s failed to enable VGA access, "
1674                     "legacy mode disabled", vdev->vbasedev.name);
1675        goto out;
1676    }
1677
1678    /* Create our LPC/ISA bridge */
1679    ret = vfio_pci_igd_lpc_init(vdev, lpc);
1680    if (ret) {
1681        error_report("IGD device %s failed to create LPC bridge, "
1682                     "legacy mode disabled", vdev->vbasedev.name);
1683        goto out;
1684    }
1685
1686    /* Stuff some host values into the VM PCI host bridge */
1687    ret = vfio_pci_igd_host_init(vdev, host);
1688    if (ret) {
1689        error_report("IGD device %s failed to modify host bridge, "
1690                     "legacy mode disabled", vdev->vbasedev.name);
1691        goto out;
1692    }
1693
1694    /* Setup OpRegion access */
1695    ret = vfio_pci_igd_opregion_init(vdev, opregion, &err);
1696    if (ret) {
1697        error_append_hint(&err, "IGD legacy mode disabled\n");
1698        error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
1699        goto out;
1700    }
1701
1702    /* Setup our quirk to munge GTT addresses to the VM allocated buffer */
1703    quirk = vfio_quirk_alloc(2);
1704    igd = quirk->data = g_malloc0(sizeof(*igd));
1705    igd->vdev = vdev;
1706    igd->index = ~0;
1707    igd->bdsm = vfio_pci_read_config(&vdev->pdev, IGD_BDSM, 4);
1708    igd->bdsm &= ~((1 * MiB) - 1); /* 1MB aligned */
1709
1710    memory_region_init_io(&quirk->mem[0], OBJECT(vdev), &vfio_igd_index_quirk,
1711                          igd, "vfio-igd-index-quirk", 4);
1712    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1713                                        0, &quirk->mem[0], 1);
1714
1715    memory_region_init_io(&quirk->mem[1], OBJECT(vdev), &vfio_igd_data_quirk,
1716                          igd, "vfio-igd-data-quirk", 4);
1717    memory_region_add_subregion_overlap(vdev->bars[nr].region.mem,
1718                                        4, &quirk->mem[1], 1);
1719
1720    QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next);
1721
1722    /* Determine the size of stolen memory needed for GTT */
1723    ggms_mb = (gmch >> (gen < 8 ? 8 : 6)) & 0x3;
1724    if (gen > 6) {
1725        ggms_mb = 1 << ggms_mb;
1726    }
1727
1728    /*
     * Assume we have no GMS memory, but allow it to be overridden by device
     * option (experimental).  The spec doesn't actually allow zero GMS when
     * IVD (IGD VGA Disable) is clear, but the claim is that it's unused,
1732     * so let's not waste VM memory for it.
1733     */
1734    gmch &= ~((gen < 8 ? 0x1f : 0xff) << (gen < 8 ? 3 : 8));
1735
1736    if (vdev->igd_gms) {
1737        if (vdev->igd_gms <= 0x10) {
1738            gms_mb = vdev->igd_gms * 32;
1739            gmch |= vdev->igd_gms << (gen < 8 ? 3 : 8);
1740        } else {
1741            error_report("Unsupported IGD GMS value 0x%x", vdev->igd_gms);
1742            vdev->igd_gms = 0;
1743        }
1744    }
1745
1746    /*
1747     * Request reserved memory for stolen memory via fw_cfg.  VM firmware
1748     * must allocate a 1MB aligned reserved memory region below 4GB with
1749     * the requested size (in bytes) for use by the Intel PCI class VGA
1750     * device at VM address 00:02.0.  The base address of this reserved
     * memory region must be written to the device BDSM register at PCI
1752     * config offset 0x5C.
1753     */
1754    bdsm_size = g_malloc(sizeof(*bdsm_size));
1755    *bdsm_size = cpu_to_le64((ggms_mb + gms_mb) * MiB);
1756    fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size",
1757                    bdsm_size, sizeof(*bdsm_size));
1758
1759    /* GMCH is read-only, emulated */
1760    pci_set_long(vdev->pdev.config + IGD_GMCH, gmch);
1761    pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0);
1762    pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0);
1763
1764    /* BDSM is read-write, emulated.  The BIOS needs to be able to write it */
1765    pci_set_long(vdev->pdev.config + IGD_BDSM, 0);
1766    pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0);
1767    pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0);
1768
1769    /*
1770     * This IOBAR gives us access to GTTADR, which allows us to write to
1771     * the GTT itself.  So let's go ahead and write zero to all the GTT
1772     * entries to avoid spurious DMA faults.  Be sure I/O access is enabled
1773     * before talking to the device.
1774     */
1775    if (pread(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
1776              vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
1777        error_report("IGD device %s - failed to read PCI command register",
1778                     vdev->vbasedev.name);
1779    }
1780
1781    cmd = cmd_orig | PCI_COMMAND_IO;
1782
1783    if (pwrite(vdev->vbasedev.fd, &cmd, sizeof(cmd),
1784               vdev->config_offset + PCI_COMMAND) != sizeof(cmd)) {
1785        error_report("IGD device %s - failed to write PCI command register",
1786                     vdev->vbasedev.name);
1787    }
1788
1789    for (i = 1; i < vfio_igd_gtt_max(vdev); i += 4) {
1790        vfio_region_write(&vdev->bars[4].region, 0, i, 4);
1791        vfio_region_write(&vdev->bars[4].region, 4, 0, 4);
1792    }
1793
1794    if (pwrite(vdev->vbasedev.fd, &cmd_orig, sizeof(cmd_orig),
1795               vdev->config_offset + PCI_COMMAND) != sizeof(cmd_orig)) {
1796        error_report("IGD device %s - failed to restore PCI command register",
1797                     vdev->vbasedev.name);
1798    }
1799
1800    trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, ggms_mb + gms_mb);
1801
1802out:
1803    g_free(rom);
1804    g_free(opregion);
1805    g_free(host);
1806    g_free(lpc);
1807}
1808
1809/*
1810 * Common quirk probe entry points.
1811 */
/*
 * Probe and install all VGA space quirks for @vdev.  Each probe routine is
 * expected to check device applicability itself and install its quirk only
 * when it matches.
 */
void vfio_vga_quirk_setup(VFIOPCIDevice *vdev)
{
    vfio_vga_probe_ati_3c3_quirk(vdev);
    vfio_vga_probe_nvidia_3d0_quirk(vdev);
}
1817
1818void vfio_vga_quirk_exit(VFIOPCIDevice *vdev)
1819{
1820    VFIOQuirk *quirk;
1821    int i, j;
1822
1823    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1824        QLIST_FOREACH(quirk, &vdev->vga->region[i].quirks, next) {
1825            for (j = 0; j < quirk->nr_mem; j++) {
1826                memory_region_del_subregion(&vdev->vga->region[i].mem,
1827                                            &quirk->mem[j]);
1828            }
1829        }
1830    }
1831}
1832
1833void vfio_vga_quirk_finalize(VFIOPCIDevice *vdev)
1834{
1835    int i, j;
1836
1837    for (i = 0; i < ARRAY_SIZE(vdev->vga->region); i++) {
1838        while (!QLIST_EMPTY(&vdev->vga->region[i].quirks)) {
1839            VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga->region[i].quirks);
1840            QLIST_REMOVE(quirk, next);
1841            for (j = 0; j < quirk->nr_mem; j++) {
1842                object_unparent(OBJECT(&quirk->mem[j]));
1843            }
1844            g_free(quirk->mem);
1845            g_free(quirk->data);
1846            g_free(quirk);
1847        }
1848    }
1849}
1850
/*
 * Probe and install all BAR quirks applicable to BAR @nr of @vdev.  Each
 * probe routine is expected to check vendor/device/BAR applicability itself
 * (e.g. vfio_probe_igd_bar4_quirk above only engages for Intel VGA at
 * address 00:02.0, BAR 4).
 */
void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr)
{
    vfio_probe_ati_bar4_quirk(vdev, nr);
    vfio_probe_ati_bar2_quirk(vdev, nr);
    vfio_probe_nvidia_bar5_quirk(vdev, nr);
    vfio_probe_nvidia_bar0_quirk(vdev, nr);
    vfio_probe_rtl8168_bar2_quirk(vdev, nr);
    vfio_probe_igd_bar4_quirk(vdev, nr);
}
1860
1861void vfio_bar_quirk_exit(VFIOPCIDevice *vdev, int nr)
1862{
1863    VFIOBAR *bar = &vdev->bars[nr];
1864    VFIOQuirk *quirk;
1865    int i;
1866
1867    QLIST_FOREACH(quirk, &bar->quirks, next) {
1868        while (!QLIST_EMPTY(&quirk->ioeventfds)) {
1869            vfio_ioeventfd_exit(vdev, QLIST_FIRST(&quirk->ioeventfds));
1870        }
1871
1872        for (i = 0; i < quirk->nr_mem; i++) {
1873            memory_region_del_subregion(bar->region.mem, &quirk->mem[i]);
1874        }
1875    }
1876}
1877
1878void vfio_bar_quirk_finalize(VFIOPCIDevice *vdev, int nr)
1879{
1880    VFIOBAR *bar = &vdev->bars[nr];
1881    int i;
1882
1883    while (!QLIST_EMPTY(&bar->quirks)) {
1884        VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks);
1885        QLIST_REMOVE(quirk, next);
1886        for (i = 0; i < quirk->nr_mem; i++) {
1887            object_unparent(OBJECT(&quirk->mem[i]));
1888        }
1889        g_free(quirk->mem);
1890        g_free(quirk->data);
1891        g_free(quirk);
1892    }
1893}
1894
1895/*
1896 * Reset quirks
1897 */
1898void vfio_quirk_reset(VFIOPCIDevice *vdev)
1899{
1900    int i;
1901
1902    for (i = 0; i < PCI_ROM_SLOT; i++) {
1903        VFIOQuirk *quirk;
1904        VFIOBAR *bar = &vdev->bars[i];
1905
1906        QLIST_FOREACH(quirk, &bar->quirks, next) {
1907            if (quirk->reset) {
1908                quirk->reset(vdev, quirk);
1909            }
1910        }
1911    }
1912}
1913
1914/*
1915 * AMD Radeon PCI config reset, based on Linux:
1916 *   drivers/gpu/drm/radeon/ci_smc.c:ci_is_smc_running()
1917 *   drivers/gpu/drm/radeon/radeon_device.c:radeon_pci_config_reset
1918 *   drivers/gpu/drm/radeon/ci_smc.c:ci_reset_smc()
1919 *   drivers/gpu/drm/radeon/ci_smc.c:ci_stop_smc_clock()
1920 * IDs: include/drm/drm_pciids.h
1921 * Registers: http://cgit.freedesktop.org/~agd5f/linux/commit/?id=4e2aa447f6f0
1922 *
1923 * Bonaire and Hawaii GPUs do not respond to a bus reset.  This is a bug in the
1924 * hardware that should be fixed on future ASICs.  The symptom of this is that
 * once the accelerated driver loads, Windows guests will bsod on subsequent
 * attempts to load the driver, such as after VM reset or shutdown/restart.  To
1927 * work around this, we do an AMD specific PCI config reset, followed by an SMC
1928 * reset.  The PCI config reset only works if SMC firmware is running, so we
1929 * have a dependency on the state of the device as to whether this reset will
1930 * be effective.  There are still cases where we won't be able to kick the
1931 * device into working, but this greatly improves the usability overall.  The
1932 * config reset magic is relatively common on AMD GPUs, but the setup and SMC
1933 * poking is largely ASIC specific.
1934 */
/*
 * Check whether SMC firmware is loaded and running, mirroring Linux's
 * ci_is_smc_running() (referenced above): the SMC clock must not be
 * disabled and the SMC program counter must have advanced past its
 * initial value.
 */
static bool vfio_radeon_smc_is_running(VFIOPCIDevice *vdev)
{
    uint32_t clk, pc_c;

    /*
     * Registers 200h and 204h are index and data registers for accessing
     * indirect configuration registers within the device.
     */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    clk = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000370, 4);
    pc_c = vfio_region_read(&vdev->bars[5].region, 0x204, 4);

    /* Running when the clock-disable bit is clear and the PC has advanced */
    return (!(clk & 1) && (0x20100 <= pc_c));
}
1950
1951/*
1952 * The scope of a config reset is controlled by a mode bit in the misc register
1953 * and a fuse, exposed as a bit in another register.  The fuse is the default
 * (0 = GFX, 1 = whole GPU), the misc bit is a toggle, with the formula
1955 * scope = !(misc ^ fuse), where the resulting scope is defined the same as
1956 * the fuse.  A truth table therefore tells us that if misc == fuse, we need
1957 * to flip the value of the bit in the misc register.
1958 */
/*
 * Flip the reset-scope toggle, if necessary, so that the subsequent config
 * reset affects only the GFX function (see the truth table discussion
 * above).
 */
static void vfio_radeon_set_gfx_only_reset(VFIOPCIDevice *vdev)
{
    uint32_t misc, fuse;
    bool a, b;

    /* Read the fuse bit (bit 6) via the index/data pair at 200h/204h */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc00c0000, 4);
    fuse = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    b = fuse & 64;

    /* Read the misc toggle bit (bit 1) */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0xc0000010, 4);
    misc = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    a = misc & 2;

    /* misc == fuse would select whole-GPU scope; flip misc for GFX only */
    if (a == b) {
        vfio_region_write(&vdev->bars[5].region, 0x204, misc ^ 2, 4);
        vfio_region_read(&vdev->bars[5].region, 0x204, 4); /* flush */
    }
}
1977
/*
 * Device-specific reset for Bonaire/Hawaii: an AMD PCI config reset
 * followed by an SMC reset, per the Linux radeon references cited above.
 * Returns 0 on success, -ENODEV when a kernel-provided reset should be
 * used instead, -EINVAL when SMC firmware isn't running (the config reset
 * depends on it).
 */
static int vfio_radeon_reset(VFIOPCIDevice *vdev)
{
    PCIDevice *pdev = &vdev->pdev;
    int i, ret = 0;
    uint32_t data;

    /* Defer to a kernel implemented reset */
    if (vdev->vbasedev.reset_works) {
        trace_vfio_quirk_ati_bonaire_reset_skipped(vdev->vbasedev.name);
        return -ENODEV;
    }

    /* Enable only memory BAR access */
    vfio_pci_write_config(pdev, PCI_COMMAND, PCI_COMMAND_MEMORY, 2);

    /* Reset only works if SMC firmware is loaded and running */
    if (!vfio_radeon_smc_is_running(vdev)) {
        ret = -EINVAL;
        trace_vfio_quirk_ati_bonaire_reset_no_smc(vdev->vbasedev.name);
        goto out;
    }

    /* Make sure only the GFX function is reset */
    vfio_radeon_set_gfx_only_reset(vdev);

    /* AMD PCI config reset */
    vfio_pci_write_config(pdev, 0x7c, 0x39d5e86b, 4);
    usleep(100);

    /*
     * Read back the memory size to make sure we're out of reset; an
     * all-ones read presumably indicates the device is still resetting —
     * poll up to ~100ms.
     */
    for (i = 0; i < 100000; i++) {
        if (vfio_region_read(&vdev->bars[5].region, 0x5428, 4) != 0xffffffff) {
            goto reset_smc;
        }
        usleep(1);
    }

    trace_vfio_quirk_ati_bonaire_reset_timeout(vdev->vbasedev.name);

reset_smc:
    /* Reset SMC: set the reset bit via the 200h/204h index/data pair */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000000, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    /* Disable SMC clock */
    vfio_region_write(&vdev->bars[5].region, 0x200, 0x80000004, 4);
    data = vfio_region_read(&vdev->bars[5].region, 0x204, 4);
    data |= 1;
    vfio_region_write(&vdev->bars[5].region, 0x204, data, 4);

    trace_vfio_quirk_ati_bonaire_reset_done(vdev->vbasedev.name);

out:
    /* Restore PCI command register */
    vfio_pci_write_config(pdev, PCI_COMMAND, 0, 2);

    return ret;
}
2038
2039void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
2040{
2041    switch (vdev->vendor_id) {
2042    case 0x1002:
2043        switch (vdev->device_id) {
2044        /* Bonaire */
2045        case 0x6649: /* Bonaire [FirePro W5100] */
2046        case 0x6650:
2047        case 0x6651:
2048        case 0x6658: /* Bonaire XTX [Radeon R7 260X] */
2049        case 0x665c: /* Bonaire XT [Radeon HD 7790/8770 / R9 260 OEM] */
2050        case 0x665d: /* Bonaire [Radeon R7 200 Series] */
2051        /* Hawaii */
2052        case 0x67A0: /* Hawaii XT GL [FirePro W9100] */
2053        case 0x67A1: /* Hawaii PRO GL [FirePro W8100] */
2054        case 0x67A2:
2055        case 0x67A8:
2056        case 0x67A9:
2057        case 0x67AA:
2058        case 0x67B0: /* Hawaii XT [Radeon R9 290X] */
2059        case 0x67B1: /* Hawaii PRO [Radeon R9 290] */
2060        case 0x67B8:
2061        case 0x67B9:
2062        case 0x67BA:
2063        case 0x67BE:
2064            vdev->resetfn = vfio_radeon_reset;
2065            trace_vfio_quirk_ati_bonaire_reset(vdev->vbasedev.name);
2066            break;
2067        }
2068        break;
2069    }
2070}
2071
2072/*
2073 * The NVIDIA GPUDirect P2P Vendor capability allows the user to specify
2074 * devices as a member of a clique.  Devices within the same clique ID
2075 * are capable of direct P2P.  It's the user's responsibility that this
2076 * is correct.  The spec says that this may reside at any unused config
2077 * offset, but reserves and recommends hypervisors place this at C8h.
2078 * The spec also states that the hypervisor should place this capability
2079 * at the end of the capability list, thus next is defined as 0h.
2080 *
2081 * +----------------+----------------+----------------+----------------+
2082 * | sig 7:0 ('P')  |  vndr len (8h) |    next (0h)   |   cap id (9h)  |
2083 * +----------------+----------------+----------------+----------------+
2084 * | rsvd 15:7(0h),id 6:3,ver 2:0(0h)|          sig 23:8 ('P2')        |
2085 * +---------------------------------+---------------------------------+
2086 *
2087 * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
2088 */
/*
 * qdev property getter for the GPUDirect clique ID: visit the uint8 that
 * backs the property.
 */
static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    DeviceState *dev = DEVICE(obj);
    Property *prop = opaque;
    uint8_t *ptr = qdev_get_prop_ptr(dev, prop);

    visit_type_uint8(v, name, ptr, errp);
}
2099
2100static void set_nv_gpudirect_clique_id(Object *obj, Visitor *v,
2101                                       const char *name, void *opaque,
2102                                       Error **errp)
2103{
2104    DeviceState *dev = DEVICE(obj);
2105    Property *prop = opaque;
2106    uint8_t value, *ptr = qdev_get_prop_ptr(dev, prop);
2107    Error *local_err = NULL;
2108
2109    if (dev->realized) {
2110        qdev_prop_set_after_realize(dev, name, errp);
2111        return;
2112    }
2113
2114    visit_type_uint8(v, name, &value, &local_err);
2115    if (local_err) {
2116        error_propagate(errp, local_err);
2117        return;
2118    }
2119
2120    if (value & ~0xF) {
2121        error_setg(errp, "Property %s: valid range 0-15", name);
2122        return;
2123    }
2124
2125    *ptr = value;
2126}
2127
/* qdev property type for the 4-bit NVIDIA GPUDirect clique ID */
const PropertyInfo qdev_prop_nv_gpudirect_clique = {
    .name = "uint4",
    .description = "NVIDIA GPUDirect Clique ID (0 - 15)",
    .get = get_nv_gpudirect_clique_id,
    .set = set_nv_gpudirect_clique_id,
};
2134
2135static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
2136{
2137    PCIDevice *pdev = &vdev->pdev;
2138    int ret, pos = 0xC8;
2139
2140    if (vdev->nv_gpudirect_clique == 0xFF) {
2141        return 0;
2142    }
2143
2144    if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
2145        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid device vendor");
2146        return -EINVAL;
2147    }
2148
2149    if (pci_get_byte(pdev->config + PCI_CLASS_DEVICE + 1) !=
2150        PCI_BASE_CLASS_DISPLAY) {
2151        error_setg(errp, "NVIDIA GPUDirect Clique ID: unsupported PCI class");
2152        return -EINVAL;
2153    }
2154
2155    ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
2156    if (ret < 0) {
2157        error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
2158        return ret;
2159    }
2160
2161    memset(vdev->emulated_config_bits + pos, 0xFF, 8);
2162    pos += PCI_CAP_FLAGS;
2163    pci_set_byte(pdev->config + pos++, 8);
2164    pci_set_byte(pdev->config + pos++, 'P');
2165    pci_set_byte(pdev->config + pos++, '2');
2166    pci_set_byte(pdev->config + pos++, 'P');
2167    pci_set_byte(pdev->config + pos++, vdev->nv_gpudirect_clique << 3);
2168    pci_set_byte(pdev->config + pos, 0);
2169
2170    return 0;
2171}
2172
2173int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
2174{
2175    int ret;
2176
2177    ret = vfio_add_nv_gpudirect_cap(vdev, errp);
2178    if (ret) {
2179        return ret;
2180    }
2181
2182    return 0;
2183}
2184
/*
 * Property getter for "nvlink2-tgt": returns the target address that was
 * stashed in @opaque when the property was registered.
 */
static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
                                     const char *name,
                                     void *opaque, Error **errp)
{
    uint64_t tgt = (uintptr_t) opaque;
    visit_type_uint64(v, name, &tgt, errp);
}
2192
/*
 * Property getter for "nvlink2-link-speed": returns the link speed that was
 * stashed in @opaque when the property was registered.
 */
static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
                                                 const char *name,
                                                 void *opaque, Error **errp)
{
    uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
    visit_type_uint32(v, name, &link_speed, errp);
}
2200
/*
 * Map the NVIDIA NVLink2 GPU RAM vfio device-specific region, expose it as
 * a RAM MemoryRegion quirk on BAR 0, and publish an "nvlink2-tgt" property
 * carrying the target address from the region's SSATGT capability.
 * Returns 0 on success or a negative errno (including when the device
 * simply lacks the region).  NOTE(review): @errp is accepted but never set
 * here — callers see only the errno-style return.
 */
int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
{
    int ret;
    void *p;
    struct vfio_region_info *nv2reg = NULL;
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_nvlink2_ssatgt *cap;
    VFIOQuirk *quirk;

    /* Absence of the region means there's no NVLink2 GPU RAM to expose */
    ret = vfio_get_dev_region_info(&vdev->vbasedev,
                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
                                   PCI_VENDOR_ID_NVIDIA,
                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
                                   &nv2reg);
    if (ret) {
        return ret;
    }

    /* The SSATGT capability supplies the target address for the property */
    hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    cap = (void *) hdr;

    /* Map the whole region so it can be handed to the VM as RAM */
    p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
    if (p == MAP_FAILED) {
        ret = -errno;
        goto free_exit;
    }

    quirk = vfio_quirk_alloc(1);
    memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
                               nv2reg->size, p);
    QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);

    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
                        (void *) (uintptr_t) cap->tgt, NULL);
    trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
                                          nv2reg->size);
free_exit:
    g_free(nv2reg);

    return ret;
}
2248
/*
 * Initialize the IBM NVLink2 ATSD quirk: map the ATSD register region
 * (when one is assigned) as a RAM-device MemoryRegion on BAR 0, and publish
 * "nvlink2-tgt" and "nvlink2-link-speed" properties from the region's
 * SSATGT and LNKSPD capabilities.  Returns 0 on success or a negative
 * errno (including when the device lacks the region).  NOTE(review):
 * @errp is accepted but never set here.
 */
int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
{
    int ret;
    void *p;
    struct vfio_region_info *atsdreg = NULL;
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
    struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
    VFIOQuirk *quirk;

    /* Absence of the region means no NVLink2 bridge to configure */
    ret = vfio_get_dev_region_info(&vdev->vbasedev,
                                   VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
                                   PCI_VENDOR_ID_IBM,
                                   VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
                                   &atsdreg);
    if (ret) {
        return ret;
    }

    /* Both capabilities are required to publish the properties below */
    hdr = vfio_get_region_info_cap(atsdreg,
                                   VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    captgt = (void *) hdr;

    hdr = vfio_get_region_info_cap(atsdreg,
                                   VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
    if (!hdr) {
        ret = -ENODEV;
        goto free_exit;
    }
    capspeed = (void *) hdr;

    /* Some NVLink bridges may not have assigned ATSD */
    if (atsdreg->size) {
        p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
                 MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
        if (p == MAP_FAILED) {
            ret = -errno;
            goto free_exit;
        }

        quirk = vfio_quirk_alloc(1);
        memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
                                          "nvlink2-atsd-mr", atsdreg->size, p);
        QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
    }

    object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
                        vfio_pci_nvlink2_get_tgt, NULL, NULL,
                        (void *) (uintptr_t) captgt->tgt, NULL);
    trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
                                              atsdreg->size);

    object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
                        vfio_pci_nvlink2_get_link_speed, NULL, NULL,
                        (void *) (uintptr_t) capspeed->link_speed, NULL);
    trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
                                              capspeed->link_speed);
free_exit:
    g_free(atsdreg);

    return ret;
}
2315