linux/drivers/vfio/pci/vfio_pci_core.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   4 *     Author: Alex Williamson <alex.williamson@redhat.com>
   5 *
   6 * Derived from original vfio:
   7 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
   8 * Author: Tom Lyon, pugs@cisco.com
   9 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/device.h>
  14#include <linux/eventfd.h>
  15#include <linux/file.h>
  16#include <linux/interrupt.h>
  17#include <linux/iommu.h>
  18#include <linux/module.h>
  19#include <linux/mutex.h>
  20#include <linux/notifier.h>
  21#include <linux/pci.h>
  22#include <linux/pm_runtime.h>
  23#include <linux/slab.h>
  24#include <linux/types.h>
  25#include <linux/uaccess.h>
  26#include <linux/vgaarb.h>
  27#include <linux/nospec.h>
  28#include <linux/sched/mm.h>
  29
  30#include <linux/vfio_pci_core.h>
  31
  32#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  33#define DRIVER_DESC "core driver for VFIO based PCI devices"
  34
  35static bool nointxmask;
  36static bool disable_vga;
  37static bool disable_idle_d3;
  38
  39static inline bool vfio_vga_disabled(void)
  40{
  41#ifdef CONFIG_VFIO_PCI_VGA
  42        return disable_vga;
  43#else
  44        return true;
  45#endif
  46}
  47
  48/*
  49 * Our VGA arbiter participation is limited since we don't know anything
  50 * about the device itself.  However, if the device is the only VGA device
  51 * downstream of a bridge and VFIO VGA support is disabled, then we can
  52 * safely return legacy VGA IO and memory as not decoded since the user
  53 * has no way to get to it and routing can be disabled externally at the
  54 * bridge.
  55 */
  56static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
  57{
  58        struct pci_dev *tmp = NULL;
  59        unsigned char max_busnr;
  60        unsigned int decodes;
  61
  62        if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
  63                return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
  64                       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
  65
  66        max_busnr = pci_bus_max_busnr(pdev->bus);
  67        decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
  68
  69        while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
  70                if (tmp == pdev ||
  71                    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
  72                    pci_is_root_bus(tmp->bus))
  73                        continue;
  74
  75                if (tmp->bus->number >= pdev->bus->number &&
  76                    tmp->bus->number <= max_busnr) {
  77                        pci_dev_put(tmp);
  78                        decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
  79                        break;
  80                }
  81        }
  82
  83        return decodes;
  84}
  85
  86static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
  87{
  88        struct resource *res;
  89        int i;
  90        struct vfio_pci_dummy_resource *dummy_res;
  91
  92        for (i = 0; i < PCI_STD_NUM_BARS; i++) {
  93                int bar = i + PCI_STD_RESOURCES;
  94
  95                res = &vdev->pdev->resource[bar];
  96
  97                if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
  98                        goto no_mmap;
  99
 100                if (!(res->flags & IORESOURCE_MEM))
 101                        goto no_mmap;
 102
 103                /*
 104                 * The PCI core shouldn't set up a resource with a
 105                 * type but zero size. But there may be bugs that
 106                 * cause us to do that.
 107                 */
 108                if (!resource_size(res))
 109                        goto no_mmap;
 110
 111                if (resource_size(res) >= PAGE_SIZE) {
 112                        vdev->bar_mmap_supported[bar] = true;
 113                        continue;
 114                }
 115
 116                if (!(res->start & ~PAGE_MASK)) {
 117                        /*
 118                         * Add a dummy resource to reserve the remainder
  119                         * of the exclusive page in case a hot-added
  120                         * device's BAR is assigned into it.
 121                         */
 122                        dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
 123                        if (dummy_res == NULL)
 124                                goto no_mmap;
 125
 126                        dummy_res->resource.name = "vfio sub-page reserved";
 127                        dummy_res->resource.start = res->end + 1;
 128                        dummy_res->resource.end = res->start + PAGE_SIZE - 1;
 129                        dummy_res->resource.flags = res->flags;
 130                        if (request_resource(res->parent,
 131                                                &dummy_res->resource)) {
 132                                kfree(dummy_res);
 133                                goto no_mmap;
 134                        }
 135                        dummy_res->index = bar;
 136                        list_add(&dummy_res->res_next,
 137                                        &vdev->dummy_resources_list);
 138                        vdev->bar_mmap_supported[bar] = true;
 139                        continue;
 140                }
 141                /*
  142                 * We don't handle the case where the BAR is not page
  143                 * aligned, because we can't expect the BAR to be
  144                 * assigned to the same offset within a page in the
  145                 * guest when we pass it through, and userspace has no
  146                 * way to learn the BAR's offset within the page in
  147                 * order to access it.
 148                 */
 149no_mmap:
 150                vdev->bar_mmap_supported[bar] = false;
 151        }
 152}
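
     /*
      * Worked example for the sub-page BAR handling above (illustrative,
      * assuming 4K pages): a 256-byte BAR at 0xf0000000 is page aligned,
      * so a "vfio sub-page reserved" dummy resource claims
      * 0xf0000100-0xf0000fff and the BAR is reported as mmap capable,
      * while the same BAR at 0xf0000800 is left with
      * bar_mmap_supported = false because it does not start on a page
      * boundary.
      */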
 153
 154struct vfio_pci_group_info;
 155static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
 156static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 157                                      struct vfio_pci_group_info *groups);
 158
 159/*
 160 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
  161 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
  162 * If a device implements the former but not the latter, we would typically
  163 * expect broken_intx_masking to be set and require an exclusive interrupt.
  164 * However, since we do have control of the device's ability to assert INTx,
 165 * we can instead pretend that the device does not implement INTx, virtualizing
 166 * the pin register to report zero and maintaining DisINTx set on the host.
 167 */
 168static bool vfio_pci_nointx(struct pci_dev *pdev)
 169{
 170        switch (pdev->vendor) {
 171        case PCI_VENDOR_ID_INTEL:
 172                switch (pdev->device) {
 173                /* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
 174                case 0x1572:
 175                case 0x1574:
 176                case 0x1580 ... 0x1581:
 177                case 0x1583 ... 0x158b:
 178                case 0x37d0 ... 0x37d2:
 179                /* X550 */
 180                case 0x1563:
 181                        return true;
 182                default:
 183                        return false;
 184                }
 185        }
 186
 187        return false;
 188}
 189
 190static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
 191{
 192        struct pci_dev *pdev = vdev->pdev;
 193        u16 pmcsr;
 194
 195        if (!pdev->pm_cap)
 196                return;
 197
 198        pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
 199
 200        vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
 201}
 202
 203/*
 204 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 205 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
 206 * restore when returned to D0.  Saved separately from pci_saved_state for use
 207 * by PM capability emulation and separately from pci_dev internal saved state
 208 * to avoid it being overwritten and consumed around other resets.
 209 */
 210int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
 211{
 212        struct pci_dev *pdev = vdev->pdev;
 213        bool needs_restore = false, needs_save = false;
 214        int ret;
 215
 216        if (vdev->needs_pm_restore) {
 217                if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
 218                        pci_save_state(pdev);
 219                        needs_save = true;
 220                }
 221
 222                if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
 223                        needs_restore = true;
 224        }
 225
 226        ret = pci_set_power_state(pdev, state);
 227
 228        if (!ret) {
 229                /* D3 might be unsupported via quirk, skip unless in D3 */
 230                if (needs_save && pdev->current_state >= PCI_D3hot) {
 231                        vdev->pm_save = pci_store_saved_state(pdev);
 232                } else if (needs_restore) {
 233                        pci_load_and_free_saved_state(pdev, &vdev->pm_save);
 234                        pci_restore_state(pdev);
 235                }
 236        }
 237
 238        return ret;
 239}
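
     /*
      * Illustrative call sequence for the wrapper above: for a device with
      * needs_pm_restore set, vfio_pci_set_power_state(vdev, PCI_D3hot)
      * saves config space into vdev->pm_save before entering D3hot, and a
      * later vfio_pci_set_power_state(vdev, PCI_D0) reloads and restores
      * that state, hiding the soft reset the device performs on the
      * D3->D0 transition from the user.
      */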
 240
 241int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
 242{
 243        struct pci_dev *pdev = vdev->pdev;
 244        int ret;
 245        u16 cmd;
 246        u8 msix_pos;
 247
 248        vfio_pci_set_power_state(vdev, PCI_D0);
 249
 250        /* Don't allow our initial saved state to include busmaster */
 251        pci_clear_master(pdev);
 252
 253        ret = pci_enable_device(pdev);
 254        if (ret)
 255                return ret;
 256
 257        /* If reset fails because of the device lock, fail this path entirely */
 258        ret = pci_try_reset_function(pdev);
 259        if (ret == -EAGAIN) {
 260                pci_disable_device(pdev);
 261                return ret;
 262        }
 263
 264        vdev->reset_works = !ret;
 265        pci_save_state(pdev);
 266        vdev->pci_saved_state = pci_store_saved_state(pdev);
 267        if (!vdev->pci_saved_state)
 268                pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
 269
 270        if (likely(!nointxmask)) {
 271                if (vfio_pci_nointx(pdev)) {
 272                        pci_info(pdev, "Masking broken INTx support\n");
 273                        vdev->nointx = true;
 274                        pci_intx(pdev, 0);
 275                } else
 276                        vdev->pci_2_3 = pci_intx_mask_supported(pdev);
 277        }
 278
 279        pci_read_config_word(pdev, PCI_COMMAND, &cmd);
 280        if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
 281                cmd &= ~PCI_COMMAND_INTX_DISABLE;
 282                pci_write_config_word(pdev, PCI_COMMAND, cmd);
 283        }
 284
 285        ret = vfio_config_init(vdev);
 286        if (ret) {
 287                kfree(vdev->pci_saved_state);
 288                vdev->pci_saved_state = NULL;
 289                pci_disable_device(pdev);
 290                return ret;
 291        }
 292
 293        msix_pos = pdev->msix_cap;
 294        if (msix_pos) {
 295                u16 flags;
 296                u32 table;
 297
 298                pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
 299                pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
 300
 301                vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
 302                vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
 303                vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
 304        } else
 305                vdev->msix_bar = 0xFF;
 306
 307        if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
 308                vdev->has_vga = true;
 309
 310
 311        return 0;
 312}
 313EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
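
     /*
      * Minimal sketch of how a driver built on this core is expected to use
      * the export above from its .open_device callback (hypothetical
      * function name, mirroring the generic vfio-pci driver):
      *
      *	static int my_vfio_pci_open_device(struct vfio_device *core_vdev)
      *	{
      *		struct vfio_pci_core_device *vdev =
      *			container_of(core_vdev, struct vfio_pci_core_device, vdev);
      *		int ret;
      *
      *		ret = vfio_pci_core_enable(vdev);
      *		if (ret)
      *			return ret;
      *
      *		vfio_pci_core_finish_enable(vdev);
      *		return 0;
      *	}
      */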
 314
 315void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
 316{
 317        struct pci_dev *pdev = vdev->pdev;
 318        struct vfio_pci_dummy_resource *dummy_res, *tmp;
 319        struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
 320        int i, bar;
 321
 322        /* For needs_reset */
 323        lockdep_assert_held(&vdev->vdev.dev_set->lock);
 324
 325        /* Stop the device from further DMA */
 326        pci_clear_master(pdev);
 327
 328        vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
 329                                VFIO_IRQ_SET_ACTION_TRIGGER,
 330                                vdev->irq_type, 0, 0, NULL);
 331
 332        /* Device closed, don't need mutex here */
 333        list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
 334                                 &vdev->ioeventfds_list, next) {
 335                vfio_virqfd_disable(&ioeventfd->virqfd);
 336                list_del(&ioeventfd->next);
 337                kfree(ioeventfd);
 338        }
 339        vdev->ioeventfds_nr = 0;
 340
 341        vdev->virq_disabled = false;
 342
 343        for (i = 0; i < vdev->num_regions; i++)
 344                vdev->region[i].ops->release(vdev, &vdev->region[i]);
 345
 346        vdev->num_regions = 0;
 347        kfree(vdev->region);
 348        vdev->region = NULL; /* don't krealloc a freed pointer */
 349
 350        vfio_config_free(vdev);
 351
 352        for (i = 0; i < PCI_STD_NUM_BARS; i++) {
 353                bar = i + PCI_STD_RESOURCES;
 354                if (!vdev->barmap[bar])
 355                        continue;
 356                pci_iounmap(pdev, vdev->barmap[bar]);
 357                pci_release_selected_regions(pdev, 1 << bar);
 358                vdev->barmap[bar] = NULL;
 359        }
 360
 361        list_for_each_entry_safe(dummy_res, tmp,
 362                                 &vdev->dummy_resources_list, res_next) {
 363                list_del(&dummy_res->res_next);
 364                release_resource(&dummy_res->resource);
 365                kfree(dummy_res);
 366        }
 367
 368        vdev->needs_reset = true;
 369
 370        /*
 371         * If we have saved state, restore it.  If we can reset the device,
 372         * even better.  Resetting with current state seems better than
 373         * nothing, but saving and restoring current state without reset
 374         * is just busy work.
 375         */
 376        if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
 377                pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
 378
 379                if (!vdev->reset_works)
 380                        goto out;
 381
 382                pci_save_state(pdev);
 383        }
 384
 385        /*
 386         * Disable INTx and MSI, presumably to avoid spurious interrupts
 387         * during reset.  Stolen from pci_reset_function()
 388         */
 389        pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
 390
 391        /*
 392         * Try to get the locks ourselves to prevent a deadlock. The
 393         * success of this is dependent on being able to lock the device,
 394         * which is not always possible.
  395         * We cannot use the "try" reset interface here, as it would
  396         * overwrite the previously restored configuration information.
 397         */
 398        if (vdev->reset_works && pci_dev_trylock(pdev)) {
 399                if (!__pci_reset_function_locked(pdev))
 400                        vdev->needs_reset = false;
 401                pci_dev_unlock(pdev);
 402        }
 403
 404        pci_restore_state(pdev);
 405out:
 406        pci_disable_device(pdev);
 407
 408        if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3)
 409                vfio_pci_set_power_state(vdev, PCI_D3hot);
 410}
 411EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
 412
 413static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev)
 414{
 415        struct pci_dev *physfn = pci_physfn(vdev->pdev);
 416        struct vfio_device *pf_dev;
 417
 418        if (!vdev->pdev->is_virtfn)
 419                return NULL;
 420
 421        pf_dev = vfio_device_get_from_dev(&physfn->dev);
 422        if (!pf_dev)
 423                return NULL;
 424
 425        if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) {
 426                vfio_device_put(pf_dev);
 427                return NULL;
 428        }
 429
 430        return container_of(pf_dev, struct vfio_pci_core_device, vdev);
 431}
 432
 433static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val)
 434{
 435        struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev);
 436
 437        if (!pf_vdev)
 438                return;
 439
 440        mutex_lock(&pf_vdev->vf_token->lock);
 441        pf_vdev->vf_token->users += val;
 442        WARN_ON(pf_vdev->vf_token->users < 0);
 443        mutex_unlock(&pf_vdev->vf_token->lock);
 444
 445        vfio_device_put(&pf_vdev->vdev);
 446}
 447
 448void vfio_pci_core_close_device(struct vfio_device *core_vdev)
 449{
 450        struct vfio_pci_core_device *vdev =
 451                container_of(core_vdev, struct vfio_pci_core_device, vdev);
 452
 453        vfio_pci_vf_token_user_add(vdev, -1);
 454        vfio_spapr_pci_eeh_release(vdev->pdev);
 455        vfio_pci_core_disable(vdev);
 456
 457        mutex_lock(&vdev->igate);
 458        if (vdev->err_trigger) {
 459                eventfd_ctx_put(vdev->err_trigger);
 460                vdev->err_trigger = NULL;
 461        }
 462        if (vdev->req_trigger) {
 463                eventfd_ctx_put(vdev->req_trigger);
 464                vdev->req_trigger = NULL;
 465        }
 466        mutex_unlock(&vdev->igate);
 467}
 468EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
 469
 470void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
 471{
 472        vfio_pci_probe_mmaps(vdev);
 473        vfio_spapr_pci_eeh_open(vdev->pdev);
 474        vfio_pci_vf_token_user_add(vdev, 1);
 475}
 476EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
 477
 478static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
 479{
 480        if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
 481                u8 pin;
 482
 483                if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
 484                    vdev->nointx || vdev->pdev->is_virtfn)
 485                        return 0;
 486
 487                pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
 488
 489                return pin ? 1 : 0;
 490        } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
 491                u8 pos;
 492                u16 flags;
 493
 494                pos = vdev->pdev->msi_cap;
 495                if (pos) {
 496                        pci_read_config_word(vdev->pdev,
 497                                             pos + PCI_MSI_FLAGS, &flags);
 498                        return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
 499                }
 500        } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
 501                u8 pos;
 502                u16 flags;
 503
 504                pos = vdev->pdev->msix_cap;
 505                if (pos) {
 506                        pci_read_config_word(vdev->pdev,
 507                                             pos + PCI_MSIX_FLAGS, &flags);
 508
 509                        return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 510                }
 511        } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
 512                if (pci_is_pcie(vdev->pdev))
 513                        return 1;
 514        } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
 515                return 1;
 516        }
 517
 518        return 0;
 519}
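
     /*
      * Worked examples for the counts above (illustrative): an MSI
      * capability whose Multiple Message Capable field, i.e.
      * (flags & PCI_MSI_FLAGS_QMASK) >> 1, reads 3 advertises 1 << 3 = 8
      * vectors; an MSI-X capability with PCI_MSIX_FLAGS_QSIZE bits of 2047
      * reports 2047 + 1 = 2048 table entries.
      */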
 520
 521static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
 522{
 523        (*(int *)data)++;
 524        return 0;
 525}
 526
 527struct vfio_pci_fill_info {
 528        int max;
 529        int cur;
 530        struct vfio_pci_dependent_device *devices;
 531};
 532
 533static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 534{
 535        struct vfio_pci_fill_info *fill = data;
 536        struct iommu_group *iommu_group;
 537
 538        if (fill->cur == fill->max)
 539                return -EAGAIN; /* Something changed, try again */
 540
 541        iommu_group = iommu_group_get(&pdev->dev);
 542        if (!iommu_group)
 543                return -EPERM; /* Cannot reset non-isolated devices */
 544
 545        fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
 546        fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
 547        fill->devices[fill->cur].bus = pdev->bus->number;
 548        fill->devices[fill->cur].devfn = pdev->devfn;
 549        fill->cur++;
 550        iommu_group_put(iommu_group);
 551        return 0;
 552}
 553
 554struct vfio_pci_group_info {
 555        int count;
 556        struct vfio_group **groups;
 557};
 558
 559static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
 560{
 561        for (; pdev; pdev = pdev->bus->self)
 562                if (pdev->bus == slot->bus)
 563                        return (pdev->slot == slot);
 564        return false;
 565}
 566
 567struct vfio_pci_walk_info {
 568        int (*fn)(struct pci_dev *pdev, void *data);
 569        void *data;
 570        struct pci_dev *pdev;
 571        bool slot;
 572        int ret;
 573};
 574
 575static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
 576{
 577        struct vfio_pci_walk_info *walk = data;
 578
 579        if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
 580                walk->ret = walk->fn(pdev, walk->data);
 581
 582        return walk->ret;
 583}
 584
 585static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
 586                                         int (*fn)(struct pci_dev *,
 587                                                   void *data), void *data,
 588                                         bool slot)
 589{
 590        struct vfio_pci_walk_info walk = {
 591                .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
 592        };
 593
 594        pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
 595
 596        return walk.ret;
 597}
 598
 599static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
 600                              struct vfio_info_cap *caps)
 601{
 602        struct vfio_info_cap_header header = {
 603                .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
 604                .version = 1
 605        };
 606
 607        return vfio_info_add_capability(caps, &header, sizeof(header));
 608}
 609
 610int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
 611                                 unsigned int type, unsigned int subtype,
 612                                 const struct vfio_pci_regops *ops,
 613                                 size_t size, u32 flags, void *data)
 614{
 615        struct vfio_pci_region *region;
 616
 617        region = krealloc(vdev->region,
 618                          (vdev->num_regions + 1) * sizeof(*region),
 619                          GFP_KERNEL);
 620        if (!region)
 621                return -ENOMEM;
 622
 623        vdev->region = region;
 624        vdev->region[vdev->num_regions].type = type;
 625        vdev->region[vdev->num_regions].subtype = subtype;
 626        vdev->region[vdev->num_regions].ops = ops;
 627        vdev->region[vdev->num_regions].size = size;
 628        vdev->region[vdev->num_regions].flags = flags;
 629        vdev->region[vdev->num_regions].data = data;
 630
 631        vdev->num_regions++;
 632
 633        return 0;
 634}
 635EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region);
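
     /*
      * Minimal usage sketch (hypothetical names and values): a vendor
      * driver exposing a device specific region might do:
      *
      *	static const struct vfio_pci_regops my_regops = {
      *		.rw		= my_region_rw,
      *		.release	= my_region_release,
      *	};
      *
      *	ret = vfio_pci_register_dev_region(vdev,
      *			VFIO_REGION_TYPE_PCI_VENDOR_TYPE | 0x1234,
      *			1, &my_regops, PAGE_SIZE,
      *			VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE,
      *			NULL);
      *
      * where 0x1234 stands in for the vendor's PCI ID and subtype 1 is a
      * driver defined value reported back through the region info
      * capability chain.
      */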
 636
 637long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 638                unsigned long arg)
 639{
 640        struct vfio_pci_core_device *vdev =
 641                container_of(core_vdev, struct vfio_pci_core_device, vdev);
 642        unsigned long minsz;
 643
 644        if (cmd == VFIO_DEVICE_GET_INFO) {
 645                struct vfio_device_info info;
 646                struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 647                unsigned long capsz;
 648                int ret;
 649
 650                minsz = offsetofend(struct vfio_device_info, num_irqs);
 651
 652                /* For backward compatibility, cannot require this */
  653                capsz = offsetofend(struct vfio_device_info, cap_offset);
 654
 655                if (copy_from_user(&info, (void __user *)arg, minsz))
 656                        return -EFAULT;
 657
 658                if (info.argsz < minsz)
 659                        return -EINVAL;
 660
 661                if (info.argsz >= capsz) {
 662                        minsz = capsz;
 663                        info.cap_offset = 0;
 664                }
 665
 666                info.flags = VFIO_DEVICE_FLAGS_PCI;
 667
 668                if (vdev->reset_works)
 669                        info.flags |= VFIO_DEVICE_FLAGS_RESET;
 670
 671                info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
 672                info.num_irqs = VFIO_PCI_NUM_IRQS;
 673
 674                ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
 675                if (ret && ret != -ENODEV) {
 676                        pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
 677                        return ret;
 678                }
 679
 680                if (caps.size) {
 681                        info.flags |= VFIO_DEVICE_FLAGS_CAPS;
 682                        if (info.argsz < sizeof(info) + caps.size) {
 683                                info.argsz = sizeof(info) + caps.size;
 684                        } else {
 685                                vfio_info_cap_shift(&caps, sizeof(info));
 686                                if (copy_to_user((void __user *)arg +
 687                                                  sizeof(info), caps.buf,
 688                                                  caps.size)) {
 689                                        kfree(caps.buf);
 690                                        return -EFAULT;
 691                                }
 692                                info.cap_offset = sizeof(info);
 693                        }
 694
 695                        kfree(caps.buf);
 696                }
 697
 698                return copy_to_user((void __user *)arg, &info, minsz) ?
 699                        -EFAULT : 0;
 700
 701        } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
 702                struct pci_dev *pdev = vdev->pdev;
 703                struct vfio_region_info info;
 704                struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 705                int i, ret;
 706
 707                minsz = offsetofend(struct vfio_region_info, offset);
 708
 709                if (copy_from_user(&info, (void __user *)arg, minsz))
 710                        return -EFAULT;
 711
 712                if (info.argsz < minsz)
 713                        return -EINVAL;
 714
 715                switch (info.index) {
 716                case VFIO_PCI_CONFIG_REGION_INDEX:
 717                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
 718                        info.size = pdev->cfg_size;
 719                        info.flags = VFIO_REGION_INFO_FLAG_READ |
 720                                     VFIO_REGION_INFO_FLAG_WRITE;
 721                        break;
 722                case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
 723                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
 724                        info.size = pci_resource_len(pdev, info.index);
 725                        if (!info.size) {
 726                                info.flags = 0;
 727                                break;
 728                        }
 729
 730                        info.flags = VFIO_REGION_INFO_FLAG_READ |
 731                                     VFIO_REGION_INFO_FLAG_WRITE;
 732                        if (vdev->bar_mmap_supported[info.index]) {
 733                                info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
 734                                if (info.index == vdev->msix_bar) {
 735                                        ret = msix_mmappable_cap(vdev, &caps);
 736                                        if (ret)
 737                                                return ret;
 738                                }
 739                        }
 740
 741                        break;
 742                case VFIO_PCI_ROM_REGION_INDEX:
 743                {
 744                        void __iomem *io;
 745                        size_t size;
 746                        u16 cmd;
 747
 748                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
 749                        info.flags = 0;
 750
 751                        /* Report the BAR size, not the ROM size */
 752                        info.size = pci_resource_len(pdev, info.index);
 753                        if (!info.size) {
 754                                /* Shadow ROMs appear as PCI option ROMs */
 755                                if (pdev->resource[PCI_ROM_RESOURCE].flags &
 756                                                        IORESOURCE_ROM_SHADOW)
 757                                        info.size = 0x20000;
 758                                else
 759                                        break;
 760                        }
 761
 762                        /*
 763                         * Is it really there?  Enable memory decode for
 764                         * implicit access in pci_map_rom().
 765                         */
 766                        cmd = vfio_pci_memory_lock_and_enable(vdev);
 767                        io = pci_map_rom(pdev, &size);
 768                        if (io) {
 769                                info.flags = VFIO_REGION_INFO_FLAG_READ;
 770                                pci_unmap_rom(pdev, io);
 771                        } else {
 772                                info.size = 0;
 773                        }
 774                        vfio_pci_memory_unlock_and_restore(vdev, cmd);
 775
 776                        break;
 777                }
 778                case VFIO_PCI_VGA_REGION_INDEX:
 779                        if (!vdev->has_vga)
 780                                return -EINVAL;
 781
 782                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
 783                        info.size = 0xc0000;
 784                        info.flags = VFIO_REGION_INFO_FLAG_READ |
 785                                     VFIO_REGION_INFO_FLAG_WRITE;
 786
 787                        break;
 788                default:
 789                {
 790                        struct vfio_region_info_cap_type cap_type = {
 791                                        .header.id = VFIO_REGION_INFO_CAP_TYPE,
 792                                        .header.version = 1 };
 793
 794                        if (info.index >=
 795                            VFIO_PCI_NUM_REGIONS + vdev->num_regions)
 796                                return -EINVAL;
 797                        info.index = array_index_nospec(info.index,
 798                                                        VFIO_PCI_NUM_REGIONS +
 799                                                        vdev->num_regions);
 800
 801                        i = info.index - VFIO_PCI_NUM_REGIONS;
 802
 803                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
 804                        info.size = vdev->region[i].size;
 805                        info.flags = vdev->region[i].flags;
 806
 807                        cap_type.type = vdev->region[i].type;
 808                        cap_type.subtype = vdev->region[i].subtype;
 809
 810                        ret = vfio_info_add_capability(&caps, &cap_type.header,
 811                                                       sizeof(cap_type));
 812                        if (ret)
 813                                return ret;
 814
 815                        if (vdev->region[i].ops->add_capability) {
 816                                ret = vdev->region[i].ops->add_capability(vdev,
 817                                                &vdev->region[i], &caps);
 818                                if (ret)
 819                                        return ret;
 820                        }
 821                }
 822                }
 823
 824                if (caps.size) {
 825                        info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 826                        if (info.argsz < sizeof(info) + caps.size) {
 827                                info.argsz = sizeof(info) + caps.size;
 828                                info.cap_offset = 0;
 829                        } else {
 830                                vfio_info_cap_shift(&caps, sizeof(info));
 831                                if (copy_to_user((void __user *)arg +
 832                                                  sizeof(info), caps.buf,
 833                                                  caps.size)) {
 834                                        kfree(caps.buf);
 835                                        return -EFAULT;
 836                                }
 837                                info.cap_offset = sizeof(info);
 838                        }
 839
 840                        kfree(caps.buf);
 841                }
 842
 843                return copy_to_user((void __user *)arg, &info, minsz) ?
 844                        -EFAULT : 0;
 845
 846        } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
 847                struct vfio_irq_info info;
 848
 849                minsz = offsetofend(struct vfio_irq_info, count);
 850
 851                if (copy_from_user(&info, (void __user *)arg, minsz))
 852                        return -EFAULT;
 853
 854                if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
 855                        return -EINVAL;
 856
 857                switch (info.index) {
 858                case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
 859                case VFIO_PCI_REQ_IRQ_INDEX:
 860                        break;
 861                case VFIO_PCI_ERR_IRQ_INDEX:
 862                        if (pci_is_pcie(vdev->pdev))
 863                                break;
 864                        fallthrough;
 865                default:
 866                        return -EINVAL;
 867                }
 868
 869                info.flags = VFIO_IRQ_INFO_EVENTFD;
 870
 871                info.count = vfio_pci_get_irq_count(vdev, info.index);
 872
 873                if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
 874                        info.flags |= (VFIO_IRQ_INFO_MASKABLE |
 875                                       VFIO_IRQ_INFO_AUTOMASKED);
 876                else
 877                        info.flags |= VFIO_IRQ_INFO_NORESIZE;
 878
 879                return copy_to_user((void __user *)arg, &info, minsz) ?
 880                        -EFAULT : 0;
 881
 882        } else if (cmd == VFIO_DEVICE_SET_IRQS) {
 883                struct vfio_irq_set hdr;
 884                u8 *data = NULL;
 885                int max, ret = 0;
 886                size_t data_size = 0;
 887
 888                minsz = offsetofend(struct vfio_irq_set, count);
 889
 890                if (copy_from_user(&hdr, (void __user *)arg, minsz))
 891                        return -EFAULT;
 892
 893                max = vfio_pci_get_irq_count(vdev, hdr.index);
 894
 895                ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
 896                                                 VFIO_PCI_NUM_IRQS, &data_size);
 897                if (ret)
 898                        return ret;
 899
 900                if (data_size) {
 901                        data = memdup_user((void __user *)(arg + minsz),
 902                                            data_size);
 903                        if (IS_ERR(data))
 904                                return PTR_ERR(data);
 905                }
 906
 907                mutex_lock(&vdev->igate);
 908
 909                ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
 910                                              hdr.start, hdr.count, data);
 911
 912                mutex_unlock(&vdev->igate);
 913                kfree(data);
 914
 915                return ret;
 916
 917        } else if (cmd == VFIO_DEVICE_RESET) {
 918                int ret;
 919
 920                if (!vdev->reset_works)
 921                        return -EINVAL;
 922
 923                vfio_pci_zap_and_down_write_memory_lock(vdev);
 924                ret = pci_try_reset_function(vdev->pdev);
 925                up_write(&vdev->memory_lock);
 926
 927                return ret;
 928
 929        } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
 930                struct vfio_pci_hot_reset_info hdr;
 931                struct vfio_pci_fill_info fill = { 0 };
 932                struct vfio_pci_dependent_device *devices = NULL;
 933                bool slot = false;
 934                int ret = 0;
 935
 936                minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
 937
 938                if (copy_from_user(&hdr, (void __user *)arg, minsz))
 939                        return -EFAULT;
 940
 941                if (hdr.argsz < minsz)
 942                        return -EINVAL;
 943
 944                hdr.flags = 0;
 945
 946                /* Can we do a slot or bus reset or neither? */
 947                if (!pci_probe_reset_slot(vdev->pdev->slot))
 948                        slot = true;
 949                else if (pci_probe_reset_bus(vdev->pdev->bus))
 950                        return -ENODEV;
 951
 952                /* How many devices are affected? */
 953                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
 954                                                    vfio_pci_count_devs,
 955                                                    &fill.max, slot);
 956                if (ret)
 957                        return ret;
 958
 959                WARN_ON(!fill.max); /* Should always be at least one */
 960
 961                /*
 962                 * If there's enough space, fill it now, otherwise return
 963                 * -ENOSPC and the number of devices affected.
 964                 */
 965                if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
 966                        ret = -ENOSPC;
 967                        hdr.count = fill.max;
 968                        goto reset_info_exit;
 969                }
 970
 971                devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
 972                if (!devices)
 973                        return -ENOMEM;
 974
 975                fill.devices = devices;
 976
 977                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
 978                                                    vfio_pci_fill_devs,
 979                                                    &fill, slot);
 980
 981                /*
 982                 * If a device was removed between counting and filling,
 983                 * we may come up short of fill.max.  If a device was
 984                 * added, we'll have a return of -EAGAIN above.
 985                 */
 986                if (!ret)
 987                        hdr.count = fill.cur;
 988
 989reset_info_exit:
 990                if (copy_to_user((void __user *)arg, &hdr, minsz))
 991                        ret = -EFAULT;
 992
 993                if (!ret) {
 994                        if (copy_to_user((void __user *)(arg + minsz), devices,
 995                                         hdr.count * sizeof(*devices)))
 996                                ret = -EFAULT;
 997                }
 998
 999                kfree(devices);
1000                return ret;
1001
1002        } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
1003                struct vfio_pci_hot_reset hdr;
1004                int32_t *group_fds;
1005                struct vfio_group **groups;
1006                struct vfio_pci_group_info info;
1007                bool slot = false;
1008                int group_idx, count = 0, ret = 0;
1009
1010                minsz = offsetofend(struct vfio_pci_hot_reset, count);
1011
1012                if (copy_from_user(&hdr, (void __user *)arg, minsz))
1013                        return -EFAULT;
1014
1015                if (hdr.argsz < minsz || hdr.flags)
1016                        return -EINVAL;
1017
1018                /* Can we do a slot or bus reset or neither? */
1019                if (!pci_probe_reset_slot(vdev->pdev->slot))
1020                        slot = true;
1021                else if (pci_probe_reset_bus(vdev->pdev->bus))
1022                        return -ENODEV;
1023
1024                /*
1025                 * We can't let userspace give us an arbitrarily large
1026                 * buffer to copy, so verify how many we think there
1027                 * could be.  Note groups can have multiple devices so
1028                 * one group per device is the max.
1029                 */
1030                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1031                                                    vfio_pci_count_devs,
1032                                                    &count, slot);
1033                if (ret)
1034                        return ret;
1035
1036                /* Somewhere between 1 and count is OK */
1037                if (!hdr.count || hdr.count > count)
1038                        return -EINVAL;
1039
1040                group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
1041                groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
1042                if (!group_fds || !groups) {
1043                        kfree(group_fds);
1044                        kfree(groups);
1045                        return -ENOMEM;
1046                }
1047
1048                if (copy_from_user(group_fds, (void __user *)(arg + minsz),
1049                                   hdr.count * sizeof(*group_fds))) {
1050                        kfree(group_fds);
1051                        kfree(groups);
1052                        return -EFAULT;
1053                }
1054
1055                /*
1056                 * For each group_fd, get the group through the vfio external
 1057                 * user interface and store the group.  This
1058                 * ensures the group is held across the reset.
1059                 */
1060                for (group_idx = 0; group_idx < hdr.count; group_idx++) {
1061                        struct vfio_group *group;
1062                        struct fd f = fdget(group_fds[group_idx]);
1063                        if (!f.file) {
1064                                ret = -EBADF;
1065                                break;
1066                        }
1067
1068                        group = vfio_group_get_external_user(f.file);
1069                        fdput(f);
1070                        if (IS_ERR(group)) {
1071                                ret = PTR_ERR(group);
1072                                break;
1073                        }
1074
1075                        groups[group_idx] = group;
1076                }
1077
1078                kfree(group_fds);
1079
1080                /* release reference to groups on error */
1081                if (ret)
1082                        goto hot_reset_release;
1083
1084                info.count = hdr.count;
1085                info.groups = groups;
1086
1087                ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
1088
1089hot_reset_release:
1090                for (group_idx--; group_idx >= 0; group_idx--)
1091                        vfio_group_put_external_user(groups[group_idx]);
1092
1093                kfree(groups);
1094                return ret;
1095        } else if (cmd == VFIO_DEVICE_IOEVENTFD) {
1096                struct vfio_device_ioeventfd ioeventfd;
1097                int count;
1098
1099                minsz = offsetofend(struct vfio_device_ioeventfd, fd);
1100
1101                if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
1102                        return -EFAULT;
1103
1104                if (ioeventfd.argsz < minsz)
1105                        return -EINVAL;
1106
1107                if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
1108                        return -EINVAL;
1109
1110                count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
1111
1112                if (hweight8(count) != 1 || ioeventfd.fd < -1)
1113                        return -EINVAL;
1114
1115                return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
1116                                          ioeventfd.data, count, ioeventfd.fd);
1117        } else if (cmd == VFIO_DEVICE_FEATURE) {
1118                struct vfio_device_feature feature;
1119                uuid_t uuid;
1120
1121                minsz = offsetofend(struct vfio_device_feature, flags);
1122
1123                if (copy_from_user(&feature, (void __user *)arg, minsz))
1124                        return -EFAULT;
1125
1126                if (feature.argsz < minsz)
1127                        return -EINVAL;
1128
1129                /* Check unknown flags */
1130                if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
1131                                      VFIO_DEVICE_FEATURE_SET |
1132                                      VFIO_DEVICE_FEATURE_GET |
1133                                      VFIO_DEVICE_FEATURE_PROBE))
1134                        return -EINVAL;
1135
1136                /* GET & SET are mutually exclusive except with PROBE */
1137                if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1138                    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1139                    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1140                        return -EINVAL;
1141
1142                switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1143                case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
1144                        if (!vdev->vf_token)
1145                                return -ENOTTY;
1146
1147                        /*
1148                         * We do not support GET of the VF Token UUID as this
1149                         * could expose the token of the previous device user.
1150                         */
1151                        if (feature.flags & VFIO_DEVICE_FEATURE_GET)
1152                                return -EINVAL;
1153
1154                        if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
1155                                return 0;
1156
1157                        /* Don't SET unless told to do so */
1158                        if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
1159                                return -EINVAL;
1160
1161                        if (feature.argsz < minsz + sizeof(uuid))
1162                                return -EINVAL;
1163
1164                        if (copy_from_user(&uuid, (void __user *)(arg + minsz),
1165                                           sizeof(uuid)))
1166                                return -EFAULT;
1167
1168                        mutex_lock(&vdev->vf_token->lock);
1169                        uuid_copy(&vdev->vf_token->uuid, &uuid);
1170                        mutex_unlock(&vdev->vf_token->lock);
1171
1172                        return 0;
1173                default:
1174                        return -ENOTTY;
1175                }
1176        }
1177
1178        return -ENOTTY;
1179}
1180EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
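
     /*
      * Illustrative userspace sketch of the argsz/capability handshake the
      * ioctl above implements (device_fd is assumed to be an open VFIO
      * device file descriptor):
      *
      *	struct vfio_region_info hdr = {
      *		.argsz = sizeof(hdr),
      *		.index = VFIO_PCI_BAR0_REGION_INDEX,
      *	};
      *	struct vfio_region_info *info = &hdr;
      *
      *	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info))
      *		return -errno;
      *
      *	if (hdr.argsz > sizeof(hdr)) {
      *		info = calloc(1, hdr.argsz);
      *		info->argsz = hdr.argsz;
      *		info->index = hdr.index;
      *		ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info);
      *	}
      *
      * Any capability chain then starts at (char *)info + info->cap_offset
      * when VFIO_REGION_INFO_FLAG_CAPS is set in info->flags.
      */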
1181
1182static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
1183                           size_t count, loff_t *ppos, bool iswrite)
1184{
1185        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1186
1187        if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1188                return -EINVAL;
1189
1190        switch (index) {
1191        case VFIO_PCI_CONFIG_REGION_INDEX:
1192                return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
1193
1194        case VFIO_PCI_ROM_REGION_INDEX:
1195                if (iswrite)
1196                        return -EINVAL;
1197                return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
1198
1199        case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1200                return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
1201
1202        case VFIO_PCI_VGA_REGION_INDEX:
1203                return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
1204        default:
1205                index -= VFIO_PCI_NUM_REGIONS;
1206                return vdev->region[index].ops->rw(vdev, buf,
1207                                                   count, ppos, iswrite);
1208        }
1209
1210        return -EINVAL;
1211}
1212
1213ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
1214                size_t count, loff_t *ppos)
1215{
1216        struct vfio_pci_core_device *vdev =
1217                container_of(core_vdev, struct vfio_pci_core_device, vdev);
1218
1219        if (!count)
1220                return 0;
1221
1222        return vfio_pci_rw(vdev, buf, count, ppos, false);
1223}
1224EXPORT_SYMBOL_GPL(vfio_pci_core_read);
1225
1226ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
1227                size_t count, loff_t *ppos)
1228{
1229        struct vfio_pci_core_device *vdev =
1230                container_of(core_vdev, struct vfio_pci_core_device, vdev);
1231
1232        if (!count)
1233                return 0;
1234
1235        return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
1236}
1237EXPORT_SYMBOL_GPL(vfio_pci_core_write);
1238
1239/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
1240static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
1241{
1242        struct vfio_pci_mmap_vma *mmap_vma, *tmp;
1243
1244        /*
1245         * Lock ordering:
1246         * vma_lock is nested under mmap_lock for vm_ops callback paths.
1247         * The memory_lock semaphore is used by both code paths calling
1248         * into this function to zap vmas and the vm_ops.fault callback
1249         * to protect the memory enable state of the device.
1250         *
1251         * When zapping vmas we need to maintain the mmap_lock => vma_lock
1252         * ordering, which requires using vma_lock to walk vma_list to
1253         * acquire an mm, then dropping vma_lock to get the mmap_lock and
1254         * reacquiring vma_lock.  This logic is derived from similar
1255         * requirements in uverbs_user_mmap_disassociate().
1256         *
1257         * mmap_lock must always be the top-level lock when it is taken.
1258         * Therefore we can only hold the memory_lock write lock when
1259         * vma_list is empty, as we'd need to take mmap_lock to clear
1260         * entries.  vma_list can only be guaranteed empty when holding
1261         * vma_lock, thus memory_lock is nested under vma_lock.
1262         *
1263         * This enables the vm_ops.fault callback to acquire vma_lock,
1264         * followed by memory_lock read lock, while already holding
1265         * mmap_lock without risk of deadlock.
1266         */
1267        while (1) {
1268                struct mm_struct *mm = NULL;
1269
1270                if (try) {
1271                        if (!mutex_trylock(&vdev->vma_lock))
1272                                return 0;
1273                } else {
1274                        mutex_lock(&vdev->vma_lock);
1275                }
1276                while (!list_empty(&vdev->vma_list)) {
1277                        mmap_vma = list_first_entry(&vdev->vma_list,
1278                                                    struct vfio_pci_mmap_vma,
1279                                                    vma_next);
1280                        mm = mmap_vma->vma->vm_mm;
1281                        if (mmget_not_zero(mm))
1282                                break;
1283
1284                        list_del(&mmap_vma->vma_next);
1285                        kfree(mmap_vma);
1286                        mm = NULL;
1287                }
1288                if (!mm)
1289                        return 1;
1290                mutex_unlock(&vdev->vma_lock);
1291
1292                if (try) {
1293                        if (!mmap_read_trylock(mm)) {
1294                                mmput(mm);
1295                                return 0;
1296                        }
1297                } else {
1298                        mmap_read_lock(mm);
1299                }
1300                if (try) {
1301                        if (!mutex_trylock(&vdev->vma_lock)) {
1302                                mmap_read_unlock(mm);
1303                                mmput(mm);
1304                                return 0;
1305                        }
1306                } else {
1307                        mutex_lock(&vdev->vma_lock);
1308                }
1309                list_for_each_entry_safe(mmap_vma, tmp,
1310                                         &vdev->vma_list, vma_next) {
1311                        struct vm_area_struct *vma = mmap_vma->vma;
1312
1313                        if (vma->vm_mm != mm)
1314                                continue;
1315
1316                        list_del(&mmap_vma->vma_next);
1317                        kfree(mmap_vma);
1318
1319                        zap_vma_ptes(vma, vma->vm_start,
1320                                     vma->vm_end - vma->vm_start);
1321                }
1322                mutex_unlock(&vdev->vma_lock);
1323                mmap_read_unlock(mm);
1324                mmput(mm);
1325        }
1326}
1327
1328void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
1329{
1330        vfio_pci_zap_and_vma_lock(vdev, false);
1331        down_write(&vdev->memory_lock);
1332        mutex_unlock(&vdev->vma_lock);
1333}
1334
1335u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
1336{
1337        u16 cmd;
1338
1339        down_write(&vdev->memory_lock);
1340        pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
1341        if (!(cmd & PCI_COMMAND_MEMORY))
1342                pci_write_config_word(vdev->pdev, PCI_COMMAND,
1343                                      cmd | PCI_COMMAND_MEMORY);
1344
1345        return cmd;
1346}
1347
1348void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
1349{
1350        pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
1351        up_write(&vdev->memory_lock);
1352}
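
     /*
      * Sketch of the intended bracket for the two helpers above, as used by
      * the ROM probing in vfio_pci_core_ioctl():
      *
      *	u16 cmd = vfio_pci_memory_lock_and_enable(vdev);
      *	... access device memory while user faults are held off ...
      *	vfio_pci_memory_unlock_and_restore(vdev, cmd);
      */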
1353
1354/* Caller holds vma_lock */
1355static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
1356                              struct vm_area_struct *vma)
1357{
1358        struct vfio_pci_mmap_vma *mmap_vma;
1359
1360        mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
1361        if (!mmap_vma)
1362                return -ENOMEM;
1363
1364        mmap_vma->vma = vma;
1365        list_add(&mmap_vma->vma_next, &vdev->vma_list);
1366
1367        return 0;
1368}
1369
1370/*
1371 * Zap mmaps on open so that we can fault them in on access and therefore
1372 * our vma_list only tracks mappings accessed since last zap.
1373 */
1374static void vfio_pci_mmap_open(struct vm_area_struct *vma)
1375{
1376        zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1377}
1378
1379static void vfio_pci_mmap_close(struct vm_area_struct *vma)
1380{
1381        struct vfio_pci_core_device *vdev = vma->vm_private_data;
1382        struct vfio_pci_mmap_vma *mmap_vma;
1383
1384        mutex_lock(&vdev->vma_lock);
1385        list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1386                if (mmap_vma->vma == vma) {
1387                        list_del(&mmap_vma->vma_next);
1388                        kfree(mmap_vma);
1389                        break;
1390                }
1391        }
1392        mutex_unlock(&vdev->vma_lock);
1393}
1394
1395static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
1396{
1397        struct vm_area_struct *vma = vmf->vma;
1398        struct vfio_pci_core_device *vdev = vma->vm_private_data;
1399        struct vfio_pci_mmap_vma *mmap_vma;
1400        vm_fault_t ret = VM_FAULT_NOPAGE;
1401
1402        mutex_lock(&vdev->vma_lock);
1403        down_read(&vdev->memory_lock);
1404
1405        if (!__vfio_pci_memory_enabled(vdev)) {
1406                ret = VM_FAULT_SIGBUS;
1407                goto up_out;
1408        }
1409
1410        /*
1411         * We populate the whole vma on fault, so we need to test whether
1412         * the vma has already been mapped, such as for concurrent faults
1413         * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
1414         * we ask it to fill the same range again.
1415         */
1416        list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1417                if (mmap_vma->vma == vma)
1418                        goto up_out;
1419        }
1420
1421        if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
1422                               vma->vm_end - vma->vm_start,
1423                               vma->vm_page_prot)) {
1424                ret = VM_FAULT_SIGBUS;
1425                zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1426                goto up_out;
1427        }
1428
1429        if (__vfio_pci_add_vma(vdev, vma)) {
1430                ret = VM_FAULT_OOM;
1431                zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1432        }
1433
1434up_out:
1435        up_read(&vdev->memory_lock);
1436        mutex_unlock(&vdev->vma_lock);
1437        return ret;
1438}
1439
1440static const struct vm_operations_struct vfio_pci_mmap_ops = {
1441        .open = vfio_pci_mmap_open,
1442        .close = vfio_pci_mmap_close,
1443        .fault = vfio_pci_mmap_fault,
1444};
1445
1446int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
1447{
1448        struct vfio_pci_core_device *vdev =
1449                container_of(core_vdev, struct vfio_pci_core_device, vdev);
1450        struct pci_dev *pdev = vdev->pdev;
1451        unsigned int index;
1452        u64 phys_len, req_len, pgoff, req_start;
1453        int ret;
1454
1455        index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1456
1457        if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1458                return -EINVAL;
1459        if (vma->vm_end < vma->vm_start)
1460                return -EINVAL;
1461        if ((vma->vm_flags & VM_SHARED) == 0)
1462                return -EINVAL;
1463        if (index >= VFIO_PCI_NUM_REGIONS) {
1464                int regnum = index - VFIO_PCI_NUM_REGIONS;
1465                struct vfio_pci_region *region = vdev->region + regnum;
1466
1467                if (region->ops && region->ops->mmap &&
1468                    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
1469                        return region->ops->mmap(vdev, region, vma);
1470                return -EINVAL;
1471        }
1472        if (index >= VFIO_PCI_ROM_REGION_INDEX)
1473                return -EINVAL;
1474        if (!vdev->bar_mmap_supported[index])
1475                return -EINVAL;
1476
1477        phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
1478        req_len = vma->vm_end - vma->vm_start;
1479        pgoff = vma->vm_pgoff &
1480                ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1481        req_start = pgoff << PAGE_SHIFT;
1482
1483        if (req_start + req_len > phys_len)
1484                return -EINVAL;
1485
1486        /*
1487         * Even though we don't make use of the barmap for the mmap,
1488         * we still need to request the region, and the barmap tracks that.
1489         */
1490        if (!vdev->barmap[index]) {
1491                ret = pci_request_selected_regions(pdev,
1492                                                   1 << index, "vfio-pci");
1493                if (ret)
1494                        return ret;
1495
1496                vdev->barmap[index] = pci_iomap(pdev, index, 0);
1497                if (!vdev->barmap[index]) {
1498                        pci_release_selected_regions(pdev, 1 << index);
1499                        return -ENOMEM;
1500                }
1501        }
1502
1503        vma->vm_private_data = vdev;
1504        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1505        vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
1506
1507        /*
1508         * See remap_pfn_range(), called from vfio_pci_mmap_fault() above, but
1509         * we can't change vm_flags within the fault handler.  Set them now.
1510         */
1511        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1512        vma->vm_ops = &vfio_pci_mmap_ops;
1513
1514        return 0;
1515}
1516EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
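
/*
 * Illustrative userspace sketch (not built here): the vm_pgoff decoded above
 * comes from the offset reported by VFIO_DEVICE_GET_REGION_INFO, which
 * userspace passes straight to mmap().  "device_fd" and the function name are
 * hypothetical.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static void *example_map_bar0(int device_fd)
{
        struct vfio_region_info info = {
                .argsz = sizeof(info),
                .index = VFIO_PCI_BAR0_REGION_INDEX,
        };

        if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
                return MAP_FAILED;
        if (!(info.flags & VFIO_REGION_INFO_FLAG_MMAP))
                return MAP_FAILED;

        return mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    device_fd, info.offset);
}
#endif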
1517
1518void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
1519{
1520        struct vfio_pci_core_device *vdev =
1521                container_of(core_vdev, struct vfio_pci_core_device, vdev);
1522        struct pci_dev *pdev = vdev->pdev;
1523
1524        mutex_lock(&vdev->igate);
1525
1526        if (vdev->req_trigger) {
1527                if (!(count % 10))
1528                        pci_notice_ratelimited(pdev,
1529                                "Relaying device request to user (#%u)\n",
1530                                count);
1531                eventfd_signal(vdev->req_trigger, 1);
1532        } else if (count == 0) {
1533                pci_warn(pdev,
1534                        "No device request channel registered, blocked until released by user\n");
1535        }
1536
1537        mutex_unlock(&vdev->igate);
1538}
1539EXPORT_SYMBOL_GPL(vfio_pci_core_request);
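
/*
 * Illustrative userspace sketch (not built here): the request relayed above
 * reaches userspace through an eventfd registered on VFIO_PCI_REQ_IRQ_INDEX
 * via VFIO_DEVICE_SET_IRQS; the same pattern applies to VFIO_PCI_ERR_IRQ_INDEX
 * for the AER notification further below.  "device_fd" and "efd" are
 * hypothetical.
 */
#if 0
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int example_register_req_eventfd(int device_fd, int efd)
{
        char buf[sizeof(struct vfio_irq_set) + sizeof(int)];
        struct vfio_irq_set *set = (struct vfio_irq_set *)buf;

        set->argsz = sizeof(buf);
        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        set->index = VFIO_PCI_REQ_IRQ_INDEX;
        set->start = 0;
        set->count = 1;
        memcpy(set->data, &efd, sizeof(int));

        return ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
}
#endif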
1540
1541static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
1542                                      bool vf_token, uuid_t *uuid)
1543{
1544        /*
1545         * There's always some degree of trust or collaboration between SR-IOV
1546         * PF and VFs, even if just that the PF hosts the SR-IOV capability and
1547         * can disrupt VFs with a reset, but often the PF has more explicit
1548         * access to deny service to the VF or access data passed through the
1549         * VF.  We therefore require an opt-in via a shared VF token (UUID) to
1550         * represent this trust.  This both prevents a VF driver from assuming
1551         * the PF driver is a trusted, in-kernel driver, and prevents a PF
1552         * driver from being replaced by a rogue driver unknown to in-use
1553         * VF drivers.
1554         *
1555         * Therefore when presented with a VF, if the PF is a vfio device and
1556         * it is bound to the vfio-pci driver, the user needs to provide a VF
1557         * token to access the device, in the form of appending a vf_token to
1558         * the device name, for example:
1559         *
1560         * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
1561         *
1562         * When presented with a PF which has VFs in use, the user must also
1563         * provide the current VF token to prove collaboration with existing
1564         * VF users.  If VFs are not in use, the VF token provided for the PF
1565         * device will act to set the VF token.
1566         *
1567         * If the VF token is provided but unused, an error is generated.
1568         */
1569        if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
1570                return 0; /* No VF token provided or required */
1571
1572        if (vdev->pdev->is_virtfn) {
1573                struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev);
1574                bool match;
1575
1576                if (!pf_vdev) {
1577                        if (!vf_token)
1578                                return 0; /* PF is not vfio-pci, no VF token */
1579
1580                        pci_info_ratelimited(vdev->pdev,
1581                                "VF token incorrectly provided, PF not bound to vfio-pci\n");
1582                        return -EINVAL;
1583                }
1584
1585                if (!vf_token) {
1586                        vfio_device_put(&pf_vdev->vdev);
1587                        pci_info_ratelimited(vdev->pdev,
1588                                "VF token required to access device\n");
1589                        return -EACCES;
1590                }
1591
1592                mutex_lock(&pf_vdev->vf_token->lock);
1593                match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
1594                mutex_unlock(&pf_vdev->vf_token->lock);
1595
1596                vfio_device_put(&pf_vdev->vdev);
1597
1598                if (!match) {
1599                        pci_info_ratelimited(vdev->pdev,
1600                                "Incorrect VF token provided for device\n");
1601                        return -EACCES;
1602                }
1603        } else if (vdev->vf_token) {
1604                mutex_lock(&vdev->vf_token->lock);
1605                if (vdev->vf_token->users) {
1606                        if (!vf_token) {
1607                                mutex_unlock(&vdev->vf_token->lock);
1608                                pci_info_ratelimited(vdev->pdev,
1609                                        "VF token required to access device\n");
1610                                return -EACCES;
1611                        }
1612
1613                        if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
1614                                mutex_unlock(&vdev->vf_token->lock);
1615                                pci_info_ratelimited(vdev->pdev,
1616                                        "Incorrect VF token provided for device\n");
1617                                return -EACCES;
1618                        }
1619                } else if (vf_token) {
1620                        uuid_copy(&vdev->vf_token->uuid, uuid);
1621                }
1622
1623                mutex_unlock(&vdev->vf_token->lock);
1624        } else if (vf_token) {
1625                pci_info_ratelimited(vdev->pdev,
1626                        "VF token incorrectly provided, not a PF or VF\n");
1627                return -EINVAL;
1628        }
1629
1630        return 0;
1631}
1632
1633#define VF_TOKEN_ARG "vf_token="
1634
1635int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
1636{
1637        struct vfio_pci_core_device *vdev =
1638                container_of(core_vdev, struct vfio_pci_core_device, vdev);
1639        bool vf_token = false;
1640        uuid_t uuid;
1641        int ret;
1642
1643        if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
1644                return 0; /* No match */
1645
1646        if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
1647                buf += strlen(pci_name(vdev->pdev));
1648
1649                if (*buf != ' ')
1650                        return 0; /* No match: non-whitespace after name */
1651
1652                while (*buf) {
1653                        if (*buf == ' ') {
1654                                buf++;
1655                                continue;
1656                        }
1657
1658                        if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
1659                                                  strlen(VF_TOKEN_ARG))) {
1660                                buf += strlen(VF_TOKEN_ARG);
1661
1662                                if (strlen(buf) < UUID_STRING_LEN)
1663                                        return -EINVAL;
1664
1665                                ret = uuid_parse(buf, &uuid);
1666                                if (ret)
1667                                        return ret;
1668
1669                                vf_token = true;
1670                                buf += UUID_STRING_LEN;
1671                        } else {
1672                                /* Unknown/duplicate option */
1673                                return -EINVAL;
1674                        }
1675                }
1676        }
1677
1678        ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
1679        if (ret)
1680                return ret;
1681
1682        return 1; /* Match */
1683}
1684EXPORT_SYMBOL_GPL(vfio_pci_core_match);
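
/*
 * Illustrative userspace sketch (not built here): the "buf" parsed above is
 * the string userspace passes to VFIO_GROUP_GET_DEVICE_FD, so requesting a
 * token-protected device looks roughly like this.  "group_fd" is hypothetical
 * and the UUID is the example from the comment above.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int example_get_device_fd_with_token(int group_fd)
{
        return ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD,
                     "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3");
}
#endif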
1685
1686static int vfio_pci_bus_notifier(struct notifier_block *nb,
1687                                 unsigned long action, void *data)
1688{
1689        struct vfio_pci_core_device *vdev = container_of(nb,
1690                                                    struct vfio_pci_core_device, nb);
1691        struct device *dev = data;
1692        struct pci_dev *pdev = to_pci_dev(dev);
1693        struct pci_dev *physfn = pci_physfn(pdev);
1694
1695        if (action == BUS_NOTIFY_ADD_DEVICE &&
1696            pdev->is_virtfn && physfn == vdev->pdev) {
1697                pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
1698                         pci_name(pdev));
1699                pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
1700                                                  vdev->vdev.ops->name);
1701        } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
1702                   pdev->is_virtfn && physfn == vdev->pdev) {
1703                struct pci_driver *drv = pci_dev_driver(pdev);
1704
1705                if (drv && drv != pci_dev_driver(vdev->pdev))
1706                        pci_warn(vdev->pdev,
1707                                 "VF %s bound to driver %s while PF bound to driver %s\n",
1708                                 pci_name(pdev), drv->name,
1709                                 pci_dev_driver(vdev->pdev)->name);
1710        }
1711
1712        return 0;
1713}
1714
1715static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
1716{
1717        struct pci_dev *pdev = vdev->pdev;
1718        int ret;
1719
1720        if (!pdev->is_physfn)
1721                return 0;
1722
1723        vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
1724        if (!vdev->vf_token)
1725                return -ENOMEM;
1726
1727        mutex_init(&vdev->vf_token->lock);
1728        uuid_gen(&vdev->vf_token->uuid);
1729
1730        vdev->nb.notifier_call = vfio_pci_bus_notifier;
1731        ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
1732        if (ret) {
1733                kfree(vdev->vf_token);
1734                return ret;
1735        }
1736        return 0;
1737}
1738
1739static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
1740{
1741        if (!vdev->vf_token)
1742                return;
1743
1744        bus_unregister_notifier(&pci_bus_type, &vdev->nb);
1745        WARN_ON(vdev->vf_token->users);
1746        mutex_destroy(&vdev->vf_token->lock);
1747        kfree(vdev->vf_token);
1748}
1749
1750static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
1751{
1752        struct pci_dev *pdev = vdev->pdev;
1753        int ret;
1754
1755        if (!vfio_pci_is_vga(pdev))
1756                return 0;
1757
1758        ret = vga_client_register(pdev, vfio_pci_set_decode);
1759        if (ret)
1760                return ret;
1761        vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
1762        return 0;
1763}
1764
1765static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
1766{
1767        struct pci_dev *pdev = vdev->pdev;
1768
1769        if (!vfio_pci_is_vga(pdev))
1770                return;
1771        vga_client_unregister(pdev);
1772        vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
1773                                              VGA_RSRC_LEGACY_IO |
1774                                              VGA_RSRC_LEGACY_MEM);
1775}
1776
1777void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev,
1778                               struct pci_dev *pdev,
1779                               const struct vfio_device_ops *vfio_pci_ops)
1780{
1781        vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops);
1782        vdev->pdev = pdev;
1783        vdev->irq_type = VFIO_PCI_NUM_IRQS;
1784        mutex_init(&vdev->igate);
1785        spin_lock_init(&vdev->irqlock);
1786        mutex_init(&vdev->ioeventfds_lock);
1787        INIT_LIST_HEAD(&vdev->dummy_resources_list);
1788        INIT_LIST_HEAD(&vdev->ioeventfds_list);
1789        mutex_init(&vdev->vma_lock);
1790        INIT_LIST_HEAD(&vdev->vma_list);
1791        init_rwsem(&vdev->memory_lock);
1792}
1793EXPORT_SYMBOL_GPL(vfio_pci_core_init_device);
1794
1795void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev)
1796{
1797        mutex_destroy(&vdev->igate);
1798        mutex_destroy(&vdev->ioeventfds_lock);
1799        mutex_destroy(&vdev->vma_lock);
1800        vfio_uninit_group_dev(&vdev->vdev);
1801        kfree(vdev->region);
1802        kfree(vdev->pm_save);
1803}
1804EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device);
1805
1806int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
1807{
1808        struct pci_dev *pdev = vdev->pdev;
1809        int ret;
1810
1811        if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
1812                return -EINVAL;
1813
1814        /*
1815         * Prevent binding to PFs with VFs enabled, the VFs might be in use
1816         * by the host or other users.  We cannot capture the VFs if they
1817         * already exist, nor can we track VF users.  Disabling SR-IOV here
1818         * would initiate removing the VFs, which would unbind the driver,
1819         * which is prone to blocking if that VF is also in use by vfio-pci.
1820         * Just reject these PFs and let the user sort it out.
1821         */
1822        if (pci_num_vf(pdev)) {
1823                pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
1824                return -EBUSY;
1825        }
1826
1827        if (pci_is_root_bus(pdev->bus)) {
1828                ret = vfio_assign_device_set(&vdev->vdev, vdev);
1829        } else if (!pci_probe_reset_slot(pdev->slot)) {
1830                ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
1831        } else {
1832                /*
1833                 * If there is no slot reset support for this device, the whole
1834                 * bus needs to be grouped together to support bus-wide resets.
1835                 */
1836                ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
1837        }
1838
1839        if (ret)
1840                return ret;
1841        ret = vfio_pci_vf_init(vdev);
1842        if (ret)
1843                return ret;
1844        ret = vfio_pci_vga_init(vdev);
1845        if (ret)
1846                goto out_vf;
1847
1848        vfio_pci_probe_power_state(vdev);
1849
1850        if (!disable_idle_d3) {
1851                /*
1852                 * pci-core sets the device power state to an unknown value at
1853                 * bootup and after being removed from a driver.  The only
1854                 * transition it allows from this unknown state is to D0, which
1855                 * typically happens when a driver calls pci_enable_device().
1856                 * We're not ready to enable the device yet, but we do want to
1857                 * be able to get to D3.  Therefore first do a D0 transition
1858                 * before going to D3.
1859                 */
1860                vfio_pci_set_power_state(vdev, PCI_D0);
1861                vfio_pci_set_power_state(vdev, PCI_D3hot);
1862        }
1863
1864        ret = vfio_register_group_dev(&vdev->vdev);
1865        if (ret)
1866                goto out_power;
1867        return 0;
1868
1869out_power:
1870        if (!disable_idle_d3)
1871                vfio_pci_set_power_state(vdev, PCI_D0);
1872out_vf:
1873        vfio_pci_vf_uninit(vdev);
1874        return ret;
1875}
1876EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
1877
1878void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
1879{
1880        struct pci_dev *pdev = vdev->pdev;
1881
1882        pci_disable_sriov(pdev);
1883
1884        vfio_unregister_group_dev(&vdev->vdev);
1885
1886        vfio_pci_vf_uninit(vdev);
1887        vfio_pci_vga_uninit(vdev);
1888
1889        if (!disable_idle_d3)
1890                vfio_pci_set_power_state(vdev, PCI_D0);
1891}
1892EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
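
/*
 * Illustrative sketch (not built): the expected shape of a vendor driver's
 * probe()/remove() built on the init/register helpers above.
 * "my_vfio_pci_ops" and the function names are hypothetical.
 */
#if 0
static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        struct vfio_pci_core_device *vdev;
        int ret;

        vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
        if (!vdev)
                return -ENOMEM;

        vfio_pci_core_init_device(vdev, pdev, &my_vfio_pci_ops);

        ret = vfio_pci_core_register_device(vdev);
        if (ret)
                goto out_uninit;

        dev_set_drvdata(&pdev->dev, vdev);
        return 0;

out_uninit:
        vfio_pci_core_uninit_device(vdev);
        kfree(vdev);
        return ret;
}

static void example_remove(struct pci_dev *pdev)
{
        struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);

        vfio_pci_core_unregister_device(vdev);
        vfio_pci_core_uninit_device(vdev);
        kfree(vdev);
}
#endif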
1893
1894static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
1895                                                  pci_channel_state_t state)
1896{
1897        struct vfio_pci_core_device *vdev;
1898        struct vfio_device *device;
1899
1900        device = vfio_device_get_from_dev(&pdev->dev);
1901        if (device == NULL)
1902                return PCI_ERS_RESULT_DISCONNECT;
1903
1904        vdev = container_of(device, struct vfio_pci_core_device, vdev);
1905
1906        mutex_lock(&vdev->igate);
1907
1908        if (vdev->err_trigger)
1909                eventfd_signal(vdev->err_trigger, 1);
1910
1911        mutex_unlock(&vdev->igate);
1912
1913        vfio_device_put(device);
1914
1915        return PCI_ERS_RESULT_CAN_RECOVER;
1916}
1917
1918int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
1919{
1920        struct vfio_device *device;
1921        int ret = 0;
1922
1923        device = vfio_device_get_from_dev(&pdev->dev);
1924        if (!device)
1925                return -ENODEV;
1926
1927        if (nr_virtfn == 0)
1928                pci_disable_sriov(pdev);
1929        else
1930                ret = pci_enable_sriov(pdev, nr_virtfn);
1931
1932        vfio_device_put(device);
1933
1934        return ret < 0 ? ret : nr_virtfn;
1935}
1936EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
1937
1938const struct pci_error_handlers vfio_pci_core_err_handlers = {
1939        .error_detected = vfio_pci_aer_err_detected,
1940};
1941EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
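
/*
 * Illustrative sketch (not built): drivers built on this core typically hook
 * the exported SR-IOV and error handlers into their pci_driver directly.
 * "example_pci_driver", the probe/remove callbacks and the id table are
 * hypothetical (see the probe/remove sketch above).
 */
#if 0
static struct pci_driver example_pci_driver = {
        .name                   = "example-vfio-pci",
        .id_table               = example_pci_table,
        .probe                  = example_probe,
        .remove                 = example_remove,
        .sriov_configure        = vfio_pci_core_sriov_configure,
        .err_handler            = &vfio_pci_core_err_handlers,
};
#endif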
1942
1943static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
1944                               struct vfio_pci_group_info *groups)
1945{
1946        unsigned int i;
1947
1948        for (i = 0; i < groups->count; i++)
1949                if (groups->groups[i] == vdev->vdev.group)
1950                        return true;
1951        return false;
1952}
1953
1954static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
1955{
1956        struct vfio_device_set *dev_set = data;
1957        struct vfio_device *cur;
1958
1959        list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
1960                if (cur->dev == &pdev->dev)
1961                        return 0;
1962        return -EBUSY;
1963}
1964
1965/*
1966 * vfio-core considers a group to be viable and will create a vfio_device even
1967 * if some devices are bound to drivers like pci-stub or pcieport. Here we
1968 * require all PCI devices to be inside our dev_set since that ensures they stay
1969 * put and that every driver controlling the device can co-ordinate with the
1970 * device reset.
1971 *
1972 * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
1973 * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
1974 */
1975static struct pci_dev *
1976vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
1977{
1978        struct pci_dev *pdev;
1979
1980        lockdep_assert_held(&dev_set->lock);
1981
1982        /*
1983         * By definition all PCI devices in the dev_set share the same PCI
1984         * reset, so any pci_dev will have the same outcomes for
1985         * pci_probe_reset_*() and pci_reset_bus().
1986         */
1987        pdev = list_first_entry(&dev_set->device_list,
1988                                struct vfio_pci_core_device,
1989                                vdev.dev_set_list)->pdev;
1990
1991        /* pci_reset_bus() requires a slot or bus reset to be available */
1992        if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
1993                return NULL;
1994
1995        if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
1996                                          dev_set,
1997                                          !pci_probe_reset_slot(pdev->slot)))
1998                return NULL;
1999        return pdev;
2000}
2001
2002/*
2003 * We need to take memory_lock for each device, but devices can share mmap_lock;
2004 * therefore we first zap and hold the vma_lock for each device, and only then
2005 * take each memory_lock.
2006 */
2007static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
2008                                      struct vfio_pci_group_info *groups)
2009{
2010        struct vfio_pci_core_device *cur_mem;
2011        struct vfio_pci_core_device *cur_vma;
2012        struct vfio_pci_core_device *cur;
2013        struct pci_dev *pdev;
2014        bool is_mem = true;
2015        int ret;
2016
2017        mutex_lock(&dev_set->lock);
2018        cur_mem = list_first_entry(&dev_set->device_list,
2019                                   struct vfio_pci_core_device,
2020                                   vdev.dev_set_list);
2021
2022        pdev = vfio_pci_dev_set_resettable(dev_set);
2023        if (!pdev) {
2024                ret = -EINVAL;
2025                goto err_unlock;
2026        }
2027
2028        list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
2029                /*
2030                 * Test whether all the affected devices are contained by the
2031                 * set of groups provided by the user.
2032                 */
2033                if (!vfio_dev_in_groups(cur_vma, groups)) {
2034                        ret = -EINVAL;
2035                        goto err_undo;
2036                }
2037
2038                /*
2039                 * Locking multiple devices is prone to deadlock, so back off
2040                 * and unwind if we hit contention.
2041                 */
2042                if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
2043                        ret = -EBUSY;
2044                        goto err_undo;
2045                }
2046        }
2047        cur_vma = NULL;
2048
2049        list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
2050                if (!down_write_trylock(&cur_mem->memory_lock)) {
2051                        ret = -EBUSY;
2052                        goto err_undo;
2053                }
2054                mutex_unlock(&cur_mem->vma_lock);
2055        }
2056        cur_mem = NULL;
2057
2058        ret = pci_reset_bus(pdev);
2059
2060err_undo:
2061        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2062                if (cur == cur_mem)
2063                        is_mem = false;
2064                if (cur == cur_vma)
2065                        break;
2066                if (is_mem)
2067                        up_write(&cur->memory_lock);
2068                else
2069                        mutex_unlock(&cur->vma_lock);
2070        }
2071err_unlock:
2072        mutex_unlock(&dev_set->lock);
2073        return ret;
2074}
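
/*
 * Illustrative userspace sketch (not built here): the group ownership check
 * above corresponds to the group fds userspace supplies with
 * VFIO_DEVICE_PCI_HOT_RESET.  "device_fd"/"group_fd" are hypothetical; a real
 * caller sizes the fd array from VFIO_DEVICE_GET_PCI_HOT_RESET_INFO.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int example_hot_reset(int device_fd, int group_fd)
{
        char buf[sizeof(struct vfio_pci_hot_reset) + sizeof(__s32)];
        struct vfio_pci_hot_reset *reset = (struct vfio_pci_hot_reset *)buf;

        reset->argsz = sizeof(buf);
        reset->flags = 0;
        reset->count = 1;
        reset->group_fds[0] = group_fd;

        return ioctl(device_fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
}
#endif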
2075
2076static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
2077{
2078        struct vfio_pci_core_device *cur;
2079        bool needs_reset = false;
2080
2081        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2082                /* No VFIO device in the set can have an open device FD */
2083                if (cur->vdev.open_count)
2084                        return false;
2085                needs_reset |= cur->needs_reset;
2086        }
2087        return needs_reset;
2088}
2089
2090/*
2091 * If a bus or slot reset is available for the provided dev_set and:
2092 *  - All of the devices affected by that bus or slot reset are unused
2093 *  - At least one of the affected devices is marked dirty via
2094 *    needs_reset (such as by lack of FLR support)
2095 * Then attempt to perform that bus or slot reset.
2096 * Returns true if the dev_set was reset.
2097 */
2098static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
2099{
2100        struct vfio_pci_core_device *cur;
2101        struct pci_dev *pdev;
2102        int ret;
2103
2104        if (!vfio_pci_dev_set_needs_reset(dev_set))
2105                return false;
2106
2107        pdev = vfio_pci_dev_set_resettable(dev_set);
2108        if (!pdev)
2109                return false;
2110
2111        ret = pci_reset_bus(pdev);
2112        if (ret)
2113                return false;
2114
2115        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2116                cur->needs_reset = false;
2117                if (!disable_idle_d3)
2118                        vfio_pci_set_power_state(cur, PCI_D3hot);
2119        }
2120        return true;
2121}
2122
2123void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
2124                              bool is_disable_idle_d3)
2125{
2126        nointxmask = is_nointxmask;
2127        disable_vga = is_disable_vga;
2128        disable_idle_d3 = is_disable_idle_d3;
2129}
2130EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
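
/*
 * Illustrative sketch (not built): a driver layered on this core forwards its
 * own module parameters once at init time, before registering its pci_driver.
 * The three parameter variables and "example_pci_driver" are hypothetical.
 */
#if 0
static int __init example_driver_init(void)
{
        vfio_pci_core_set_params(example_nointxmask, example_disable_vga,
                                 example_disable_idle_d3);
        return pci_register_driver(&example_pci_driver);
}
#endif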
2131
2132static void vfio_pci_core_cleanup(void)
2133{
2134        vfio_pci_uninit_perm_bits();
2135}
2136
2137static int __init vfio_pci_core_init(void)
2138{
2139        /* Allocate shared config space permission data used by all devices */
2140        return vfio_pci_init_perm_bits();
2141}
2142
2143module_init(vfio_pci_core_init);
2144module_exit(vfio_pci_core_cleanup);
2145
2146MODULE_LICENSE("GPL v2");
2147MODULE_AUTHOR(DRIVER_AUTHOR);
2148MODULE_DESCRIPTION(DRIVER_DESC);
2149