linux/drivers/vfio/pci/vfio_pci.c
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO PCI - User Level meta-driver"

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
                  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");

static DEFINE_MUTEX(driver_lock);

static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);

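/*
 * Enable the device on first open: reset it to a known state, stash a
 * pristine copy of config space to restore at release, probe PCI 2.3
 * INTx masking support, and locate the MSI-X table so mmaps can be
 * kept away from it.
 */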
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        int ret;
        u16 cmd;
        u8 msix_pos;

        /* Don't allow our initial saved state to include busmaster */
        pci_clear_master(pdev);

        ret = pci_enable_device(pdev);
        if (ret)
                return ret;

        vdev->reset_works = (pci_reset_function(pdev) == 0);
        pci_save_state(pdev);
        vdev->pci_saved_state = pci_store_saved_state(pdev);
        if (!vdev->pci_saved_state)
                pr_debug("%s: Couldn't store %s saved state\n",
                         __func__, dev_name(&pdev->dev));

        ret = vfio_config_init(vdev);
        if (ret) {
                kfree(vdev->pci_saved_state);
                vdev->pci_saved_state = NULL;
                pci_disable_device(pdev);
                return ret;
        }

        if (likely(!nointxmask))
                vdev->pci_2_3 = pci_intx_mask_supported(pdev);

        pci_read_config_word(pdev, PCI_COMMAND, &cmd);
        if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
                cmd &= ~PCI_COMMAND_INTX_DISABLE;
                pci_write_config_word(pdev, PCI_COMMAND, cmd);
        }

        msix_pos = pdev->msix_cap;
        if (msix_pos) {
                u16 flags;
                u32 table;

                pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
                pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

                vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
                vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
                vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
        } else
                vdev->msix_bar = 0xFF;

#ifdef CONFIG_VFIO_PCI_VGA
        if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
                vdev->has_vga = true;
#endif

        return 0;
}

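/*
 * Undo vfio_pci_enable() on final release: quiesce DMA and user
 * interrupts, restore the saved config space, and reset the function
 * if possible so the next user starts from a clean device.
 */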
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        int bar;

        /* Stop the device from further DMA */
        pci_clear_master(pdev);

        vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
                                VFIO_IRQ_SET_ACTION_TRIGGER,
                                vdev->irq_type, 0, 0, NULL);

        vdev->virq_disabled = false;

        vfio_config_free(vdev);

        for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
                if (!vdev->barmap[bar])
                        continue;
                pci_iounmap(pdev, vdev->barmap[bar]);
                pci_release_selected_regions(pdev, 1 << bar);
                vdev->barmap[bar] = NULL;
        }

        vdev->needs_reset = true;

        /*
         * If we have saved state, restore it.  If we can reset the device,
         * even better.  Resetting with current state seems better than
         * nothing, but saving and restoring current state without reset
         * is just busy work.
         */
        if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
                pr_info("%s: Couldn't reload %s saved state\n",
                        __func__, dev_name(&pdev->dev));

                if (!vdev->reset_works)
                        goto out;

                pci_save_state(pdev);
        }

        /*
         * Disable INTx and MSI, presumably to avoid spurious interrupts
         * during reset.  Stolen from pci_reset_function()
         */
        pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

        /*
         * Try to reset the device.  The success of this is dependent on
         * being able to lock the device, which is not always possible.
         */
        if (vdev->reset_works) {
                int ret = pci_try_reset_function(pdev);
                if (ret)
                        pr_warn("%s: Failed to reset device %s (%d)\n",
                                __func__, dev_name(&pdev->dev), ret);
                else
                        vdev->needs_reset = false;
        }

        pci_restore_state(pdev);
out:
        pci_disable_device(pdev);

        vfio_pci_try_bus_reset(vdev);
}

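/* Drop a user reference; the device is torn down on the final release */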
static void vfio_pci_release(void *device_data)
{
        struct vfio_pci_device *vdev = device_data;

        mutex_lock(&driver_lock);

        if (!(--vdev->refcnt)) {
                vfio_spapr_pci_eeh_release(vdev->pdev);
                vfio_pci_disable(vdev);
        }

        mutex_unlock(&driver_lock);

        module_put(THIS_MODULE);
}

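/* First open enables the device; subsequent opens just add a reference */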
static int vfio_pci_open(void *device_data)
{
        struct vfio_pci_device *vdev = device_data;
        int ret = 0;

        if (!try_module_get(THIS_MODULE))
                return -ENODEV;

        mutex_lock(&driver_lock);

        if (!vdev->refcnt) {
                ret = vfio_pci_enable(vdev);
                if (ret)
                        goto error;

                vfio_spapr_pci_eeh_open(vdev->pdev);
        }
        vdev->refcnt++;
error:
        mutex_unlock(&driver_lock);
        if (ret)
                module_put(THIS_MODULE);
        return ret;
}

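/*
 * Number of interrupts available for a given VFIO IRQ index: 1 for INTx
 * (if a pin is routed), the vector count advertised by the MSI/MSI-X
 * capability, or 1 for the error IRQ on PCIe devices.
 */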
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
        if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
                u8 pin;
                pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
                if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin)
                        return 1;

        } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
                u8 pos;
                u16 flags;

                pos = vdev->pdev->msi_cap;
                if (pos) {
                        pci_read_config_word(vdev->pdev,
                                             pos + PCI_MSI_FLAGS, &flags);
                        return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
                }
        } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
                u8 pos;
                u16 flags;

                pos = vdev->pdev->msix_cap;
                if (pos) {
                        pci_read_config_word(vdev->pdev,
                                             pos + PCI_MSIX_FLAGS, &flags);

                        return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
                }
        } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
                if (pci_is_pcie(vdev->pdev))
                        return 1;

        return 0;
}

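/* Callback for vfio_pci_for_each_slot_or_bus(): count affected devices */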
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
        (*(int *)data)++;
        return 0;
}

struct vfio_pci_fill_info {
        int max;
        int cur;
        struct vfio_pci_dependent_device *devices;
};

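/*
 * Callback for vfio_pci_for_each_slot_or_bus(): record the IOMMU group ID
 * and segment/bus/devfn address of each device affected by a hot reset.
 */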
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
        struct vfio_pci_fill_info *fill = data;
        struct iommu_group *iommu_group;

        if (fill->cur == fill->max)
                return -EAGAIN; /* Something changed, try again */

        iommu_group = iommu_group_get(&pdev->dev);
        if (!iommu_group)
                return -EPERM; /* Cannot reset non-isolated devices */

        fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
        fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
        fill->devices[fill->cur].bus = pdev->bus->number;
        fill->devices[fill->cur].devfn = pdev->devfn;
        fill->cur++;
        iommu_group_put(iommu_group);
        return 0;
}

struct vfio_pci_group_entry {
        struct vfio_group *group;
        int id;
};

struct vfio_pci_group_info {
        int count;
        struct vfio_pci_group_entry *groups;
};

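/*
 * Callback for vfio_pci_for_each_slot_or_bus(): fail unless the device's
 * IOMMU group is among the groups the user supplied for the hot reset.
 */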
static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
        struct vfio_pci_group_info *info = data;
        struct iommu_group *group;
        int id, i;

        group = iommu_group_get(&pdev->dev);
        if (!group)
                return -EPERM;

        id = iommu_group_id(group);

        for (i = 0; i < info->count; i++)
                if (info->groups[i].id == id)
                        break;

        iommu_group_put(group);

        return (i == info->count) ? -EINVAL : 0;
}

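/*
 * Ascend from @pdev toward the root; once on the slot's bus, the device
 * is affected by a slot reset iff it resides in that slot.
 */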
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
        for (; pdev; pdev = pdev->bus->self)
                if (pdev->bus == slot->bus)
                        return (pdev->slot == slot);
        return false;
}

struct vfio_pci_walk_info {
        int (*fn)(struct pci_dev *, void *data);
        void *data;
        struct pci_dev *pdev;
        bool slot;
        int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
        struct vfio_pci_walk_info *walk = data;

        if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
                walk->ret = walk->fn(pdev, walk->data);

        return walk->ret;
}

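/*
 * Apply @fn to every device affected by a reset of @pdev's bus, or only
 * to those at or below its slot when @slot is set; stop the walk on the
 * first non-zero return.
 */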
static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
                                         int (*fn)(struct pci_dev *,
                                                   void *data), void *data,
                                         bool slot)
{
        struct vfio_pci_walk_info walk = {
                .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
        };

        pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

        return walk.ret;
}

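/*
 * Device ioctls: report device, region, and interrupt capabilities,
 * configure interrupt eventfds, and perform function or bus/slot hot
 * resets on behalf of the user.
 */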
static long vfio_pci_ioctl(void *device_data,
                           unsigned int cmd, unsigned long arg)
{
        struct vfio_pci_device *vdev = device_data;
        unsigned long minsz;

        if (cmd == VFIO_DEVICE_GET_INFO) {
                struct vfio_device_info info;

                minsz = offsetofend(struct vfio_device_info, num_irqs);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                info.flags = VFIO_DEVICE_FLAGS_PCI;

                if (vdev->reset_works)
                        info.flags |= VFIO_DEVICE_FLAGS_RESET;

                info.num_regions = VFIO_PCI_NUM_REGIONS;
                info.num_irqs = VFIO_PCI_NUM_IRQS;

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;

        } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
                struct pci_dev *pdev = vdev->pdev;
                struct vfio_region_info info;

                minsz = offsetofend(struct vfio_region_info, offset);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                switch (info.index) {
                case VFIO_PCI_CONFIG_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = pdev->cfg_size;
                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;
                        break;
                case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = pci_resource_len(pdev, info.index);
                        if (!info.size) {
                                info.flags = 0;
                                break;
                        }

                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;
                        if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) &&
                            pci_resource_flags(pdev, info.index) &
                            IORESOURCE_MEM && info.size >= PAGE_SIZE)
                                info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
                        break;
                case VFIO_PCI_ROM_REGION_INDEX:
                {
                        void __iomem *io;
                        size_t size;

                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.flags = 0;

                        /* Report the BAR size, not the ROM size */
                        info.size = pci_resource_len(pdev, info.index);
                        if (!info.size)
                                break;

                        /* Is it really there? */
                        io = pci_map_rom(pdev, &size);
                        if (!io || !size) {
                                info.size = 0;
                                break;
                        }
                        pci_unmap_rom(pdev, io);

                        info.flags = VFIO_REGION_INFO_FLAG_READ;
                        break;
                }
                case VFIO_PCI_VGA_REGION_INDEX:
                        if (!vdev->has_vga)
                                return -EINVAL;

                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = 0xc0000;
                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;

                        break;
                default:
                        return -EINVAL;
                }

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;

        } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
                struct vfio_irq_info info;

                minsz = offsetofend(struct vfio_irq_info, count);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
                        return -EINVAL;

                switch (info.index) {
                case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
                        break;
                case VFIO_PCI_ERR_IRQ_INDEX:
                        if (pci_is_pcie(vdev->pdev))
                                break;
                /* pass thru to return error */
                default:
                        return -EINVAL;
                }

                info.flags = VFIO_IRQ_INFO_EVENTFD;

                info.count = vfio_pci_get_irq_count(vdev, info.index);

                if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
                        info.flags |= (VFIO_IRQ_INFO_MASKABLE |
                                       VFIO_IRQ_INFO_AUTOMASKED);
                else
                        info.flags |= VFIO_IRQ_INFO_NORESIZE;

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;

        } else if (cmd == VFIO_DEVICE_SET_IRQS) {
                struct vfio_irq_set hdr;
                u8 *data = NULL;
                int ret = 0;

                minsz = offsetofend(struct vfio_irq_set, count);

                if (copy_from_user(&hdr, (void __user *)arg, minsz))
                        return -EFAULT;

                if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
                    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
                                  VFIO_IRQ_SET_ACTION_TYPE_MASK))
                        return -EINVAL;

                if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
                        size_t size;
                        int max = vfio_pci_get_irq_count(vdev, hdr.index);

                        if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
                                size = sizeof(uint8_t);
                        else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
                                size = sizeof(int32_t);
                        else
                                return -EINVAL;

                        if (hdr.argsz - minsz < hdr.count * size ||
                            hdr.start >= max || hdr.start + hdr.count > max)
                                return -EINVAL;

                        data = memdup_user((void __user *)(arg + minsz),
                                           hdr.count * size);
                        if (IS_ERR(data))
                                return PTR_ERR(data);
                }

                mutex_lock(&vdev->igate);

                ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
                                              hdr.start, hdr.count, data);

                mutex_unlock(&vdev->igate);
                kfree(data);

                return ret;

        } else if (cmd == VFIO_DEVICE_RESET) {
                return vdev->reset_works ?
                        pci_try_reset_function(vdev->pdev) : -EINVAL;

        } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
                struct vfio_pci_hot_reset_info hdr;
                struct vfio_pci_fill_info fill = { 0 };
                struct vfio_pci_dependent_device *devices = NULL;
                bool slot = false;
                int ret = 0;

                minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

                if (copy_from_user(&hdr, (void __user *)arg, minsz))
                        return -EFAULT;

                if (hdr.argsz < minsz)
                        return -EINVAL;

                hdr.flags = 0;

                /* Can we do a slot or bus reset or neither? */
                if (!pci_probe_reset_slot(vdev->pdev->slot))
                        slot = true;
                else if (pci_probe_reset_bus(vdev->pdev->bus))
                        return -ENODEV;

                /* How many devices are affected? */
                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_count_devs,
                                                    &fill.max, slot);
                if (ret)
                        return ret;

                WARN_ON(!fill.max); /* Should always be at least one */

                /*
                 * If there's enough space, fill it now, otherwise return
                 * -ENOSPC and the number of devices affected.
                 */
                if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
                        ret = -ENOSPC;
                        hdr.count = fill.max;
                        goto reset_info_exit;
                }

                devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
                if (!devices)
                        return -ENOMEM;

                fill.devices = devices;

                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_fill_devs,
                                                    &fill, slot);

                /*
                 * If a device was removed between counting and filling,
                 * we may come up short of fill.max.  If a device was
                 * added, we'll have a return of -EAGAIN above.
                 */
                if (!ret)
                        hdr.count = fill.cur;

reset_info_exit:
                if (copy_to_user((void __user *)arg, &hdr, minsz))
                        ret = -EFAULT;

                if (!ret) {
                        if (copy_to_user((void __user *)(arg + minsz), devices,
                                         hdr.count * sizeof(*devices)))
                                ret = -EFAULT;
                }

                kfree(devices);
                return ret;

        } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
                struct vfio_pci_hot_reset hdr;
                int32_t *group_fds;
                struct vfio_pci_group_entry *groups;
                struct vfio_pci_group_info info;
                bool slot = false;
                int i, count = 0, ret = 0;

                minsz = offsetofend(struct vfio_pci_hot_reset, count);

                if (copy_from_user(&hdr, (void __user *)arg, minsz))
                        return -EFAULT;

                if (hdr.argsz < minsz || hdr.flags)
                        return -EINVAL;

                /* Can we do a slot or bus reset or neither? */
                if (!pci_probe_reset_slot(vdev->pdev->slot))
                        slot = true;
                else if (pci_probe_reset_bus(vdev->pdev->bus))
                        return -ENODEV;

                /*
                 * We can't let userspace give us an arbitrarily large
                 * buffer to copy, so verify how many we think there
                 * could be.  Note groups can have multiple devices so
                 * one group per device is the max.
                 */
                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_count_devs,
                                                    &count, slot);
                if (ret)
                        return ret;

                /* Somewhere between 1 and count is OK */
                if (!hdr.count || hdr.count > count)
                        return -EINVAL;

                group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
                groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
                if (!group_fds || !groups) {
                        kfree(group_fds);
                        kfree(groups);
                        return -ENOMEM;
                }

                if (copy_from_user(group_fds, (void __user *)(arg + minsz),
                                   hdr.count * sizeof(*group_fds))) {
                        kfree(group_fds);
                        kfree(groups);
                        return -EFAULT;
                }

                /*
                 * For each group_fd, get the group through the vfio external
                 * user interface and store the group and iommu ID.  This
                 * ensures the group is held across the reset.
                 */
                for (i = 0; i < hdr.count; i++) {
                        struct vfio_group *group;
                        struct fd f = fdget(group_fds[i]);
                        if (!f.file) {
                                ret = -EBADF;
                                break;
                        }

                        group = vfio_group_get_external_user(f.file);
                        fdput(f);
                        if (IS_ERR(group)) {
                                ret = PTR_ERR(group);
                                break;
                        }

                        groups[i].group = group;
                        groups[i].id = vfio_external_user_iommu_id(group);
                }

                kfree(group_fds);

                /* release reference to groups on error */
                if (ret)
                        goto hot_reset_release;

                info.count = hdr.count;
                info.groups = groups;

                /*
                 * Test whether all the affected devices are contained
                 * by the set of groups provided by the user.
                 */
                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_validate_devs,
                                                    &info, slot);
                if (!ret)
                        /* User has access, do the reset */
                        ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
                                     pci_try_reset_bus(vdev->pdev->bus);

hot_reset_release:
                for (i--; i >= 0; i--)
                        vfio_group_put_external_user(groups[i].group);

                kfree(groups);
                return ret;
        }

        return -ENOTTY;
}

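/*
 * Illustrative only, not part of the driver: a minimal userspace sketch
 * of the ioctls handled above.  It assumes the caller already holds a
 * VFIO device fd (obtained elsewhere via VFIO_GROUP_GET_DEVICE_FD) and
 * elides all error handling:
 *
 *      struct vfio_device_info info = { .argsz = sizeof(info) };
 *      struct vfio_region_info reg = { .argsz = sizeof(reg) };
 *
 *      ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info);
 *
 *      reg.index = VFIO_PCI_CONFIG_REGION_INDEX;
 *      ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
 *
 *      if (info.flags & VFIO_DEVICE_FLAGS_RESET)
 *              ioctl(device_fd, VFIO_DEVICE_RESET);
 */

/*
 * Read/write dispatch: the VFIO region index is encoded in the high bits
 * of the file offset; hand off to the config, ROM, BAR, or VGA handler.
 */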
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
                           size_t count, loff_t *ppos, bool iswrite)
{
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        struct vfio_pci_device *vdev = device_data;

        if (index >= VFIO_PCI_NUM_REGIONS)
                return -EINVAL;

        switch (index) {
        case VFIO_PCI_CONFIG_REGION_INDEX:
                return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

        case VFIO_PCI_ROM_REGION_INDEX:
                if (iswrite)
                        return -EINVAL;
                return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

        case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
                return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

        case VFIO_PCI_VGA_REGION_INDEX:
                return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
        }

        return -EINVAL;
}

static ssize_t vfio_pci_read(void *device_data, char __user *buf,
                             size_t count, loff_t *ppos)
{
        if (!count)
                return 0;

        return vfio_pci_rw(device_data, buf, count, ppos, false);
}

static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
                              size_t count, loff_t *ppos)
{
        if (!count)
                return 0;

        return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}

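/*
 * mmap a BAR: validate the request against the region size, refuse
 * ranges that overlap the MSI-X table, and map the pages uncached.
 */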
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
        struct vfio_pci_device *vdev = device_data;
        struct pci_dev *pdev = vdev->pdev;
        unsigned int index;
        u64 phys_len, req_len, pgoff, req_start;
        int ret;

        index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

        if (vma->vm_end < vma->vm_start)
                return -EINVAL;
        if ((vma->vm_flags & VM_SHARED) == 0)
                return -EINVAL;
        if (index >= VFIO_PCI_ROM_REGION_INDEX)
                return -EINVAL;
        if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
                return -EINVAL;

        phys_len = pci_resource_len(pdev, index);
        req_len = vma->vm_end - vma->vm_start;
        pgoff = vma->vm_pgoff &
                ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
        req_start = pgoff << PAGE_SHIFT;

        if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
                return -EINVAL;

        if (index == vdev->msix_bar) {
                /*
                 * Disallow mmaps overlapping the MSI-X table; users don't
                 * get to touch this directly.  We could find somewhere
                 * else to map the overlap, but page granularity is only
                 * a recommendation, not a requirement, so the user needs
                 * to know which bits are real.  Requiring them to mmap
                 * around the table makes that clear.
                 */

                /* If neither entirely above nor below, then it overlaps */
                if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
                      req_start + req_len <= vdev->msix_offset))
                        return -EINVAL;
        }

        /*
         * Even though we don't make use of the barmap for the mmap,
         * we need to request the region and the barmap tracks that.
         */
        if (!vdev->barmap[index]) {
                ret = pci_request_selected_regions(pdev,
                                                   1 << index, "vfio-pci");
                if (ret)
                        return ret;

                vdev->barmap[index] = pci_iomap(pdev, index, 0);
        }

        vma->vm_private_data = vdev;
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

        return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                               req_len, vma->vm_page_prot);
}

static const struct vfio_device_ops vfio_pci_ops = {
        .name           = "vfio-pci",
        .open           = vfio_pci_open,
        .release        = vfio_pci_release,
        .ioctl          = vfio_pci_ioctl,
        .read           = vfio_pci_read,
        .write          = vfio_pci_write,
        .mmap           = vfio_pci_mmap,
};

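/*
 * Bind to a device: only ordinary (type 0) functions that belong to an
 * IOMMU group are accepted; allocate per-device state and register it
 * with vfio-core.
 */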
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        struct vfio_pci_device *vdev;
        struct iommu_group *group;
        int ret;

        if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
                return -EINVAL;

        group = iommu_group_get(&pdev->dev);
        if (!group)
                return -EINVAL;

        vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
        if (!vdev) {
                iommu_group_put(group);
                return -ENOMEM;
        }

        vdev->pdev = pdev;
        vdev->irq_type = VFIO_PCI_NUM_IRQS;
        mutex_init(&vdev->igate);
        spin_lock_init(&vdev->irqlock);

        ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
        if (ret) {
                iommu_group_put(group);
                kfree(vdev);
        }

        return ret;
}

static void vfio_pci_remove(struct pci_dev *pdev)
{
        struct vfio_pci_device *vdev;

        vdev = vfio_del_group_dev(&pdev->dev);
        if (vdev) {
                iommu_group_put(pdev->dev.iommu_group);
                kfree(vdev);
        }
}

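/*
 * AER error_detected callback: notify userspace through the err_trigger
 * eventfd, if the user registered one via VFIO_DEVICE_SET_IRQS.
 */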
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
                                                  pci_channel_state_t state)
{
        struct vfio_pci_device *vdev;
        struct vfio_device *device;

        device = vfio_device_get_from_dev(&pdev->dev);
        if (device == NULL)
                return PCI_ERS_RESULT_DISCONNECT;

        vdev = vfio_device_data(device);
        if (vdev == NULL) {
                vfio_device_put(device);
                return PCI_ERS_RESULT_DISCONNECT;
        }

        mutex_lock(&vdev->igate);

        if (vdev->err_trigger)
                eventfd_signal(vdev->err_trigger, 1);

        mutex_unlock(&vdev->igate);

        vfio_device_put(device);

        return PCI_ERS_RESULT_CAN_RECOVER;
}

static struct pci_error_handlers vfio_err_handlers = {
        .error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
        .name           = "vfio-pci",
        .id_table       = NULL, /* only dynamic ids */
        .probe          = vfio_pci_probe,
        .remove         = vfio_pci_remove,
        .err_handler    = &vfio_err_handlers,
};

struct vfio_devices {
        struct vfio_device **devices;
        int cur_index;
        int max_index;
};

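/*
 * Callback for vfio_pci_for_each_slot_or_bus(): take and hold a
 * vfio_device reference on each affected device, failing if any of them
 * is not bound to vfio-pci.
 */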
static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
{
        struct vfio_devices *devs = data;
        struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);

        if (pci_drv != &vfio_pci_driver)
                return -EBUSY;

        if (devs->cur_index == devs->max_index)
                return -ENOSPC;

        devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev);
        if (!devs->devices[devs->cur_index])
                return -EINVAL;

        devs->cur_index++;
        return 0;
}

/*
 * Attempt to do a bus/slot reset if there are devices affected by a reset for
 * this device that are needs_reset and all of the affected devices are unused
 * (!refcnt).  Callers are required to hold driver_lock when calling this to
 * prevent device opens and concurrent bus reset attempts.  We prevent device
 * unbinds by acquiring and holding a reference to the vfio_device.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport.  Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
        struct vfio_devices devs = { .cur_index = 0 };
        int i = 0, ret = -EINVAL;
        bool needs_reset = false, slot = false;
        struct vfio_pci_device *tmp;

        if (!pci_probe_reset_slot(vdev->pdev->slot))
                slot = true;
        else if (pci_probe_reset_bus(vdev->pdev->bus))
                return;

        if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
                                          &i, slot) || !i)
                return;

        devs.max_index = i;
        devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
        if (!devs.devices)
                return;

        if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                          vfio_pci_get_devs, &devs, slot))
                goto put_devs;

        for (i = 0; i < devs.cur_index; i++) {
                tmp = vfio_device_data(devs.devices[i]);
                if (tmp->needs_reset)
                        needs_reset = true;
                if (tmp->refcnt)
                        goto put_devs;
        }

        if (needs_reset)
                ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
                             pci_try_reset_bus(vdev->pdev->bus);

put_devs:
        for (i = 0; i < devs.cur_index; i++) {
                if (!ret) {
                        tmp = vfio_device_data(devs.devices[i]);
                        tmp->needs_reset = false;
                }
                vfio_device_put(devs.devices[i]);
        }

        kfree(devs.devices);
}

static void __exit vfio_pci_cleanup(void)
{
        pci_unregister_driver(&vfio_pci_driver);
        vfio_pci_virqfd_exit();
        vfio_pci_uninit_perm_bits();
}

static int __init vfio_pci_init(void)
{
        int ret;

        /* Allocate shared config space permission data used by all devices */
        ret = vfio_pci_init_perm_bits();
        if (ret)
                return ret;

        /* Start the virqfd cleanup handler */
        ret = vfio_pci_virqfd_init();
        if (ret)
                goto out_virqfd;

        /* Register and scan for devices */
        ret = pci_register_driver(&vfio_pci_driver);
        if (ret)
                goto out_driver;

        return 0;

out_driver:
        vfio_pci_virqfd_exit();
out_virqfd:
        vfio_pci_uninit_perm_bits();
        return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);