linux/drivers/vfio/pci/vfio_pci.c
/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO PCI - User Level meta-driver"

static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
                  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");

static DEFINE_MUTEX(driver_lock);

static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);

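/*
 * Enable the device for user access on first open: reset it to a known
 * state, stash a pristine saved state to restore at disable time, set up
 * virtual config space, and cache the MSI-X table location for the mmap
 * exclusion check in vfio_pci_mmap().
 */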
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        int ret;
        u16 cmd;
        u8 msix_pos;

        /* Don't allow our initial saved state to include busmaster */
        pci_clear_master(pdev);

        ret = pci_enable_device(pdev);
        if (ret)
                return ret;

        vdev->reset_works = (pci_reset_function(pdev) == 0);
        pci_save_state(pdev);
        vdev->pci_saved_state = pci_store_saved_state(pdev);
        if (!vdev->pci_saved_state)
                pr_debug("%s: Couldn't store %s saved state\n",
                         __func__, dev_name(&pdev->dev));

        ret = vfio_config_init(vdev);
        if (ret) {
                kfree(vdev->pci_saved_state);
                vdev->pci_saved_state = NULL;
                pci_disable_device(pdev);
                return ret;
        }

        if (likely(!nointxmask))
                vdev->pci_2_3 = pci_intx_mask_supported(pdev);

        pci_read_config_word(pdev, PCI_COMMAND, &cmd);
        if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
                cmd &= ~PCI_COMMAND_INTX_DISABLE;
                pci_write_config_word(pdev, PCI_COMMAND, cmd);
        }

        msix_pos = pdev->msix_cap;
        if (msix_pos) {
                u16 flags;
                u32 table;

                pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
                pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

                vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
                vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
                vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
        } else
                vdev->msix_bar = 0xFF;

#ifdef CONFIG_VFIO_PCI_VGA
        if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
                vdev->has_vga = true;
#endif

        return 0;
}

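/*
 * Undo vfio_pci_enable() on final close: tear down user interrupt state,
 * unmap and release BARs, then restore or reset the device so the next
 * user (or a host driver) starts from a clean state.
 */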
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        int bar;

        /* Stop the device from further DMA */
        pci_clear_master(pdev);

        vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
                                VFIO_IRQ_SET_ACTION_TRIGGER,
                                vdev->irq_type, 0, 0, NULL);

        vdev->virq_disabled = false;

        vfio_config_free(vdev);

        for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
                if (!vdev->barmap[bar])
                        continue;
                pci_iounmap(pdev, vdev->barmap[bar]);
                pci_release_selected_regions(pdev, 1 << bar);
                vdev->barmap[bar] = NULL;
        }

        vdev->needs_reset = true;

        /*
         * If we have saved state, restore it.  If we can reset the device,
         * even better.  Resetting with current state seems better than
         * nothing, but saving and restoring current state without reset
         * is just busy work.
         */
        if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
                pr_info("%s: Couldn't reload %s saved state\n",
                        __func__, dev_name(&pdev->dev));

                if (!vdev->reset_works)
                        goto out;

                pci_save_state(pdev);
        }

        /*
         * Disable INTx and MSI, presumably to avoid spurious interrupts
         * during reset.  Stolen from pci_reset_function()
         */
        pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

        /*
         * Try to reset the device.  The success of this is dependent on
         * being able to lock the device, which is not always possible.
         */
        if (vdev->reset_works) {
                int ret = pci_try_reset_function(pdev);
                if (ret)
                        pr_warn("%s: Failed to reset device %s (%d)\n",
                                __func__, dev_name(&pdev->dev), ret);
                else
                        vdev->needs_reset = false;
        }

        pci_restore_state(pdev);
out:
        pci_disable_device(pdev);

        vfio_pci_try_bus_reset(vdev);
}

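/*
 * Device file release: on the last reference, let EEH support clean up
 * and disable the device.  Serialized against open via driver_lock.
 */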
static void vfio_pci_release(void *device_data)
{
        struct vfio_pci_device *vdev = device_data;

        mutex_lock(&driver_lock);

        if (!(--vdev->refcnt)) {
                vfio_spapr_pci_eeh_release(vdev->pdev);
                vfio_pci_disable(vdev);
        }

        mutex_unlock(&driver_lock);

        module_put(THIS_MODULE);
}

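/*
 * Device file open: the first open enables the physical device; later
 * opens only bump the reference count.  Serialized via driver_lock.
 */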
static int vfio_pci_open(void *device_data)
{
        struct vfio_pci_device *vdev = device_data;
        int ret = 0;

        if (!try_module_get(THIS_MODULE))
                return -ENODEV;

        mutex_lock(&driver_lock);

        if (!vdev->refcnt) {
                ret = vfio_pci_enable(vdev);
                if (ret)
                        goto error;

                vfio_spapr_pci_eeh_open(vdev->pdev);
        }
        vdev->refcnt++;
error:
        mutex_unlock(&driver_lock);
        if (ret)
                module_put(THIS_MODULE);
        return ret;
}

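/*
 * Return how many interrupts of the given VFIO IRQ type the device
 * supports: 1 for INTx if an interrupt pin is routed, the vector counts
 * read from the MSI/MSI-X capabilities, or 1 for the error IRQ on PCIe.
 */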
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
        if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
                u8 pin;
                pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
                if (pin)
                        return 1;

        } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
                u8 pos;
                u16 flags;

                pos = vdev->pdev->msi_cap;
                if (pos) {
                        pci_read_config_word(vdev->pdev,
                                             pos + PCI_MSI_FLAGS, &flags);
                        return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
                }
        } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
                u8 pos;
                u16 flags;

                pos = vdev->pdev->msix_cap;
                if (pos) {
                        pci_read_config_word(vdev->pdev,
                                             pos + PCI_MSIX_FLAGS, &flags);

                        return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
                }
        } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
                if (pci_is_pcie(vdev->pdev))
                        return 1;

        return 0;
}

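/* pci_walk_bus() callback: count the devices a bus/slot reset would affect */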
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
        (*(int *)data)++;
        return 0;
}

struct vfio_pci_fill_info {
        int max;
        int cur;
        struct vfio_pci_dependent_device *devices;
};

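/*
 * pci_walk_bus() callback: record the IOMMU group ID and address of each
 * affected device for VFIO_DEVICE_GET_PCI_HOT_RESET_INFO.  Fails with
 * -EAGAIN if more devices appeared since they were counted, or -EPERM if
 * a device is not isolated by the IOMMU.
 */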
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
        struct vfio_pci_fill_info *fill = data;
        struct iommu_group *iommu_group;

        if (fill->cur == fill->max)
                return -EAGAIN; /* Something changed, try again */

        iommu_group = iommu_group_get(&pdev->dev);
        if (!iommu_group)
                return -EPERM; /* Cannot reset non-isolated devices */

        fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
        fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
        fill->devices[fill->cur].bus = pdev->bus->number;
        fill->devices[fill->cur].devfn = pdev->devfn;
        fill->cur++;
        iommu_group_put(iommu_group);
        return 0;
}

struct vfio_pci_group_entry {
        struct vfio_group *group;
        int id;
};

struct vfio_pci_group_info {
        int count;
        struct vfio_pci_group_entry *groups;
};

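/*
 * pci_walk_bus() callback: verify that the device's IOMMU group is in the
 * set of groups supplied by the user, i.e. that the user actually owns
 * every device the hot reset would affect.
 */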
static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
        struct vfio_pci_group_info *info = data;
        struct iommu_group *group;
        int id, i;

        group = iommu_group_get(&pdev->dev);
        if (!group)
                return -EPERM;

        id = iommu_group_id(group);

        for (i = 0; i < info->count; i++)
                if (info->groups[i].id == id)
                        break;

        iommu_group_put(group);

        return (i == info->count) ? -EINVAL : 0;
}

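/*
 * Test whether a device sits below the given slot, walking up through
 * bridges until we reach a device on the slot's bus.
 */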
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
        for (; pdev; pdev = pdev->bus->self)
                if (pdev->bus == slot->bus)
                        return (pdev->slot == slot);
        return false;
}

struct vfio_pci_walk_info {
        int (*fn)(struct pci_dev *, void *data);
        void *data;
        struct pci_dev *pdev;
        bool slot;
        int ret;
};

static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
        struct vfio_pci_walk_info *walk = data;

        if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
                walk->ret = walk->fn(pdev, walk->data);

        return walk->ret;
}

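/*
 * Run fn() on every device affected by a reset of pdev's bus; when slot
 * is true, restrict the walk to devices below pdev's slot.
 */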
static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
                                         int (*fn)(struct pci_dev *,
                                                   void *data), void *data,
                                         bool slot)
{
        struct vfio_pci_walk_info walk = {
                .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
        };

        pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);

        return walk.ret;
}

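/*
 * Device ioctls: report device/region/IRQ info, configure user interrupt
 * signaling, and perform function or bus/slot hot resets.  Each user
 * struct is copied in up to the minimal size the ioctl requires; argsz
 * allows the ABI to grow compatibly.
 */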
static long vfio_pci_ioctl(void *device_data,
                           unsigned int cmd, unsigned long arg)
{
        struct vfio_pci_device *vdev = device_data;
        unsigned long minsz;

        if (cmd == VFIO_DEVICE_GET_INFO) {
                struct vfio_device_info info;

                minsz = offsetofend(struct vfio_device_info, num_irqs);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                info.flags = VFIO_DEVICE_FLAGS_PCI;

                if (vdev->reset_works)
                        info.flags |= VFIO_DEVICE_FLAGS_RESET;

                info.num_regions = VFIO_PCI_NUM_REGIONS;
                info.num_irqs = VFIO_PCI_NUM_IRQS;

                /*
                 * copy_to_user() returns the number of bytes not copied,
                 * not an errno, so convert to -EFAULT explicitly.
                 */
                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;

        } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
                struct pci_dev *pdev = vdev->pdev;
                struct vfio_region_info info;

                minsz = offsetofend(struct vfio_region_info, offset);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                switch (info.index) {
                case VFIO_PCI_CONFIG_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = pdev->cfg_size;
                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;
                        break;
                case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = pci_resource_len(pdev, info.index);
                        if (!info.size) {
                                info.flags = 0;
                                break;
                        }

                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;
                        if (pci_resource_flags(pdev, info.index) &
                            IORESOURCE_MEM && info.size >= PAGE_SIZE)
                                info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
                        break;
                case VFIO_PCI_ROM_REGION_INDEX:
                {
                        void __iomem *io;
                        size_t size;

                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.flags = 0;

                        /* Report the BAR size, not the ROM size */
                        info.size = pci_resource_len(pdev, info.index);
                        if (!info.size)
                                break;

                        /* Is it really there? */
                        io = pci_map_rom(pdev, &size);
                        if (!io || !size) {
                                info.size = 0;
                                break;
                        }
                        pci_unmap_rom(pdev, io);

                        info.flags = VFIO_REGION_INFO_FLAG_READ;
                        break;
                }
                case VFIO_PCI_VGA_REGION_INDEX:
                        if (!vdev->has_vga)
                                return -EINVAL;

                        info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
                        info.size = 0xc0000;
                        info.flags = VFIO_REGION_INFO_FLAG_READ |
                                     VFIO_REGION_INFO_FLAG_WRITE;

                        break;
                default:
                        return -EINVAL;
                }

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;

        } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
                struct vfio_irq_info info;

                minsz = offsetofend(struct vfio_irq_info, count);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
                        return -EINVAL;

                switch (info.index) {
                case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
                        break;
                case VFIO_PCI_ERR_IRQ_INDEX:
                        if (pci_is_pcie(vdev->pdev))
                                break;
                /* pass thru to return error */
                default:
                        return -EINVAL;
                }

                info.flags = VFIO_IRQ_INFO_EVENTFD;

                info.count = vfio_pci_get_irq_count(vdev, info.index);

                if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
                        info.flags |= (VFIO_IRQ_INFO_MASKABLE |
                                       VFIO_IRQ_INFO_AUTOMASKED);
                else
                        info.flags |= VFIO_IRQ_INFO_NORESIZE;

                return copy_to_user((void __user *)arg, &info, minsz) ?
                        -EFAULT : 0;

        } else if (cmd == VFIO_DEVICE_SET_IRQS) {
                struct vfio_irq_set hdr;
                u8 *data = NULL;
                int ret = 0;

                minsz = offsetofend(struct vfio_irq_set, count);

                if (copy_from_user(&hdr, (void __user *)arg, minsz))
                        return -EFAULT;

                if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
                    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
                                  VFIO_IRQ_SET_ACTION_TYPE_MASK))
                        return -EINVAL;

                if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
                        size_t size;
                        int max = vfio_pci_get_irq_count(vdev, hdr.index);

                        if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
                                size = sizeof(uint8_t);
                        else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
                                size = sizeof(int32_t);
                        else
                                return -EINVAL;

                        if (hdr.argsz - minsz < hdr.count * size ||
                            hdr.start >= max || hdr.start + hdr.count > max)
                                return -EINVAL;

                        data = memdup_user((void __user *)(arg + minsz),
                                           hdr.count * size);
                        if (IS_ERR(data))
                                return PTR_ERR(data);
                }

                mutex_lock(&vdev->igate);

                ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
                                              hdr.start, hdr.count, data);

                mutex_unlock(&vdev->igate);
                kfree(data);

                return ret;

        } else if (cmd == VFIO_DEVICE_RESET) {
                return vdev->reset_works ?
                        pci_try_reset_function(vdev->pdev) : -EINVAL;

        } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
                struct vfio_pci_hot_reset_info hdr;
                struct vfio_pci_fill_info fill = { 0 };
                struct vfio_pci_dependent_device *devices = NULL;
                bool slot = false;
                int ret = 0;

                minsz = offsetofend(struct vfio_pci_hot_reset_info, count);

                if (copy_from_user(&hdr, (void __user *)arg, minsz))
                        return -EFAULT;

                if (hdr.argsz < minsz)
                        return -EINVAL;

                hdr.flags = 0;

                /* Can we do a slot or bus reset or neither? */
                if (!pci_probe_reset_slot(vdev->pdev->slot))
                        slot = true;
                else if (pci_probe_reset_bus(vdev->pdev->bus))
                        return -ENODEV;

                /* How many devices are affected? */
                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_count_devs,
                                                    &fill.max, slot);
                if (ret)
                        return ret;

                WARN_ON(!fill.max); /* Should always be at least one */

                /*
                 * If there's enough space, fill it now, otherwise return
                 * -ENOSPC and the number of devices affected.
                 */
                if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
                        ret = -ENOSPC;
                        hdr.count = fill.max;
                        goto reset_info_exit;
                }

                devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
                if (!devices)
                        return -ENOMEM;

                fill.devices = devices;

                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_fill_devs,
                                                    &fill, slot);

                /*
                 * If a device was removed between counting and filling,
                 * we may come up short of fill.max.  If a device was
                 * added, we'll have a return of -EAGAIN above.
                 */
                if (!ret)
                        hdr.count = fill.cur;

reset_info_exit:
                if (copy_to_user((void __user *)arg, &hdr, minsz))
                        ret = -EFAULT;

                if (!ret) {
                        if (copy_to_user((void __user *)(arg + minsz), devices,
                                         hdr.count * sizeof(*devices)))
                                ret = -EFAULT;
                }

                kfree(devices);
                return ret;

        } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
                struct vfio_pci_hot_reset hdr;
                int32_t *group_fds;
                struct vfio_pci_group_entry *groups;
                struct vfio_pci_group_info info;
                bool slot = false;
                int i, count = 0, ret = 0;

                minsz = offsetofend(struct vfio_pci_hot_reset, count);

                if (copy_from_user(&hdr, (void __user *)arg, minsz))
                        return -EFAULT;

                if (hdr.argsz < minsz || hdr.flags)
                        return -EINVAL;

                /* Can we do a slot or bus reset or neither? */
                if (!pci_probe_reset_slot(vdev->pdev->slot))
                        slot = true;
                else if (pci_probe_reset_bus(vdev->pdev->bus))
                        return -ENODEV;

                /*
                 * We can't let userspace give us an arbitrarily large
                 * buffer to copy, so verify how many we think there
                 * could be.  Note groups can have multiple devices so
                 * one group per device is the max.
                 */
                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_count_devs,
                                                    &count, slot);
                if (ret)
                        return ret;

                /* Somewhere between 1 and count is OK */
                if (!hdr.count || hdr.count > count)
                        return -EINVAL;

                group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
                groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
                if (!group_fds || !groups) {
                        kfree(group_fds);
                        kfree(groups);
                        return -ENOMEM;
                }

                if (copy_from_user(group_fds, (void __user *)(arg + minsz),
                                   hdr.count * sizeof(*group_fds))) {
                        kfree(group_fds);
                        kfree(groups);
                        return -EFAULT;
                }

                /*
                 * For each group_fd, get the group through the vfio external
                 * user interface and store the group and iommu ID.  This
                 * ensures the group is held across the reset.
                 */
                for (i = 0; i < hdr.count; i++) {
                        struct vfio_group *group;
                        struct fd f = fdget(group_fds[i]);
                        if (!f.file) {
                                ret = -EBADF;
                                break;
                        }

                        group = vfio_group_get_external_user(f.file);
                        fdput(f);
                        if (IS_ERR(group)) {
                                ret = PTR_ERR(group);
                                break;
                        }

                        groups[i].group = group;
                        groups[i].id = vfio_external_user_iommu_id(group);
                }

                kfree(group_fds);

                /* release reference to groups on error */
                if (ret)
                        goto hot_reset_release;

                info.count = hdr.count;
                info.groups = groups;

                /*
                 * Test whether all the affected devices are contained
                 * by the set of groups provided by the user.
                 */
                ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                                    vfio_pci_validate_devs,
                                                    &info, slot);
                if (!ret)
                        /* User has access, do the reset */
                        ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
                                     pci_try_reset_bus(vdev->pdev->bus);

hot_reset_release:
                for (i--; i >= 0; i--)
                        vfio_group_put_external_user(groups[i].group);

                kfree(groups);
                return ret;
        }

        return -ENOTTY;
}

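/*
 * Dispatch read/write to the region handler selected by the region index
 * encoded in the file offset (see VFIO_PCI_OFFSET_TO_INDEX).
 */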
static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
                           size_t count, loff_t *ppos, bool iswrite)
{
        unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
        struct vfio_pci_device *vdev = device_data;

        if (index >= VFIO_PCI_NUM_REGIONS)
                return -EINVAL;

        switch (index) {
        case VFIO_PCI_CONFIG_REGION_INDEX:
                return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);

        case VFIO_PCI_ROM_REGION_INDEX:
                if (iswrite)
                        return -EINVAL;
                return vfio_pci_bar_rw(vdev, buf, count, ppos, false);

        case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
                return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);

        case VFIO_PCI_VGA_REGION_INDEX:
                return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
        }

        return -EINVAL;
}

static ssize_t vfio_pci_read(void *device_data, char __user *buf,
                             size_t count, loff_t *ppos)
{
        if (!count)
                return 0;

        return vfio_pci_rw(device_data, buf, count, ppos, false);
}

static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
                              size_t count, loff_t *ppos)
{
        if (!count)
                return 0;

        return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
}

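/*
 * mmap a BAR into the user's address space.  Only memory BARs of at
 * least PAGE_SIZE may be mapped, and ranges overlapping the MSI-X
 * vector table are refused.
 */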
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
        struct vfio_pci_device *vdev = device_data;
        struct pci_dev *pdev = vdev->pdev;
        unsigned int index;
        u64 phys_len, req_len, pgoff, req_start;
        int ret;

        index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

        if (vma->vm_end < vma->vm_start)
                return -EINVAL;
        if ((vma->vm_flags & VM_SHARED) == 0)
                return -EINVAL;
        if (index >= VFIO_PCI_ROM_REGION_INDEX)
                return -EINVAL;
        if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
                return -EINVAL;

        phys_len = pci_resource_len(pdev, index);
        req_len = vma->vm_end - vma->vm_start;
        pgoff = vma->vm_pgoff &
                ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
        req_start = pgoff << PAGE_SHIFT;

        if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
                return -EINVAL;

        if (index == vdev->msix_bar) {
                /*
                 * Disallow mmaps overlapping the MSI-X table; users don't
                 * get to touch this directly.  We could find somewhere
                 * else to map the overlap, but page granularity is only
                 * a recommendation, not a requirement, so the user needs
                 * to know which bits are real.  Requiring them to mmap
                 * around the table makes that clear.
                 */

                /* If neither entirely above nor below, then it overlaps */
                if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
                      req_start + req_len <= vdev->msix_offset))
                        return -EINVAL;
        }

        /*
         * Even though we don't make use of the barmap for the mmap,
         * we need to request the region and the barmap tracks that.
         */
        if (!vdev->barmap[index]) {
                ret = pci_request_selected_regions(pdev,
                                                   1 << index, "vfio-pci");
                if (ret)
                        return ret;

                vdev->barmap[index] = pci_iomap(pdev, index, 0);
                if (!vdev->barmap[index]) {
                        /* pci_iomap() can fail; don't leave the region held */
                        pci_release_selected_regions(pdev, 1 << index);
                        return -ENOMEM;
                }
        }

        vma->vm_private_data = vdev;
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

        return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                               req_len, vma->vm_page_prot);
}

static const struct vfio_device_ops vfio_pci_ops = {
        .name           = "vfio-pci",
        .open           = vfio_pci_open,
        .release        = vfio_pci_release,
        .ioctl          = vfio_pci_ioctl,
        .read           = vfio_pci_read,
        .write          = vfio_pci_write,
        .mmap           = vfio_pci_mmap,
};

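/*
 * Driver probe: accept only normal (type 0) header functions that belong
 * to an IOMMU group, then register the device with vfio core.
 */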
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        u8 type;
        struct vfio_pci_device *vdev;
        struct iommu_group *group;
        int ret;

        pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
        if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
                return -EINVAL;

        group = iommu_group_get(&pdev->dev);
        if (!group)
                return -EINVAL;

        vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
        if (!vdev) {
                iommu_group_put(group);
                return -ENOMEM;
        }

        vdev->pdev = pdev;
        vdev->irq_type = VFIO_PCI_NUM_IRQS;
        mutex_init(&vdev->igate);
        spin_lock_init(&vdev->irqlock);

        ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
        if (ret) {
                iommu_group_put(group);
                kfree(vdev);
        }

        return ret;
}

static void vfio_pci_remove(struct pci_dev *pdev)
{
        struct vfio_pci_device *vdev;

        vdev = vfio_del_group_dev(&pdev->dev);
        if (vdev) {
                iommu_group_put(pdev->dev.iommu_group);
                kfree(vdev);
        }
}

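/*
 * AER error_detected handler: forward the error to the user via the
 * err_trigger eventfd, if one has been registered.
 */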
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
                                                  pci_channel_state_t state)
{
        struct vfio_pci_device *vdev;
        struct vfio_device *device;

        device = vfio_device_get_from_dev(&pdev->dev);
        if (device == NULL)
                return PCI_ERS_RESULT_DISCONNECT;

        vdev = vfio_device_data(device);
        if (vdev == NULL) {
                vfio_device_put(device);
                return PCI_ERS_RESULT_DISCONNECT;
        }

        mutex_lock(&vdev->igate);

        if (vdev->err_trigger)
                eventfd_signal(vdev->err_trigger, 1);

        mutex_unlock(&vdev->igate);

        vfio_device_put(device);

        return PCI_ERS_RESULT_CAN_RECOVER;
}

static struct pci_error_handlers vfio_err_handlers = {
        .error_detected = vfio_pci_aer_err_detected,
};

static struct pci_driver vfio_pci_driver = {
        .name           = "vfio-pci",
        .id_table       = NULL, /* only dynamic ids */
        .probe          = vfio_pci_probe,
        .remove         = vfio_pci_remove,
        .err_handler    = &vfio_err_handlers,
};

struct vfio_devices {
        struct vfio_device **devices;
        int cur_index;
        int max_index;
};

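/*
 * pci_walk_bus() callback: collect a vfio_device reference for each
 * affected device, failing if any is not bound to vfio-pci.
 */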
static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
{
        struct vfio_devices *devs = data;
        struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);

        if (pci_drv != &vfio_pci_driver)
                return -EBUSY;

        if (devs->cur_index == devs->max_index)
                return -ENOSPC;

        devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev);
        if (!devs->devices[devs->cur_index])
                return -EINVAL;

        devs->cur_index++;
        return 0;
}

/*
 * Attempt to do a bus/slot reset if there are devices affected by a reset for
 * this device that are needs_reset and all of the affected devices are unused
 * (!refcnt).  Callers are required to hold driver_lock when calling this to
 * prevent device opens and concurrent bus reset attempts.  We prevent device
 * unbinds by acquiring and holding a reference to the vfio_device.
 *
 * NB: vfio-core considers a group to be viable even if some devices are
 * bound to drivers like pci-stub or pcieport.  Here we require all devices
 * to be bound to vfio_pci since that's the only way we can be sure they
 * stay put.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
        struct vfio_devices devs = { .cur_index = 0 };
        int i = 0, ret = -EINVAL;
        bool needs_reset = false, slot = false;
        struct vfio_pci_device *tmp;

        if (!pci_probe_reset_slot(vdev->pdev->slot))
                slot = true;
        else if (pci_probe_reset_bus(vdev->pdev->bus))
                return;

        if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
                                          &i, slot) || !i)
                return;

        devs.max_index = i;
        devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
        if (!devs.devices)
                return;

        if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
                                          vfio_pci_get_devs, &devs, slot))
                goto put_devs;

        for (i = 0; i < devs.cur_index; i++) {
                tmp = vfio_device_data(devs.devices[i]);
                if (tmp->needs_reset)
                        needs_reset = true;
                if (tmp->refcnt)
                        goto put_devs;
        }

        if (needs_reset)
                ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
                             pci_try_reset_bus(vdev->pdev->bus);

put_devs:
        for (i = 0; i < devs.cur_index; i++) {
                if (!ret) {
                        tmp = vfio_device_data(devs.devices[i]);
                        tmp->needs_reset = false;
                }
                vfio_device_put(devs.devices[i]);
        }

        kfree(devs.devices);
}

static void __exit vfio_pci_cleanup(void)
{
        pci_unregister_driver(&vfio_pci_driver);
        vfio_pci_virqfd_exit();
        vfio_pci_uninit_perm_bits();
}

static int __init vfio_pci_init(void)
{
        int ret;

        /* Allocate shared config space permission data used by all devices */
        ret = vfio_pci_init_perm_bits();
        if (ret)
                return ret;

        /* Start the virqfd cleanup handler */
        ret = vfio_pci_virqfd_init();
        if (ret)
                goto out_virqfd;

        /* Register and scan for devices */
        ret = pci_register_driver(&vfio_pci_driver);
        if (ret)
                goto out_driver;

        return 0;

out_driver:
        vfio_pci_virqfd_exit();
out_virqfd:
        vfio_pci_uninit_perm_bits();
        return ret;
}

module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);