linux/drivers/vfio/vfio.c
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	struct device			*dev;
	dev_t				devt;
	struct cdev			cdev;
	wait_queue_head_t		release_q;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};

/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
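
/*
 * Illustrative sketch (not part of this file): an IOMMU backend such as
 * vfio_iommu_type1 registers itself through the pair of exports above.
 * The callback names below (my_iommu_*) are assumptions for illustration;
 * the ops fields match the vfio_iommu_driver_ops usage in this file.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my_iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,	// returns iommu_data
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,	// VFIO_CHECK_EXTENSION etc.
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */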

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	/* index 0 is used by /dev/vfio/vfio */
	return idr_alloc(&vfio.group_idr, group, 1, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	atomic_set(&group->container_users, 0);
	group->iommu_group = iommu_group;

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_free_group_minor(minor);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor),
			    group, "%d", iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);

	WARN_ON(!list_empty(&group->device_list));

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;
	int ret;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;

	ret = dev_set_drvdata(dev, device);
	if (ret) {
		kfree(device);
		return ERR_PTR(ret);
	}

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}

/*
 * Whitelist some drivers that we know are safe (no dma) or just sit on
 * a device.  It's not always practical to leave a device within a group
 * driverless as it could get re-bound to something unsafe.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub", "pcieport" };

static bool vfio_whitelisted_driver(struct device_driver *drv)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are either
 * driver-less or bound to a vfio or whitelisted driver.  We test the
 * latter by the existence of a struct vfio_device matching the dev.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);

	if (!drv || vfio_whitelisted_driver(drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return -EINVAL;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_del_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/*
	 * Expect to fall out here.  If a device was in use, it would
	 * have been bound to a vfio sub-driver, which would have blocked
	 * in .remove at vfio_del_group_dev.  Sanity check that we no
	 * longer track the device, so it's safe to remove.
	 */
	device = vfio_group_get_device(group, dev);
	if (likely(!device))
		return 0;

	WARN(1, "Device %s removed from live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	vfio_device_put(device);
	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;

	/*
	 * Need to go through a group_lock lookup to get a reference or
	 * we risk racing a group being removed.  Leave a WARN_ON for
	 * debugging, but if the group no longer exists, a spurious notify
	 * is harmless.
	 */
	group = vfio_group_try_get(group);
	if (WARN_ON(!group))
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		vfio_group_nb_del_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */
		break;
	}

	vfio_group_put(group);
	return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		iommu_group_put(iommu_group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		iommu_group_put(iommu_group);
		return PTR_ERR(device);
	}

	/*
	 * Added device holds reference to iommu_group and vfio_device
	 * (which in turn holds reference to vfio_group).  Drop extra
	 * group reference used while acquiring device.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);

/**
 * Get a reference to the vfio_device for a device that is known to
 * be bound to a vfio driver.  The driver implicitly holds a
 * vfio_device reference between vfio_add_group_dev and
 * vfio_del_group_dev.  We can therefore use drvdata to increment
 * that reference from the struct device.  This additional
 * reference must be released by calling vfio_device_put.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);

	vfio_device_get(device);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	struct iommu_group *iommu_group = group->iommu_group;
	void *device_data = device->device_data;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	vfio_device_put(device);

	/* TODO send a signal to encourage this to be released */
	wait_event(vfio.release_q, !vfio_dev_present(group, dev));

	vfio_group_put(group);

	iommu_group_put(iommu_group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
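
/*
 * Illustrative sketch (not part of this file): a vfio bus driver such as
 * vfio-pci pairs vfio_add_group_dev() in its probe path with
 * vfio_del_group_dev() in remove.  The names below (my_probe, my_remove,
 * my_device_ops, struct my_device) are assumptions for illustration.
 *
 *	static const struct vfio_device_ops my_device_ops = {
 *		.name		= "my-vfio-driver",
 *		.open		= my_open,
 *		.release	= my_release,
 *		.ioctl		= my_ioctl,
 *		.read		= my_read,
 *		.write		= my_write,
 *		.mmap		= my_mmap,
 *	};
 *
 *	static int my_probe(struct my_bus_dev *bdev)
 *	{
 *		struct my_device *vdev = my_device_alloc(bdev);
 *
 *		// device_data passed here is returned by vfio_del_group_dev()
 *		return vfio_add_group_dev(&bdev->dev, &my_device_ops, vdev);
 *	}
 *
 *	static void my_remove(struct my_bus_dev *bdev)
 *	{
 *		// blocks until all open device file descriptors are released
 *		struct my_device *vdev = vfio_del_group_dev(&bdev->dev);
 *
 *		my_device_free(vdev);
 *	}
 */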

/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		/* module reference holds the driver we're working on */
		mutex_unlock(&vfio.iommu_drivers_lock);

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			goto skip_drivers_unlock;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (!ret) {
			container->iommu_driver = driver;
			container->iommu_data = data;
		} else {
			driver->ops->release(data);
			module_put(driver->ops->owner);
		}

		goto skip_drivers_unlock;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock:
	up_write(&container->group_lock);

	return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
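
/*
 * Illustrative userspace sketch (not part of this file): the expected
 * sequence against /dev/vfio/vfio, assuming the type1 backend is loaded.
 * Error handling is omitted for brevity.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		// unknown API version
 *
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		// type1 iommu backend not available
 *
 *	// A group must be added to the container (via the group fd,
 *	// below) before SET_IOMMU is allowed to succeed.
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */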

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret = -ENODEV;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (strcmp(dev_name(device->dev), buf))
			continue;

		ret = device->ops->open(device->device_data);
		if (ret)
			break;
		/*
		 * We can't use anon_inode_getfd() because we need to modify
		 * the f_mode flags directly to allow more than just ioctls
		 */
		ret = get_unused_fd();
		if (ret < 0) {
			device->ops->release(device->device_data);
			break;
		}

		filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
					   device, O_RDWR);
		if (IS_ERR(filep)) {
			put_unused_fd(ret);
			ret = PTR_ERR(filep);
			device->ops->release(device->device_data);
			break;
		}

		/*
		 * TODO: add an anon_inode interface to do this.
		 * Appears to be missing by lack of need rather than
		 * explicitly prevented.  Now there's need.
		 */
		filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

		vfio_device_get(device);
		atomic_inc(&group->container_users);

		fd_install(ret, filep);
		break;
	}
	mutex_unlock(&group->device_lock);

	return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->container) {
		vfio_group_put(group);
		return -EBUSY;
	}

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
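
/*
 * Illustrative userspace sketch (not part of this file): driving the
 * group fd, continuing the container example above.  The group number
 * and device name are placeholders.
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		// some device in the group is not bound to a vfio or
 *		// whitelisted driver
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */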

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
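
/*
 * Illustrative userspace sketch (not part of this file): everything on
 * the device fd funnels through the vfio_device_ops above, so the bus
 * driver defines the actual semantics.  For vfio-pci the usual first
 * step is device introspection:
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(device, VFIO_DEVICE_GET_INFO, &info);
 *	// info.num_regions and info.num_irqs can then be walked with
 *	// VFIO_DEVICE_GET_REGION_INFO and VFIO_DEVICE_GET_IRQ_INFO.
 */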

/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	if (mode && (MINOR(dev->devt) == 0))
		*mode = S_IRUGO | S_IWUGO;

	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_base_chrdev;

	cdev_init(&vfio.cdev, &vfio_fops);
	ret = cdev_add(&vfio.cdev, vfio.devt, 1);
	if (ret)
		goto err_base_cdev;

	vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio");
	if (IS_ERR(vfio.dev)) {
		ret = PTR_ERR(vfio.dev);
		goto err_base_dev;
	}

	/* /dev/vfio/$GROUP */
	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev,
		       MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1);
	if (ret)
		goto err_groups_cdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	/*
	 * Attempt to load known iommu-drivers.  This gives us a working
	 * environment without the user needing to explicitly load iommu
	 * drivers.
	 */
	request_module_nowait("vfio_iommu_type1");

	return 0;

err_groups_cdev:
	device_destroy(vfio.class, vfio.devt);
err_base_dev:
	cdev_del(&vfio.cdev);
err_base_cdev:
	unregister_chrdev_region(vfio.devt, MINORMASK);
err_base_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	device_destroy(vfio.class, vfio.devt);
	cdev_del(&vfio.cdev);
	unregister_chrdev_region(vfio.devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);