linux/drivers/vfio/vfio.c
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION  "0.3"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO - User Level meta-driver"

static struct vfio {
        struct class                    *class;
        struct list_head                iommu_drivers_list;
        struct mutex                    iommu_drivers_lock;
        struct list_head                group_list;
        struct idr                      group_idr;
        struct mutex                    group_lock;
        struct cdev                     group_cdev;
        struct device                   *dev;
        dev_t                           devt;
        struct cdev                     cdev;
        wait_queue_head_t               release_q;
} vfio;

struct vfio_iommu_driver {
        const struct vfio_iommu_driver_ops      *ops;
        struct list_head                        vfio_next;
};

struct vfio_container {
        struct kref                     kref;
        struct list_head                group_list;
        struct rw_semaphore             group_lock;
        struct vfio_iommu_driver        *iommu_driver;
        void                            *iommu_data;
};

struct vfio_group {
        struct kref                     kref;
        int                             minor;
        atomic_t                        container_users;
        struct iommu_group              *iommu_group;
        struct vfio_container           *container;
        struct list_head                device_list;
        struct mutex                    device_lock;
        struct device                   *dev;
        struct notifier_block           nb;
        struct list_head                vfio_next;
        struct list_head                container_next;
        atomic_t                        opened;
};

struct vfio_device {
        struct kref                     kref;
        struct device                   *dev;
        const struct vfio_device_ops    *ops;
        struct vfio_group               *group;
        struct list_head                group_next;
        void                            *device_data;
};

/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver, *tmp;

        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
        if (!driver)
                return -ENOMEM;

        driver->ops = ops;

        mutex_lock(&vfio.iommu_drivers_lock);

        /* Check for duplicates */
        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
                if (tmp->ops == ops) {
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return -EINVAL;
                }
        }

        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

        mutex_unlock(&vfio.iommu_drivers_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver;

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
                if (driver->ops == ops) {
                        list_del(&driver->vfio_next);
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return;
                }
        }
        mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
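
/*
 * Usage sketch (illustrative, not part of the original file): an IOMMU
 * backend registers a vfio_iommu_driver_ops at module init and removes
 * it on exit.  The ops layout follows <linux/vfio.h> of this era; the
 * "example_*" names are hypothetical placeholders.
 *
 *	static void *example_open(unsigned long arg)
 *	{
 *		struct example_iommu *iommu;
 *
 *		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 *		return iommu ? (void *)iommu : ERR_PTR(-ENOMEM);
 *	}
 *
 *	static void example_release(void *iommu_data)
 *	{
 *		kfree(iommu_data);
 *	}
 *
 *	static const struct vfio_iommu_driver_ops example_ops = {
 *		.name		= "example",
 *		.owner		= THIS_MODULE,
 *		.open		= example_open,
 *		.release	= example_release,
 *		.ioctl		= example_ioctl,
 *		.attach_group	= example_attach_group,
 *		.detach_group	= example_detach_group,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return vfio_register_iommu_driver(&example_ops);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&example_ops);
 *	}
 */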

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
        /* index 0 is used by /dev/vfio/vfio */
        return idr_alloc(&vfio.group_idr, group, 1, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
        idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
        kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
        struct vfio_container *container;
        container = container_of(kref, struct vfio_container, kref);

        kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
        kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
        mutex_unlock(&vfio.group_lock);
        /*
         * Unregister outside of lock.  A spurious callback is harmless now
         * that the group is no longer in vfio.group_list.
         */
        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
        kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
        struct vfio_group *group, *tmp;
        struct device *dev;
        int ret, minor;

        group = kzalloc(sizeof(*group), GFP_KERNEL);
        if (!group)
                return ERR_PTR(-ENOMEM);

        kref_init(&group->kref);
        INIT_LIST_HEAD(&group->device_list);
        mutex_init(&group->device_lock);
        atomic_set(&group->container_users, 0);
        atomic_set(&group->opened, 0);
        group->iommu_group = iommu_group;

        group->nb.notifier_call = vfio_iommu_group_notifier;

        /*
         * blocking notifiers acquire a rwsem around registering and hold
         * it around callback.  Therefore, need to register outside of
         * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
         * do anything unless it can find the group in vfio.group_list, so
         * no harm in registering early.
         */
        ret = iommu_group_register_notifier(iommu_group, &group->nb);
        if (ret) {
                kfree(group);
                return ERR_PTR(ret);
        }

        mutex_lock(&vfio.group_lock);

        minor = vfio_alloc_group_minor(group);
        if (minor < 0) {
                vfio_group_unlock_and_free(group);
                return ERR_PTR(minor);
        }

        /* Did we race creating this group? */
        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
                if (tmp->iommu_group == iommu_group) {
                        vfio_group_get(tmp);
                        vfio_free_group_minor(minor);
                        vfio_group_unlock_and_free(group);
                        return tmp;
                }
        }

        dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor),
                            group, "%d", iommu_group_id(iommu_group));
        if (IS_ERR(dev)) {
                vfio_free_group_minor(minor);
                vfio_group_unlock_and_free(group);
                return (struct vfio_group *)dev; /* ERR_PTR */
        }

        group->minor = minor;
        group->dev = dev;

        list_add(&group->vfio_next, &vfio.group_list);

        mutex_unlock(&vfio.group_lock);

        return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
        struct vfio_group *group = container_of(kref, struct vfio_group, kref);

        WARN_ON(!list_empty(&group->device_list));

        device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor));
        list_del(&group->vfio_next);
        vfio_free_group_minor(group->minor);
        vfio_group_unlock_and_free(group);
}

static void vfio_group_put(struct vfio_group *group)
{
        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
        kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
        struct vfio_group *target = group;

        mutex_lock(&vfio.group_lock);
        list_for_each_entry(group, &vfio.group_list, vfio_next) {
                if (group == target) {
                        vfio_group_get(group);
                        mutex_unlock(&vfio.group_lock);
                        return group;
                }
        }
        mutex_unlock(&vfio.group_lock);

        return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
        struct vfio_group *group;

        mutex_lock(&vfio.group_lock);
        list_for_each_entry(group, &vfio.group_list, vfio_next) {
                if (group->iommu_group == iommu_group) {
                        vfio_group_get(group);
                        mutex_unlock(&vfio.group_lock);
                        return group;
                }
        }
        mutex_unlock(&vfio.group_lock);

        return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
        struct vfio_group *group;

        mutex_lock(&vfio.group_lock);
        group = idr_find(&vfio.group_idr, minor);
        if (!group) {
                mutex_unlock(&vfio.group_lock);
                return NULL;
        }
        vfio_group_get(group);
        mutex_unlock(&vfio.group_lock);

        return group;
}

/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
                                             struct device *dev,
                                             const struct vfio_device_ops *ops,
                                             void *device_data)
{
        struct vfio_device *device;
        int ret;

        device = kzalloc(sizeof(*device), GFP_KERNEL);
        if (!device)
                return ERR_PTR(-ENOMEM);

        kref_init(&device->kref);
        device->dev = dev;
        device->group = group;
        device->ops = ops;
        device->device_data = device_data;

        ret = dev_set_drvdata(dev, device);
        if (ret) {
                kfree(device);
                return ERR_PTR(ret);
        }

        /* No need to get group_lock, caller has group reference */
        vfio_group_get(group);

        mutex_lock(&group->device_lock);
        list_add(&device->group_next, &group->device_list);
        mutex_unlock(&group->device_lock);

        return device;
}

static void vfio_device_release(struct kref *kref)
{
        struct vfio_device *device = container_of(kref,
                                                  struct vfio_device, kref);
        struct vfio_group *group = device->group;

        list_del(&device->group_next);
        mutex_unlock(&group->device_lock);

        dev_set_drvdata(device->dev, NULL);

        kfree(device);

        /* vfio_del_group_dev may be waiting for this device */
        wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
        struct vfio_group *group = device->group;
        kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
        vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
        vfio_group_get(device->group);
        kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
                                                 struct device *dev)
{
        struct vfio_device *device;

        mutex_lock(&group->device_lock);
        list_for_each_entry(device, &group->device_list, group_next) {
                if (device->dev == dev) {
                        vfio_device_get(device);
                        mutex_unlock(&group->device_lock);
                        return device;
                }
        }
        mutex_unlock(&group->device_lock);
        return NULL;
}

/*
 * Whitelist some drivers that we know are safe (no dma) or just sit on
 * a device.  It's not always practical to leave a device within a group
 * driverless as it could get re-bound to something unsafe.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub", "pcieport" };

static bool vfio_whitelisted_driver(struct device_driver *drv)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
                if (!strcmp(drv->name, vfio_driver_whitelist[i]))
                        return true;
        }

        return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are either
 * driver-less or bound to a vfio or whitelisted driver.  We test the
 * latter by the existence of a struct vfio_device matching the dev.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
        struct vfio_group *group = data;
        struct vfio_device *device;
        struct device_driver *drv = ACCESS_ONCE(dev->driver);

        if (!drv || vfio_whitelisted_driver(drv))
                return 0;

        device = vfio_group_get_device(group, dev);
        if (device) {
                vfio_device_put(device);
                return 0;
        }

        return -EINVAL;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
        struct vfio_device *device;

        /* Do we already know about it?  We shouldn't */
        device = vfio_group_get_device(group, dev);
        if (WARN_ON_ONCE(device)) {
                vfio_device_put(device);
                return 0;
        }

        /* Nothing to do for idle groups */
        if (!atomic_read(&group->container_users))
                return 0;

        /* TODO Prevent device auto probing */
        WARN("Device %s added to live group %d!\n", dev_name(dev),
             iommu_group_id(group->iommu_group));

        return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
        /* We don't care what happens when the group isn't in use */
        if (!atomic_read(&group->container_users))
                return 0;

        return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data)
{
        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
        struct device *dev = data;

        /*
         * Need to go through a group_lock lookup to get a reference or we
         * risk racing a group being removed.  Ignore spurious notifies.
         */
        group = vfio_group_try_get(group);
        if (!group)
                return NOTIFY_OK;

        switch (action) {
        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
                vfio_group_nb_add_dev(group, dev);
                break;
        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
                /*
                 * Nothing to do here.  If the device is in use, then the
                 * vfio sub-driver should block the remove callback until
                 * it is unused.  If the device is unused or attached to a
                 * stub driver, then it should be released and we don't
                 * care that it will be going away.
                 */
                break;
        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
                pr_debug("%s: Device %s, group %d binding to driver\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group));
                break;
        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
                pr_debug("%s: Device %s, group %d bound to driver %s\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group), dev->driver->name);
                BUG_ON(vfio_group_nb_verify(group, dev));
                break;
        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
                pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group), dev->driver->name);
                break;
        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
                pr_debug("%s: Device %s, group %d unbound from driver\n",
                         __func__, dev_name(dev),
                         iommu_group_id(group->iommu_group));
                /*
                 * XXX An unbound device in a live group is ok, but we'd
                 * really like to avoid the above BUG_ON by preventing other
                 * drivers from binding to it.  Once that occurs, we have to
                 * stop the system to maintain isolation.  At a minimum, we'd
                 * want a toggle to disable driver auto probe for this device.
                 */
                break;
        }

        vfio_group_put(group);
        return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
                       const struct vfio_device_ops *ops, void *device_data)
{
        struct iommu_group *iommu_group;
        struct vfio_group *group;
        struct vfio_device *device;

        iommu_group = iommu_group_get(dev);
        if (!iommu_group)
                return -EINVAL;

        group = vfio_group_get_from_iommu(iommu_group);
        if (!group) {
                group = vfio_create_group(iommu_group);
                if (IS_ERR(group)) {
                        iommu_group_put(iommu_group);
                        return PTR_ERR(group);
                }
        }

        device = vfio_group_get_device(group, dev);
        if (device) {
                WARN(1, "Device %s already exists on group %d\n",
                     dev_name(dev), iommu_group_id(iommu_group));
                vfio_device_put(device);
                vfio_group_put(group);
                iommu_group_put(iommu_group);
                return -EBUSY;
        }

        device = vfio_group_create_device(group, dev, ops, device_data);
        if (IS_ERR(device)) {
                vfio_group_put(group);
                iommu_group_put(iommu_group);
                return PTR_ERR(device);
        }

        /*
         * Added device holds reference to iommu_group and vfio_device
         * (which in turn holds reference to vfio_group).  Drop extra
         * group reference used while acquiring device.
         */
        vfio_group_put(group);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
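
/*
 * Usage sketch (illustrative): a vfio bus driver such as vfio-pci calls
 * vfio_add_group_dev() from its probe routine, handing over a private
 * object that later comes back as device_data in the vfio_device_ops
 * callbacks.  "example_vfio_pci_ops" and "struct example_vdev" are
 * hypothetical names.
 *
 *	static int example_probe(struct pci_dev *pdev,
 *				 const struct pci_device_id *id)
 *	{
 *		struct example_vdev *vdev;
 *		int ret;
 *
 *		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 *		if (!vdev)
 *			return -ENOMEM;
 *
 *		vdev->pdev = pdev;
 *
 *		ret = vfio_add_group_dev(&pdev->dev,
 *					 &example_vfio_pci_ops, vdev);
 *		if (ret)
 *			kfree(vdev);
 *		return ret;
 *	}
 */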

/**
 * Get a reference to the vfio_device for a device that is known to
 * be bound to a vfio driver.  The driver implicitly holds a
 * vfio_device reference between vfio_add_group_dev and
 * vfio_del_group_dev.  We can therefore use drvdata to increment
 * that reference from the struct device.  This additional
 * reference must be released by calling vfio_device_put.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
        struct vfio_device *device = dev_get_drvdata(dev);

        vfio_device_get(device);

        return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
        return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
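
/*
 * Usage sketch (illustrative): code holding a struct device known to be
 * bound to a vfio bus driver can take a temporary reference and reach
 * the driver-private data; "struct example_vdev" is hypothetical.
 *
 *	struct vfio_device *device = vfio_device_get_from_dev(dev);
 *	struct example_vdev *vdev = vfio_device_data(device);
 *
 *	...use vdev...
 *
 *	vfio_device_put(device);
 */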

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
        struct vfio_device *device;

        device = vfio_group_get_device(group, dev);
        if (!device)
                return false;

        vfio_device_put(device);
        return true;
}

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
        struct vfio_device *device = dev_get_drvdata(dev);
        struct vfio_group *group = device->group;
        struct iommu_group *iommu_group = group->iommu_group;
        void *device_data = device->device_data;

        /*
         * The group exists so long as we have a device reference.  Get
         * a group reference and use it to scan for the device going away.
         */
        vfio_group_get(group);

        vfio_device_put(device);

        /* TODO send a signal to encourage this to be released */
        wait_event(vfio.release_q, !vfio_dev_present(group, dev));

        vfio_group_put(group);

        iommu_group_put(iommu_group);

        return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
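
/*
 * Usage sketch (illustrative): the matching remove path for the probe
 * example above.  vfio_del_group_dev() blocks until every open file
 * descriptor for the device is gone, then returns the device_data that
 * was passed to vfio_add_group_dev() so the bus driver can free it.
 *
 *	static void example_remove(struct pci_dev *pdev)
 *	{
 *		struct example_vdev *vdev = vfio_del_group_dev(&pdev->dev);
 *
 *		kfree(vdev);
 *	}
 */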

/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
                                       unsigned long arg)
{
        struct vfio_iommu_driver *driver;
        long ret = 0;

        down_read(&container->group_lock);

        driver = container->iommu_driver;

        switch (arg) {
                /* No base extensions yet */
        default:
                /*
                 * If no driver is set, poll all registered drivers for
                 * extensions and return the first positive result.  If
                 * a driver is already set, further queries will be passed
                 * only to that driver.
                 */
                if (!driver) {
                        mutex_lock(&vfio.iommu_drivers_lock);
                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
                                            vfio_next) {
                                if (!try_module_get(driver->ops->owner))
                                        continue;

                                ret = driver->ops->ioctl(NULL,
                                                         VFIO_CHECK_EXTENSION,
                                                         arg);
                                module_put(driver->ops->owner);
                                if (ret > 0)
                                        break;
                        }
                        mutex_unlock(&vfio.iommu_drivers_lock);
                } else
                        ret = driver->ops->ioctl(container->iommu_data,
                                                 VFIO_CHECK_EXTENSION, arg);
        }

        up_read(&container->group_lock);

        return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
                                          struct vfio_iommu_driver *driver,
                                          void *data)
{
        struct vfio_group *group;
        int ret = -ENODEV;

        list_for_each_entry(group, &container->group_list, container_next) {
                ret = driver->ops->attach_group(data, group->iommu_group);
                if (ret)
                        goto unwind;
        }

        return ret;

unwind:
        list_for_each_entry_continue_reverse(group, &container->group_list,
                                             container_next) {
                driver->ops->detach_group(data, group->iommu_group);
        }

        return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
                                 unsigned long arg)
{
        struct vfio_iommu_driver *driver;
        long ret = -ENODEV;

        down_write(&container->group_lock);

        /*
         * The container is designed to be an unprivileged interface while
         * the group can be assigned to specific users.  Therefore, only by
         * adding a group to a container does the user get the privilege of
         * enabling the iommu, which may allocate finite resources.  There
         * is no unset_iommu, but by removing all the groups from a container,
         * the container is deprivileged and returns to an unset state.
         */
        if (list_empty(&container->group_list) || container->iommu_driver) {
                up_write(&container->group_lock);
                return -EINVAL;
        }

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
                void *data;

                if (!try_module_get(driver->ops->owner))
                        continue;

                /*
                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
                 * so test which iommu driver reported support for this
                 * extension and call open on them.  We also pass them the
                 * magic, allowing a single driver to support multiple
                 * interfaces if they'd like.
                 */
                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
                        module_put(driver->ops->owner);
                        continue;
                }

                /* module reference holds the driver we're working on */
                mutex_unlock(&vfio.iommu_drivers_lock);

                data = driver->ops->open(arg);
                if (IS_ERR(data)) {
                        ret = PTR_ERR(data);
                        module_put(driver->ops->owner);
                        goto skip_drivers_unlock;
                }

                ret = __vfio_container_attach_groups(container, driver, data);
                if (!ret) {
                        container->iommu_driver = driver;
                        container->iommu_data = data;
                } else {
                        driver->ops->release(data);
                        module_put(driver->ops->owner);
                }

                goto skip_drivers_unlock;
        }

        mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock:
        up_write(&container->group_lock);

        return ret;
}

static long vfio_fops_unl_ioctl(struct file *filep,
                                unsigned int cmd, unsigned long arg)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        void *data;
        long ret = -EINVAL;

        if (!container)
                return ret;

        switch (cmd) {
        case VFIO_GET_API_VERSION:
                ret = VFIO_API_VERSION;
                break;
        case VFIO_CHECK_EXTENSION:
                ret = vfio_ioctl_check_extension(container, arg);
                break;
        case VFIO_SET_IOMMU:
                ret = vfio_ioctl_set_iommu(container, arg);
                break;
        default:
                down_read(&container->group_lock);

                driver = container->iommu_driver;
                data = container->iommu_data;

                if (driver) /* passthrough all unrecognized ioctls */
                        ret = driver->ops->ioctl(data, cmd, arg);

                up_read(&container->group_lock);
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
                                   unsigned int cmd, unsigned long arg)
{
        arg = (unsigned long)compat_ptr(arg);
        return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif  /* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
        struct vfio_container *container;

        container = kzalloc(sizeof(*container), GFP_KERNEL);
        if (!container)
                return -ENOMEM;

        INIT_LIST_HEAD(&container->group_list);
        init_rwsem(&container->group_lock);
        kref_init(&container->kref);

        filep->private_data = container;

        return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_container *container = filep->private_data;

        filep->private_data = NULL;

        vfio_container_put(container);

        return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
                              size_t count, loff_t *ppos)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        ssize_t ret = -EINVAL;

        down_read(&container->group_lock);

        driver = container->iommu_driver;
        if (likely(driver && driver->ops->read))
                ret = driver->ops->read(container->iommu_data,
                                        buf, count, ppos);

        up_read(&container->group_lock);

        return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
                               size_t count, loff_t *ppos)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        ssize_t ret = -EINVAL;

        down_read(&container->group_lock);

        driver = container->iommu_driver;
        if (likely(driver && driver->ops->write))
                ret = driver->ops->write(container->iommu_data,
                                         buf, count, ppos);

        up_read(&container->group_lock);

        return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        int ret = -EINVAL;

        down_read(&container->group_lock);

        driver = container->iommu_driver;
        if (likely(driver && driver->ops->mmap))
                ret = driver->ops->mmap(container->iommu_data, vma);

        up_read(&container->group_lock);

        return ret;
}

static const struct file_operations vfio_fops = {
        .owner          = THIS_MODULE,
        .open           = vfio_fops_open,
        .release        = vfio_fops_release,
        .read           = vfio_fops_read,
        .write          = vfio_fops_write,
        .unlocked_ioctl = vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = vfio_fops_compat_ioctl,
#endif
        .mmap           = vfio_fops_mmap,
};
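
/*
 * Userspace sketch (illustrative): a container fd is obtained by
 * opening /dev/vfio/vfio and interrogated with the ioctls handled
 * above.  VFIO_TYPE1_IOMMU comes from the uapi <linux/vfio.h>.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;	// unknown API version
 *
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;	// type1 backend not present
 */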

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
        struct vfio_container *container = group->container;
        struct vfio_iommu_driver *driver;

        down_write(&container->group_lock);

        driver = container->iommu_driver;
        if (driver)
                driver->ops->detach_group(container->iommu_data,
                                          group->iommu_group);

        group->container = NULL;
        list_del(&group->container_next);

        /* Detaching the last group deprivileges a container, remove iommu */
        if (driver && list_empty(&container->group_list)) {
                driver->ops->release(container->iommu_data);
                module_put(driver->ops->owner);
                container->iommu_driver = NULL;
                container->iommu_data = NULL;
        }

        up_write(&container->group_lock);

        vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
        int users = atomic_cmpxchg(&group->container_users, 1, 0);

        if (!users)
                return -EINVAL;
        if (users != 1)
                return -EBUSY;

        __vfio_group_unset_container(group);

        return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
        if (0 == atomic_dec_if_positive(&group->container_users))
                __vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
        struct fd f;
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret = 0;

        if (atomic_read(&group->container_users))
                return -EINVAL;

        f = fdget(container_fd);
        if (!f.file)
                return -EBADF;

        /* Sanity check, is this really our fd? */
        if (f.file->f_op != &vfio_fops) {
                fdput(f);
                return -EINVAL;
        }

        container = f.file->private_data;
        WARN_ON(!container); /* fget ensures we don't race vfio_release */

        down_write(&container->group_lock);

        driver = container->iommu_driver;
        if (driver) {
                ret = driver->ops->attach_group(container->iommu_data,
                                                group->iommu_group);
                if (ret)
                        goto unlock_out;
        }

        group->container = container;
        list_add(&group->container_next, &container->group_list);

        /* Get a reference on the container and mark a user within the group */
        vfio_container_get(container);
        atomic_inc(&group->container_users);

unlock_out:
        up_write(&container->group_lock);
        fdput(f);
        return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
        return (iommu_group_for_each_dev(group->iommu_group,
                                         group, vfio_dev_viable) == 0);
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
        struct vfio_device *device;
        struct file *filep;
        int ret = -ENODEV;

        if (0 == atomic_read(&group->container_users) ||
            !group->container->iommu_driver || !vfio_group_viable(group))
                return -EINVAL;

        mutex_lock(&group->device_lock);
        list_for_each_entry(device, &group->device_list, group_next) {
                if (strcmp(dev_name(device->dev), buf))
                        continue;

                ret = device->ops->open(device->device_data);
                if (ret)
                        break;
                /*
                 * We can't use anon_inode_getfd() because we need to modify
                 * the f_mode flags directly to allow more than just ioctls
                 */
                ret = get_unused_fd();
                if (ret < 0) {
                        device->ops->release(device->device_data);
                        break;
                }

                filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
                                           device, O_RDWR);
                if (IS_ERR(filep)) {
                        put_unused_fd(ret);
                        ret = PTR_ERR(filep);
                        device->ops->release(device->device_data);
                        break;
                }

                /*
                 * TODO: add an anon_inode interface to do this.
                 * Appears to be missing by lack of need rather than
                 * explicitly prevented.  Now there's need.
                 */
                filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

                vfio_device_get(device);
                atomic_inc(&group->container_users);

                fd_install(ret, filep);
                break;
        }
        mutex_unlock(&group->device_lock);

        return ret;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
                                      unsigned int cmd, unsigned long arg)
{
        struct vfio_group *group = filep->private_data;
        long ret = -ENOTTY;

        switch (cmd) {
        case VFIO_GROUP_GET_STATUS:
        {
                struct vfio_group_status status;
                unsigned long minsz;

                minsz = offsetofend(struct vfio_group_status, flags);

                if (copy_from_user(&status, (void __user *)arg, minsz))
                        return -EFAULT;

                if (status.argsz < minsz)
                        return -EINVAL;

                status.flags = 0;

                if (vfio_group_viable(group))
                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;

                if (group->container)
                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

                if (copy_to_user((void __user *)arg, &status, minsz))
                        return -EFAULT;

                ret = 0;
                break;
        }
        case VFIO_GROUP_SET_CONTAINER:
        {
                int fd;

                if (get_user(fd, (int __user *)arg))
                        return -EFAULT;

                if (fd < 0)
                        return -EINVAL;

                ret = vfio_group_set_container(group, fd);
                break;
        }
        case VFIO_GROUP_UNSET_CONTAINER:
                ret = vfio_group_unset_container(group);
                break;
        case VFIO_GROUP_GET_DEVICE_FD:
        {
                char *buf;

                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
                if (IS_ERR(buf))
                        return PTR_ERR(buf);

                ret = vfio_group_get_device_fd(group, buf);
                kfree(buf);
                break;
        }
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
                                         unsigned int cmd, unsigned long arg)
{
        arg = (unsigned long)compat_ptr(arg);
        return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif  /* CONFIG_COMPAT */

static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
        struct vfio_group *group;
        int opened;

        group = vfio_group_get_from_minor(iminor(inode));
        if (!group)
                return -ENODEV;

        /* Do we need multiple instances of the group open?  Seems not. */
        opened = atomic_cmpxchg(&group->opened, 0, 1);
        if (opened) {
                vfio_group_put(group);
                return -EBUSY;
        }

        /* Is something still in use from a previous open? */
        if (group->container) {
                atomic_dec(&group->opened);
                vfio_group_put(group);
                return -EBUSY;
        }

        filep->private_data = group;

        return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_group *group = filep->private_data;

        filep->private_data = NULL;

        vfio_group_try_dissolve_container(group);

        atomic_dec(&group->opened);

        vfio_group_put(group);

        return 0;
}

static const struct file_operations vfio_group_fops = {
        .owner          = THIS_MODULE,
        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = vfio_group_fops_compat_ioctl,
#endif
        .open           = vfio_group_fops_open,
        .release        = vfio_group_fops_release,
};
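
/*
 * Userspace sketch (illustrative): a group fd is bound to a container
 * and then used to obtain a device fd, mirroring the flow in
 * Documentation/vfio.txt.  The group number and device name below are
 * examples only.
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;	// a group device lacks a safe driver
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD,
 *			   "0000:06:0d.0");
 */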

/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_device *device = filep->private_data;

        device->ops->release(device->device_data);

        vfio_group_try_dissolve_container(device->group);

        vfio_device_put(device);

        return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
                                       unsigned int cmd, unsigned long arg)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->ioctl))
                return -EINVAL;

        return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
                                     size_t count, loff_t *ppos)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->read))
                return -EINVAL;

        return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
                                      const char __user *buf,
                                      size_t count, loff_t *ppos)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->write))
                return -EINVAL;

        return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct vfio_device *device = filep->private_data;

        if (unlikely(!device->ops->mmap))
                return -EINVAL;

        return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
                                          unsigned int cmd, unsigned long arg)
{
        arg = (unsigned long)compat_ptr(arg);
        return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif  /* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
        .owner          = THIS_MODULE,
        .release        = vfio_device_fops_release,
        .read           = vfio_device_fops_read,
        .write          = vfio_device_fops_write,
        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = vfio_device_fops_compat_ioctl,
#endif
        .mmap           = vfio_device_fops_mmap,
};
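
/*
 * Userspace sketch (illustrative): everything on the device fd is
 * delegated to the vfio bus driver; with vfio-pci the first step is
 * typically VFIO_DEVICE_GET_INFO from the uapi <linux/vfio.h>.
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(device, VFIO_DEVICE_GET_INFO, &info);
 *	// info.num_regions and info.num_irqs describe the device
 */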

/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
        if (mode && (MINOR(dev->devt) == 0))
                *mode = S_IRUGO | S_IWUGO;

        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static int __init vfio_init(void)
{
        int ret;

        idr_init(&vfio.group_idr);
        mutex_init(&vfio.group_lock);
        mutex_init(&vfio.iommu_drivers_lock);
        INIT_LIST_HEAD(&vfio.group_list);
        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
        init_waitqueue_head(&vfio.release_q);

        vfio.class = class_create(THIS_MODULE, "vfio");
        if (IS_ERR(vfio.class)) {
                ret = PTR_ERR(vfio.class);
                goto err_class;
        }

        vfio.class->devnode = vfio_devnode;

        ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
        if (ret)
                goto err_base_chrdev;

        cdev_init(&vfio.cdev, &vfio_fops);
        ret = cdev_add(&vfio.cdev, vfio.devt, 1);
        if (ret)
                goto err_base_cdev;

        vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio");
        if (IS_ERR(vfio.dev)) {
                ret = PTR_ERR(vfio.dev);
                goto err_base_dev;
        }

        /* /dev/vfio/$GROUP */
        cdev_init(&vfio.group_cdev, &vfio_group_fops);
        ret = cdev_add(&vfio.group_cdev,
                       MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1);
        if (ret)
                goto err_groups_cdev;

        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

        /*
         * Attempt to load known iommu-drivers.  This gives us a working
         * environment without the user needing to explicitly load iommu
         * drivers.
         */
        request_module_nowait("vfio_iommu_type1");
        request_module_nowait("vfio_iommu_spapr_tce");

        return 0;

err_groups_cdev:
        device_destroy(vfio.class, vfio.devt);
err_base_dev:
        cdev_del(&vfio.cdev);
err_base_cdev:
        unregister_chrdev_region(vfio.devt, MINORMASK);
err_base_chrdev:
        class_destroy(vfio.class);
        vfio.class = NULL;
err_class:
        return ret;
}

static void __exit vfio_cleanup(void)
{
        WARN_ON(!list_empty(&vfio.group_list));

        idr_destroy(&vfio.group_idr);
        cdev_del(&vfio.group_cdev);
        device_destroy(vfio.class, vfio.devt);
        cdev_del(&vfio.cdev);
        unregister_chrdev_region(vfio.devt, MINORMASK);
        class_destroy(vfio.class);
        vfio.class = NULL;
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);