linux/drivers/vfio/vfio.c
<<
>>
Prefs
   1/*
   2 * VFIO core
   3 *
   4 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5 *     Author: Alex Williamson <alex.williamson@redhat.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio:
  12 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13 * Author: Tom Lyon, pugs@cisco.com
  14 */
  15
  16#include <linux/cdev.h>
  17#include <linux/compat.h>
  18#include <linux/device.h>
  19#include <linux/file.h>
  20#include <linux/anon_inodes.h>
  21#include <linux/fs.h>
  22#include <linux/idr.h>
  23#include <linux/iommu.h>
  24#include <linux/list.h>
  25#include <linux/miscdevice.h>
  26#include <linux/module.h>
  27#include <linux/mutex.h>
  28#include <linux/pci.h>
  29#include <linux/rwsem.h>
  30#include <linux/sched.h>
  31#include <linux/slab.h>
  32#include <linux/stat.h>
  33#include <linux/string.h>
  34#include <linux/uaccess.h>
  35#include <linux/vfio.h>
  36#include <linux/wait.h>
  37
  38#define DRIVER_VERSION  "0.3"
  39#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  40#define DRIVER_DESC     "VFIO - User Level meta-driver"
  41
  42static struct vfio {
  43        struct class                    *class;
  44        struct list_head                iommu_drivers_list;
  45        struct mutex                    iommu_drivers_lock;
  46        struct list_head                group_list;
  47        struct idr                      group_idr;
  48        struct mutex                    group_lock;
  49        struct cdev                     group_cdev;
  50        dev_t                           group_devt;
  51        wait_queue_head_t               release_q;
  52} vfio;
  53
  54struct vfio_iommu_driver {
  55        const struct vfio_iommu_driver_ops      *ops;
  56        struct list_head                        vfio_next;
  57};
  58
  59struct vfio_container {
  60        struct kref                     kref;
  61        struct list_head                group_list;
  62        struct rw_semaphore             group_lock;
  63        struct vfio_iommu_driver        *iommu_driver;
  64        void                            *iommu_data;
  65        bool                            noiommu;
  66};
  67
  68struct vfio_unbound_dev {
  69        struct device                   *dev;
  70        struct list_head                unbound_next;
  71};
  72
  73struct vfio_group {
  74        struct kref                     kref;
  75        int                             minor;
  76        atomic_t                        container_users;
  77        struct iommu_group              *iommu_group;
  78        struct vfio_container           *container;
  79        struct list_head                device_list;
  80        struct mutex                    device_lock;
  81        struct device                   *dev;
  82        struct notifier_block           nb;
  83        struct list_head                vfio_next;
  84        struct list_head                container_next;
  85        struct list_head                unbound_list;
  86        struct mutex                    unbound_lock;
  87        atomic_t                        opened;
  88        wait_queue_head_t               container_q;
  89        bool                            noiommu;
  90        struct kvm                      *kvm;
  91        struct blocking_notifier_head   notifier;
  92};
  93
  94struct vfio_device {
  95        struct kref                     kref;
  96        struct device                   *dev;
  97        const struct vfio_device_ops    *ops;
  98        struct vfio_group               *group;
  99        struct list_head                group_next;
 100        void                            *device_data;
 101};
 102
 103#ifdef CONFIG_VFIO_NOIOMMU
 104static bool noiommu __read_mostly;
 105module_param_named(enable_unsafe_noiommu_mode,
 106                   noiommu, bool, S_IRUGO | S_IWUSR);
 107MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 108#endif
 109
 110/*
 111 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 112 * and remove functions, any use cases other than acquiring the first
 113 * reference for the purpose of calling vfio_add_group_dev() or removing
 114 * that symmetric reference after vfio_del_group_dev() should use the raw
 115 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 116 * removes the device from the dummy group and cannot be nested.
 117 */
 118struct iommu_group *vfio_iommu_group_get(struct device *dev)
 119{
 120        struct iommu_group *group;
 121        int __maybe_unused ret;
 122
 123        group = iommu_group_get(dev);
 124
 125#ifdef CONFIG_VFIO_NOIOMMU
 126        /*
 127         * With noiommu enabled, an IOMMU group will be created for a device
 128         * that doesn't already have one and doesn't have an iommu_ops on their
 129         * bus.  We set iommudata simply to be able to identify these groups
 130         * as special use and for reclamation later.
 131         */
 132        if (group || !noiommu || iommu_present(dev->bus))
 133                return group;
 134
 135        group = iommu_group_alloc();
 136        if (IS_ERR(group))
 137                return NULL;
 138
 139        iommu_group_set_name(group, "vfio-noiommu");
 140        iommu_group_set_iommudata(group, &noiommu, NULL);
 141        ret = iommu_group_add_device(group, dev);
 142        if (ret) {
 143                iommu_group_put(group);
 144                return NULL;
 145        }
 146
 147        /*
 148         * Where to taint?  At this point we've added an IOMMU group for a
 149         * device that is not backed by iommu_ops, therefore any iommu_
 150         * callback using iommu_ops can legitimately Oops.  So, while we may
 151         * be about to give a DMA capable device to a user without IOMMU
 152         * protection, which is clearly taint-worthy, let's go ahead and do
 153         * it here.
 154         */
 155        add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 156        dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 157#endif
 158
 159        return group;
 160}
 161EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 162
 163void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 164{
 165#ifdef CONFIG_VFIO_NOIOMMU
 166        if (iommu_group_get_iommudata(group) == &noiommu)
 167                iommu_group_remove_device(dev);
 168#endif
 169
 170        iommu_group_put(group);
 171}
 172EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 173
 174#ifdef CONFIG_VFIO_NOIOMMU
 175static void *vfio_noiommu_open(unsigned long arg)
 176{
 177        if (arg != VFIO_NOIOMMU_IOMMU)
 178                return ERR_PTR(-EINVAL);
 179        if (!capable(CAP_SYS_RAWIO))
 180                return ERR_PTR(-EPERM);
 181
 182        return NULL;
 183}
 184
 185static void vfio_noiommu_release(void *iommu_data)
 186{
 187}
 188
 189static long vfio_noiommu_ioctl(void *iommu_data,
 190                               unsigned int cmd, unsigned long arg)
 191{
 192        if (cmd == VFIO_CHECK_EXTENSION)
 193                return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 194
 195        return -ENOTTY;
 196}
 197
 198static int vfio_noiommu_attach_group(void *iommu_data,
 199                                     struct iommu_group *iommu_group)
 200{
 201        return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 202}
 203
 204static void vfio_noiommu_detach_group(void *iommu_data,
 205                                      struct iommu_group *iommu_group)
 206{
 207}
 208
 209static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 210        .name = "vfio-noiommu",
 211        .owner = THIS_MODULE,
 212        .open = vfio_noiommu_open,
 213        .release = vfio_noiommu_release,
 214        .ioctl = vfio_noiommu_ioctl,
 215        .attach_group = vfio_noiommu_attach_group,
 216        .detach_group = vfio_noiommu_detach_group,
 217};
 218#endif
 219
 220
 221/**
 222 * IOMMU driver registration
 223 */
 224int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 225{
 226        struct vfio_iommu_driver *driver, *tmp;
 227
 228        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 229        if (!driver)
 230                return -ENOMEM;
 231
 232        driver->ops = ops;
 233
 234        mutex_lock(&vfio.iommu_drivers_lock);
 235
 236        /* Check for duplicates */
 237        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 238                if (tmp->ops == ops) {
 239                        mutex_unlock(&vfio.iommu_drivers_lock);
 240                        kfree(driver);
 241                        return -EINVAL;
 242                }
 243        }
 244
 245        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 246
 247        mutex_unlock(&vfio.iommu_drivers_lock);
 248
 249        return 0;
 250}
 251EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 252
 253void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 254{
 255        struct vfio_iommu_driver *driver;
 256
 257        mutex_lock(&vfio.iommu_drivers_lock);
 258        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 259                if (driver->ops == ops) {
 260                        list_del(&driver->vfio_next);
 261                        mutex_unlock(&vfio.iommu_drivers_lock);
 262                        kfree(driver);
 263                        return;
 264                }
 265        }
 266        mutex_unlock(&vfio.iommu_drivers_lock);
 267}
 268EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 269
 270/**
 271 * Group minor allocation/free - both called with vfio.group_lock held
 272 */
 273static int vfio_alloc_group_minor(struct vfio_group *group)
 274{
 275        return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 276}
 277
 278static void vfio_free_group_minor(int minor)
 279{
 280        idr_remove(&vfio.group_idr, minor);
 281}
 282
 283static int vfio_iommu_group_notifier(struct notifier_block *nb,
 284                                     unsigned long action, void *data);
 285static void vfio_group_get(struct vfio_group *group);
 286
 287/**
 288 * Container objects - containers are created when /dev/vfio/vfio is
 289 * opened, but their lifecycle extends until the last user is done, so
 290 * it's freed via kref.  Must support container/group/device being
 291 * closed in any order.
 292 */
 293static void vfio_container_get(struct vfio_container *container)
 294{
 295        kref_get(&container->kref);
 296}
 297
 298static void vfio_container_release(struct kref *kref)
 299{
 300        struct vfio_container *container;
 301        container = container_of(kref, struct vfio_container, kref);
 302
 303        kfree(container);
 304}
 305
 306static void vfio_container_put(struct vfio_container *container)
 307{
 308        kref_put(&container->kref, vfio_container_release);
 309}
 310
 311static void vfio_group_unlock_and_free(struct vfio_group *group)
 312{
 313        mutex_unlock(&vfio.group_lock);
 314        /*
 315         * Unregister outside of lock.  A spurious callback is harmless now
 316         * that the group is no longer in vfio.group_list.
 317         */
 318        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 319        kfree(group);
 320}
 321
 322/**
 323 * Group objects - create, release, get, put, search
 324 */
 325static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 326{
 327        struct vfio_group *group, *tmp;
 328        struct device *dev;
 329        int ret, minor;
 330
 331        group = kzalloc(sizeof(*group), GFP_KERNEL);
 332        if (!group)
 333                return ERR_PTR(-ENOMEM);
 334
 335        kref_init(&group->kref);
 336        INIT_LIST_HEAD(&group->device_list);
 337        mutex_init(&group->device_lock);
 338        INIT_LIST_HEAD(&group->unbound_list);
 339        mutex_init(&group->unbound_lock);
 340        atomic_set(&group->container_users, 0);
 341        atomic_set(&group->opened, 0);
 342        init_waitqueue_head(&group->container_q);
 343        group->iommu_group = iommu_group;
 344#ifdef CONFIG_VFIO_NOIOMMU
 345        group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 346#endif
 347        BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 348
 349        group->nb.notifier_call = vfio_iommu_group_notifier;
 350
 351        /*
 352         * blocking notifiers acquire a rwsem around registering and hold
 353         * it around callback.  Therefore, need to register outside of
 354         * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 355         * do anything unless it can find the group in vfio.group_list, so
 356         * no harm in registering early.
 357         */
 358        ret = iommu_group_register_notifier(iommu_group, &group->nb);
 359        if (ret) {
 360                kfree(group);
 361                return ERR_PTR(ret);
 362        }
 363
 364        mutex_lock(&vfio.group_lock);
 365
 366        /* Did we race creating this group? */
 367        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 368                if (tmp->iommu_group == iommu_group) {
 369                        vfio_group_get(tmp);
 370                        vfio_group_unlock_and_free(group);
 371                        return tmp;
 372                }
 373        }
 374
 375        minor = vfio_alloc_group_minor(group);
 376        if (minor < 0) {
 377                vfio_group_unlock_and_free(group);
 378                return ERR_PTR(minor);
 379        }
 380
 381        dev = device_create(vfio.class, NULL,
 382                            MKDEV(MAJOR(vfio.group_devt), minor),
 383                            group, "%s%d", group->noiommu ? "noiommu-" : "",
 384                            iommu_group_id(iommu_group));
 385        if (IS_ERR(dev)) {
 386                vfio_free_group_minor(minor);
 387                vfio_group_unlock_and_free(group);
 388                return ERR_CAST(dev);
 389        }
 390
 391        group->minor = minor;
 392        group->dev = dev;
 393
 394        list_add(&group->vfio_next, &vfio.group_list);
 395
 396        mutex_unlock(&vfio.group_lock);
 397
 398        return group;
 399}
 400
 401/* called with vfio.group_lock held */
 402static void vfio_group_release(struct kref *kref)
 403{
 404        struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 405        struct vfio_unbound_dev *unbound, *tmp;
 406        struct iommu_group *iommu_group = group->iommu_group;
 407
 408        WARN_ON(!list_empty(&group->device_list));
 409        WARN_ON(group->notifier.head);
 410
 411        list_for_each_entry_safe(unbound, tmp,
 412                                 &group->unbound_list, unbound_next) {
 413                list_del(&unbound->unbound_next);
 414                kfree(unbound);
 415        }
 416
 417        device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 418        list_del(&group->vfio_next);
 419        vfio_free_group_minor(group->minor);
 420        vfio_group_unlock_and_free(group);
 421        iommu_group_put(iommu_group);
 422}
 423
 424static void vfio_group_put(struct vfio_group *group)
 425{
 426        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 427}
 428
 429struct vfio_group_put_work {
 430        struct work_struct work;
 431        struct vfio_group *group;
 432};
 433
 434static void vfio_group_put_bg(struct work_struct *work)
 435{
 436        struct vfio_group_put_work *do_work;
 437
 438        do_work = container_of(work, struct vfio_group_put_work, work);
 439
 440        vfio_group_put(do_work->group);
 441        kfree(do_work);
 442}
 443
 444static void vfio_group_schedule_put(struct vfio_group *group)
 445{
 446        struct vfio_group_put_work *do_work;
 447
 448        do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 449        if (WARN_ON(!do_work))
 450                return;
 451
 452        INIT_WORK(&do_work->work, vfio_group_put_bg);
 453        do_work->group = group;
 454        schedule_work(&do_work->work);
 455}
 456
 457/* Assume group_lock or group reference is held */
 458static void vfio_group_get(struct vfio_group *group)
 459{
 460        kref_get(&group->kref);
 461}
 462
 463/*
 464 * Not really a try as we will sleep for mutex, but we need to make
 465 * sure the group pointer is valid under lock and get a reference.
 466 */
 467static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 468{
 469        struct vfio_group *target = group;
 470
 471        mutex_lock(&vfio.group_lock);
 472        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 473                if (group == target) {
 474                        vfio_group_get(group);
 475                        mutex_unlock(&vfio.group_lock);
 476                        return group;
 477                }
 478        }
 479        mutex_unlock(&vfio.group_lock);
 480
 481        return NULL;
 482}
 483
 484static
 485struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 486{
 487        struct vfio_group *group;
 488
 489        mutex_lock(&vfio.group_lock);
 490        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 491                if (group->iommu_group == iommu_group) {
 492                        vfio_group_get(group);
 493                        mutex_unlock(&vfio.group_lock);
 494                        return group;
 495                }
 496        }
 497        mutex_unlock(&vfio.group_lock);
 498
 499        return NULL;
 500}
 501
 502static struct vfio_group *vfio_group_get_from_minor(int minor)
 503{
 504        struct vfio_group *group;
 505
 506        mutex_lock(&vfio.group_lock);
 507        group = idr_find(&vfio.group_idr, minor);
 508        if (!group) {
 509                mutex_unlock(&vfio.group_lock);
 510                return NULL;
 511        }
 512        vfio_group_get(group);
 513        mutex_unlock(&vfio.group_lock);
 514
 515        return group;
 516}
 517
 518static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 519{
 520        struct iommu_group *iommu_group;
 521        struct vfio_group *group;
 522
 523        iommu_group = iommu_group_get(dev);
 524        if (!iommu_group)
 525                return NULL;
 526
 527        group = vfio_group_get_from_iommu(iommu_group);
 528        iommu_group_put(iommu_group);
 529
 530        return group;
 531}
 532
 533/**
 534 * Device objects - create, release, get, put, search
 535 */
 536static
 537struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 538                                             struct device *dev,
 539                                             const struct vfio_device_ops *ops,
 540                                             void *device_data)
 541{
 542        struct vfio_device *device;
 543
 544        device = kzalloc(sizeof(*device), GFP_KERNEL);
 545        if (!device)
 546                return ERR_PTR(-ENOMEM);
 547
 548        kref_init(&device->kref);
 549        device->dev = dev;
 550        device->group = group;
 551        device->ops = ops;
 552        device->device_data = device_data;
 553        dev_set_drvdata(dev, device);
 554
 555        /* No need to get group_lock, caller has group reference */
 556        vfio_group_get(group);
 557
 558        mutex_lock(&group->device_lock);
 559        list_add(&device->group_next, &group->device_list);
 560        mutex_unlock(&group->device_lock);
 561
 562        return device;
 563}
 564
 565static void vfio_device_release(struct kref *kref)
 566{
 567        struct vfio_device *device = container_of(kref,
 568                                                  struct vfio_device, kref);
 569        struct vfio_group *group = device->group;
 570
 571        list_del(&device->group_next);
 572        mutex_unlock(&group->device_lock);
 573
 574        dev_set_drvdata(device->dev, NULL);
 575
 576        kfree(device);
 577
 578        /* vfio_del_group_dev may be waiting for this device */
 579        wake_up(&vfio.release_q);
 580}
 581
 582/* Device reference always implies a group reference */
 583void vfio_device_put(struct vfio_device *device)
 584{
 585        struct vfio_group *group = device->group;
 586        kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 587        vfio_group_put(group);
 588}
 589EXPORT_SYMBOL_GPL(vfio_device_put);
 590
 591static void vfio_device_get(struct vfio_device *device)
 592{
 593        vfio_group_get(device->group);
 594        kref_get(&device->kref);
 595}
 596
 597static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 598                                                 struct device *dev)
 599{
 600        struct vfio_device *device;
 601
 602        mutex_lock(&group->device_lock);
 603        list_for_each_entry(device, &group->device_list, group_next) {
 604                if (device->dev == dev) {
 605                        vfio_device_get(device);
 606                        mutex_unlock(&group->device_lock);
 607                        return device;
 608                }
 609        }
 610        mutex_unlock(&group->device_lock);
 611        return NULL;
 612}
 613
 614/*
 615 * Some drivers, like pci-stub, are only used to prevent other drivers from
 616 * claiming a device and are therefore perfectly legitimate for a user owned
 617 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 618 * of the device, but it does prevent the user from having direct access to
 619 * the device, which is useful in some circumstances.
 620 *
 621 * We also assume that we can include PCI interconnect devices, ie. bridges.
 622 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 623 * then all of the downstream devices will be part of the same IOMMU group as
 624 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 625 * breaks anything, it only does so for user owned devices downstream.  Note
 626 * that error notification via MSI can be affected for platforms that handle
 627 * MSI within the same IOVA space as DMA.
 628 */
 629static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 630
 631static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 632{
 633        if (dev_is_pci(dev)) {
 634                struct pci_dev *pdev = to_pci_dev(dev);
 635
 636                if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 637                        return true;
 638        }
 639
 640        return match_string(vfio_driver_whitelist,
 641                            ARRAY_SIZE(vfio_driver_whitelist),
 642                            drv->name) >= 0;
 643}
 644
 645/*
 646 * A vfio group is viable for use by userspace if all devices are in
 647 * one of the following states:
 648 *  - driver-less
 649 *  - bound to a vfio driver
 650 *  - bound to a whitelisted driver
 651 *  - a PCI interconnect device
 652 *
 653 * We use two methods to determine whether a device is bound to a vfio
 654 * driver.  The first is to test whether the device exists in the vfio
 655 * group.  The second is to test if the device exists on the group
 656 * unbound_list, indicating it's in the middle of transitioning from
 657 * a vfio driver to driver-less.
 658 */
 659static int vfio_dev_viable(struct device *dev, void *data)
 660{
 661        struct vfio_group *group = data;
 662        struct vfio_device *device;
 663        struct device_driver *drv = READ_ONCE(dev->driver);
 664        struct vfio_unbound_dev *unbound;
 665        int ret = -EINVAL;
 666
 667        mutex_lock(&group->unbound_lock);
 668        list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 669                if (dev == unbound->dev) {
 670                        ret = 0;
 671                        break;
 672                }
 673        }
 674        mutex_unlock(&group->unbound_lock);
 675
 676        if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 677                return 0;
 678
 679        device = vfio_group_get_device(group, dev);
 680        if (device) {
 681                vfio_device_put(device);
 682                return 0;
 683        }
 684
 685        return ret;
 686}
 687
 688/**
 689 * Async device support
 690 */
 691static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 692{
 693        struct vfio_device *device;
 694
 695        /* Do we already know about it?  We shouldn't */
 696        device = vfio_group_get_device(group, dev);
 697        if (WARN_ON_ONCE(device)) {
 698                vfio_device_put(device);
 699                return 0;
 700        }
 701
 702        /* Nothing to do for idle groups */
 703        if (!atomic_read(&group->container_users))
 704                return 0;
 705
 706        /* TODO Prevent device auto probing */
 707        WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
 708             iommu_group_id(group->iommu_group));
 709
 710        return 0;
 711}
 712
 713static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 714{
 715        /* We don't care what happens when the group isn't in use */
 716        if (!atomic_read(&group->container_users))
 717                return 0;
 718
 719        return vfio_dev_viable(dev, group);
 720}
 721
 722static int vfio_iommu_group_notifier(struct notifier_block *nb,
 723                                     unsigned long action, void *data)
 724{
 725        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 726        struct device *dev = data;
 727        struct vfio_unbound_dev *unbound;
 728
 729        /*
 730         * Need to go through a group_lock lookup to get a reference or we
 731         * risk racing a group being removed.  Ignore spurious notifies.
 732         */
 733        group = vfio_group_try_get(group);
 734        if (!group)
 735                return NOTIFY_OK;
 736
 737        switch (action) {
 738        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 739                vfio_group_nb_add_dev(group, dev);
 740                break;
 741        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 742                /*
 743                 * Nothing to do here.  If the device is in use, then the
 744                 * vfio sub-driver should block the remove callback until
 745                 * it is unused.  If the device is unused or attached to a
 746                 * stub driver, then it should be released and we don't
 747                 * care that it will be going away.
 748                 */
 749                break;
 750        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 751                pr_debug("%s: Device %s, group %d binding to driver\n",
 752                         __func__, dev_name(dev),
 753                         iommu_group_id(group->iommu_group));
 754                break;
 755        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 756                pr_debug("%s: Device %s, group %d bound to driver %s\n",
 757                         __func__, dev_name(dev),
 758                         iommu_group_id(group->iommu_group), dev->driver->name);
 759                BUG_ON(vfio_group_nb_verify(group, dev));
 760                break;
 761        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 762                pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
 763                         __func__, dev_name(dev),
 764                         iommu_group_id(group->iommu_group), dev->driver->name);
 765                break;
 766        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 767                pr_debug("%s: Device %s, group %d unbound from driver\n",
 768                         __func__, dev_name(dev),
 769                         iommu_group_id(group->iommu_group));
 770                /*
 771                 * XXX An unbound device in a live group is ok, but we'd
 772                 * really like to avoid the above BUG_ON by preventing other
 773                 * drivers from binding to it.  Once that occurs, we have to
 774                 * stop the system to maintain isolation.  At a minimum, we'd
 775                 * want a toggle to disable driver auto probe for this device.
 776                 */
 777
 778                mutex_lock(&group->unbound_lock);
 779                list_for_each_entry(unbound,
 780                                    &group->unbound_list, unbound_next) {
 781                        if (dev == unbound->dev) {
 782                                list_del(&unbound->unbound_next);
 783                                kfree(unbound);
 784                                break;
 785                        }
 786                }
 787                mutex_unlock(&group->unbound_lock);
 788                break;
 789        }
 790
 791        /*
 792         * If we're the last reference to the group, the group will be
 793         * released, which includes unregistering the iommu group notifier.
 794         * We hold a read-lock on that notifier list, unregistering needs
 795         * a write-lock... deadlock.  Release our reference asynchronously
 796         * to avoid that situation.
 797         */
 798        vfio_group_schedule_put(group);
 799        return NOTIFY_OK;
 800}
 801
 802/**
 803 * VFIO driver API
 804 */
 805int vfio_add_group_dev(struct device *dev,
 806                       const struct vfio_device_ops *ops, void *device_data)
 807{
 808        struct iommu_group *iommu_group;
 809        struct vfio_group *group;
 810        struct vfio_device *device;
 811
 812        iommu_group = iommu_group_get(dev);
 813        if (!iommu_group)
 814                return -EINVAL;
 815
 816        group = vfio_group_get_from_iommu(iommu_group);
 817        if (!group) {
 818                group = vfio_create_group(iommu_group);
 819                if (IS_ERR(group)) {
 820                        iommu_group_put(iommu_group);
 821                        return PTR_ERR(group);
 822                }
 823        } else {
 824                /*
 825                 * A found vfio_group already holds a reference to the
 826                 * iommu_group.  A created vfio_group keeps the reference.
 827                 */
 828                iommu_group_put(iommu_group);
 829        }
 830
 831        device = vfio_group_get_device(group, dev);
 832        if (device) {
 833                WARN(1, "Device %s already exists on group %d\n",
 834                     dev_name(dev), iommu_group_id(iommu_group));
 835                vfio_device_put(device);
 836                vfio_group_put(group);
 837                return -EBUSY;
 838        }
 839
 840        device = vfio_group_create_device(group, dev, ops, device_data);
 841        if (IS_ERR(device)) {
 842                vfio_group_put(group);
 843                return PTR_ERR(device);
 844        }
 845
 846        /*
 847         * Drop all but the vfio_device reference.  The vfio_device holds
 848         * a reference to the vfio_group, which holds a reference to the
 849         * iommu_group.
 850         */
 851        vfio_group_put(group);
 852
 853        return 0;
 854}
 855EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 856
 857/**
 858 * Get a reference to the vfio_device for a device.  Even if the
 859 * caller thinks they own the device, they could be racing with a
 860 * release call path, so we can't trust drvdata for the shortcut.
 861 * Go the long way around, from the iommu_group to the vfio_group
 862 * to the vfio_device.
 863 */
 864struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 865{
 866        struct vfio_group *group;
 867        struct vfio_device *device;
 868
 869        group = vfio_group_get_from_dev(dev);
 870        if (!group)
 871                return NULL;
 872
 873        device = vfio_group_get_device(group, dev);
 874        vfio_group_put(group);
 875
 876        return device;
 877}
 878EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 879
 880static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 881                                                     char *buf)
 882{
 883        struct vfio_device *it, *device = NULL;
 884
 885        mutex_lock(&group->device_lock);
 886        list_for_each_entry(it, &group->device_list, group_next) {
 887                if (!strcmp(dev_name(it->dev), buf)) {
 888                        device = it;
 889                        vfio_device_get(device);
 890                        break;
 891                }
 892        }
 893        mutex_unlock(&group->device_lock);
 894
 895        return device;
 896}
 897
 898/*
 899 * Caller must hold a reference to the vfio_device
 900 */
 901void *vfio_device_data(struct vfio_device *device)
 902{
 903        return device->device_data;
 904}
 905EXPORT_SYMBOL_GPL(vfio_device_data);
 906
 907/* Given a referenced group, check if it contains the device */
 908static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
 909{
 910        struct vfio_device *device;
 911
 912        device = vfio_group_get_device(group, dev);
 913        if (!device)
 914                return false;
 915
 916        vfio_device_put(device);
 917        return true;
 918}
 919
 920/*
 921 * Decrement the device reference count and wait for the device to be
 922 * removed.  Open file descriptors for the device... */
 923void *vfio_del_group_dev(struct device *dev)
 924{
 925        struct vfio_device *device = dev_get_drvdata(dev);
 926        struct vfio_group *group = device->group;
 927        void *device_data = device->device_data;
 928        struct vfio_unbound_dev *unbound;
 929        unsigned int i = 0;
 930        long ret;
 931        bool interrupted = false;
 932
 933        /*
 934         * The group exists so long as we have a device reference.  Get
 935         * a group reference and use it to scan for the device going away.
 936         */
 937        vfio_group_get(group);
 938
 939        /*
 940         * When the device is removed from the group, the group suddenly
 941         * becomes non-viable; the device has a driver (until the unbind
 942         * completes), but it's not present in the group.  This is bad news
 943         * for any external users that need to re-acquire a group reference
 944         * in order to match and release their existing reference.  To
 945         * solve this, we track such devices on the unbound_list to bridge
 946         * the gap until they're fully unbound.
 947         */
 948        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 949        if (unbound) {
 950                unbound->dev = dev;
 951                mutex_lock(&group->unbound_lock);
 952                list_add(&unbound->unbound_next, &group->unbound_list);
 953                mutex_unlock(&group->unbound_lock);
 954        }
 955        WARN_ON(!unbound);
 956
 957        vfio_device_put(device);
 958
 959        /*
 960         * If the device is still present in the group after the above
 961         * 'put', then it is in use and we need to request it from the
 962         * bus driver.  The driver may in turn need to request the
 963         * device from the user.  We send the request on an arbitrary
 964         * interval with counter to allow the driver to take escalating
 965         * measures to release the device if it has the ability to do so.
 966         */
 967        do {
 968                device = vfio_group_get_device(group, dev);
 969                if (!device)
 970                        break;
 971
 972                if (device->ops->request)
 973                        device->ops->request(device_data, i++);
 974
 975                vfio_device_put(device);
 976
 977                if (interrupted) {
 978                        ret = wait_event_timeout(vfio.release_q,
 979                                        !vfio_dev_present(group, dev), HZ * 10);
 980                } else {
 981                        ret = wait_event_interruptible_timeout(vfio.release_q,
 982                                        !vfio_dev_present(group, dev), HZ * 10);
 983                        if (ret == -ERESTARTSYS) {
 984                                interrupted = true;
 985                                dev_warn(dev,
 986                                         "Device is currently in use, task"
 987                                         " \"%s\" (%d) "
 988                                         "blocked until device is released",
 989                                         current->comm, task_pid_nr(current));
 990                        }
 991                }
 992        } while (ret <= 0);
 993
 994        /*
 995         * In order to support multiple devices per group, devices can be
 996         * plucked from the group while other devices in the group are still
 997         * in use.  The container persists with this group and those remaining
 998         * devices still attached.  If the user creates an isolation violation
 999         * by binding this device to another driver while the group is still in
1000         * use, that's their fault.  However, in the case of removing the last,
1001         * or potentially the only, device in the group there can be no other
1002         * in-use devices in the group.  The user has done their due diligence
1003         * and we should lay no claims to those devices.  In order to do that,
1004         * we need to make sure the group is detached from the container.
1005         * Without this stall, we're potentially racing with a user process
1006         * that may attempt to immediately bind this device to another driver.
1007         */
1008        if (list_empty(&group->device_list))
1009                wait_event(group->container_q, !group->container);
1010
1011        vfio_group_put(group);
1012
1013        return device_data;
1014}
1015EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1016
1017/**
1018 * VFIO base fd, /dev/vfio/vfio
1019 */
1020static long vfio_ioctl_check_extension(struct vfio_container *container,
1021                                       unsigned long arg)
1022{
1023        struct vfio_iommu_driver *driver;
1024        long ret = 0;
1025
1026        down_read(&container->group_lock);
1027
1028        driver = container->iommu_driver;
1029
1030        switch (arg) {
1031                /* No base extensions yet */
1032        default:
1033                /*
1034                 * If no driver is set, poll all registered drivers for
1035                 * extensions and return the first positive result.  If
1036                 * a driver is already set, further queries will be passed
1037                 * only to that driver.
1038                 */
1039                if (!driver) {
1040                        mutex_lock(&vfio.iommu_drivers_lock);
1041                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
1042                                            vfio_next) {
1043
1044#ifdef CONFIG_VFIO_NOIOMMU
1045                                if (!list_empty(&container->group_list) &&
1046                                    (container->noiommu !=
1047                                     (driver->ops == &vfio_noiommu_ops)))
1048                                        continue;
1049#endif
1050
1051                                if (!try_module_get(driver->ops->owner))
1052                                        continue;
1053
1054                                ret = driver->ops->ioctl(NULL,
1055                                                         VFIO_CHECK_EXTENSION,
1056                                                         arg);
1057                                module_put(driver->ops->owner);
1058                                if (ret > 0)
1059                                        break;
1060                        }
1061                        mutex_unlock(&vfio.iommu_drivers_lock);
1062                } else
1063                        ret = driver->ops->ioctl(container->iommu_data,
1064                                                 VFIO_CHECK_EXTENSION, arg);
1065        }
1066
1067        up_read(&container->group_lock);
1068
1069        return ret;
1070}
1071
1072/* hold write lock on container->group_lock */
1073static int __vfio_container_attach_groups(struct vfio_container *container,
1074                                          struct vfio_iommu_driver *driver,
1075                                          void *data)
1076{
1077        struct vfio_group *group;
1078        int ret = -ENODEV;
1079
1080        list_for_each_entry(group, &container->group_list, container_next) {
1081                ret = driver->ops->attach_group(data, group->iommu_group);
1082                if (ret)
1083                        goto unwind;
1084        }
1085
1086        return ret;
1087
1088unwind:
1089        list_for_each_entry_continue_reverse(group, &container->group_list,
1090                                             container_next) {
1091                driver->ops->detach_group(data, group->iommu_group);
1092        }
1093
1094        return ret;
1095}
1096
1097static long vfio_ioctl_set_iommu(struct vfio_container *container,
1098                                 unsigned long arg)
1099{
1100        struct vfio_iommu_driver *driver;
1101        long ret = -ENODEV;
1102
1103        down_write(&container->group_lock);
1104
1105        /*
1106         * The container is designed to be an unprivileged interface while
1107         * the group can be assigned to specific users.  Therefore, only by
1108         * adding a group to a container does the user get the privilege of
1109         * enabling the iommu, which may allocate finite resources.  There
1110         * is no unset_iommu, but by removing all the groups from a container,
1111         * the container is deprivileged and returns to an unset state.
1112         */
1113        if (list_empty(&container->group_list) || container->iommu_driver) {
1114                up_write(&container->group_lock);
1115                return -EINVAL;
1116        }
1117
1118        mutex_lock(&vfio.iommu_drivers_lock);
1119        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1120                void *data;
1121
1122#ifdef CONFIG_VFIO_NOIOMMU
1123                /*
1124                 * Only noiommu containers can use vfio-noiommu and noiommu
1125                 * containers can only use vfio-noiommu.
1126                 */
1127                if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1128                        continue;
1129#endif
1130
1131                if (!try_module_get(driver->ops->owner))
1132                        continue;
1133
1134                /*
1135                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1136                 * so test which iommu driver reported support for this
1137                 * extension and call open on them.  We also pass them the
1138                 * magic, allowing a single driver to support multiple
1139                 * interfaces if they'd like.
1140                 */
1141                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1142                        module_put(driver->ops->owner);
1143                        continue;
1144                }
1145
1146                data = driver->ops->open(arg);
1147                if (IS_ERR(data)) {
1148                        ret = PTR_ERR(data);
1149                        module_put(driver->ops->owner);
1150                        continue;
1151                }
1152
1153                ret = __vfio_container_attach_groups(container, driver, data);
1154                if (ret) {
1155                        driver->ops->release(data);
1156                        module_put(driver->ops->owner);
1157                        continue;
1158                }
1159
1160                container->iommu_driver = driver;
1161                container->iommu_data = data;
1162                break;
1163        }
1164
1165        mutex_unlock(&vfio.iommu_drivers_lock);
1166        up_write(&container->group_lock);
1167
1168        return ret;
1169}
1170
1171static long vfio_fops_unl_ioctl(struct file *filep,
1172                                unsigned int cmd, unsigned long arg)
1173{
1174        struct vfio_container *container = filep->private_data;
1175        struct vfio_iommu_driver *driver;
1176        void *data;
1177        long ret = -EINVAL;
1178
1179        if (!container)
1180                return ret;
1181
1182        switch (cmd) {
1183        case VFIO_GET_API_VERSION:
1184                ret = VFIO_API_VERSION;
1185                break;
1186        case VFIO_CHECK_EXTENSION:
1187                ret = vfio_ioctl_check_extension(container, arg);
1188                break;
1189        case VFIO_SET_IOMMU:
1190                ret = vfio_ioctl_set_iommu(container, arg);
1191                break;
1192        default:
1193                driver = container->iommu_driver;
1194                data = container->iommu_data;
1195
1196                if (driver) /* passthrough all unrecognized ioctls */
1197                        ret = driver->ops->ioctl(data, cmd, arg);
1198        }
1199
1200        return ret;
1201}
1202
1203#ifdef CONFIG_COMPAT
1204static long vfio_fops_compat_ioctl(struct file *filep,
1205                                   unsigned int cmd, unsigned long arg)
1206{
1207        arg = (unsigned long)compat_ptr(arg);
1208        return vfio_fops_unl_ioctl(filep, cmd, arg);
1209}
1210#endif  /* CONFIG_COMPAT */
1211
1212static int vfio_fops_open(struct inode *inode, struct file *filep)
1213{
1214        struct vfio_container *container;
1215
1216        container = kzalloc(sizeof(*container), GFP_KERNEL);
1217        if (!container)
1218                return -ENOMEM;
1219
1220        INIT_LIST_HEAD(&container->group_list);
1221        init_rwsem(&container->group_lock);
1222        kref_init(&container->kref);
1223
1224        filep->private_data = container;
1225
1226        return 0;
1227}
1228
1229static int vfio_fops_release(struct inode *inode, struct file *filep)
1230{
1231        struct vfio_container *container = filep->private_data;
1232
1233        filep->private_data = NULL;
1234
1235        vfio_container_put(container);
1236
1237        return 0;
1238}
1239
1240/*
1241 * Once an iommu driver is set, we optionally pass read/write/mmap
1242 * on to the driver, allowing management interfaces beyond ioctl.
1243 */
1244static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1245                              size_t count, loff_t *ppos)
1246{
1247        struct vfio_container *container = filep->private_data;
1248        struct vfio_iommu_driver *driver;
1249        ssize_t ret = -EINVAL;
1250
1251        driver = container->iommu_driver;
1252        if (likely(driver && driver->ops->read))
1253                ret = driver->ops->read(container->iommu_data,
1254                                        buf, count, ppos);
1255
1256        return ret;
1257}
1258
1259static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1260                               size_t count, loff_t *ppos)
1261{
1262        struct vfio_container *container = filep->private_data;
1263        struct vfio_iommu_driver *driver;
1264        ssize_t ret = -EINVAL;
1265
1266        driver = container->iommu_driver;
1267        if (likely(driver && driver->ops->write))
1268                ret = driver->ops->write(container->iommu_data,
1269                                         buf, count, ppos);
1270
1271        return ret;
1272}
1273
1274static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1275{
1276        struct vfio_container *container = filep->private_data;
1277        struct vfio_iommu_driver *driver;
1278        int ret = -EINVAL;
1279
1280        driver = container->iommu_driver;
1281        if (likely(driver && driver->ops->mmap))
1282                ret = driver->ops->mmap(container->iommu_data, vma);
1283
1284        return ret;
1285}
1286
1287static const struct file_operations vfio_fops = {
1288        .owner          = THIS_MODULE,
1289        .open           = vfio_fops_open,
1290        .release        = vfio_fops_release,
1291        .read           = vfio_fops_read,
1292        .write          = vfio_fops_write,
1293        .unlocked_ioctl = vfio_fops_unl_ioctl,
1294#ifdef CONFIG_COMPAT
1295        .compat_ioctl   = vfio_fops_compat_ioctl,
1296#endif
1297        .mmap           = vfio_fops_mmap,
1298};
1299
1300/**
1301 * VFIO Group fd, /dev/vfio/$GROUP
1302 */
1303static void __vfio_group_unset_container(struct vfio_group *group)
1304{
1305        struct vfio_container *container = group->container;
1306        struct vfio_iommu_driver *driver;
1307
1308        down_write(&container->group_lock);
1309
1310        driver = container->iommu_driver;
1311        if (driver)
1312                driver->ops->detach_group(container->iommu_data,
1313                                          group->iommu_group);
1314
1315        group->container = NULL;
1316        wake_up(&group->container_q);
1317        list_del(&group->container_next);
1318
1319        /* Detaching the last group deprivileges a container, remove iommu */
1320        if (driver && list_empty(&container->group_list)) {
1321                driver->ops->release(container->iommu_data);
1322                module_put(driver->ops->owner);
1323                container->iommu_driver = NULL;
1324                container->iommu_data = NULL;
1325        }
1326
1327        up_write(&container->group_lock);
1328
1329        vfio_container_put(container);
1330}
1331
1332/*
1333 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1334 * if there was no container to unset.  Since the ioctl is called on
1335 * the group, we know that still exists, therefore the only valid
1336 * transition here is 1->0.
1337 */
1338static int vfio_group_unset_container(struct vfio_group *group)
1339{
1340        int users = atomic_cmpxchg(&group->container_users, 1, 0);
1341
1342        if (!users)
1343                return -EINVAL;
1344        if (users != 1)
1345                return -EBUSY;
1346
1347        __vfio_group_unset_container(group);
1348
1349        return 0;
1350}
1351
1352/*
1353 * When removing container users, anything that removes the last user
1354 * implicitly removes the group from the container.  That is, if the
1355 * group file descriptor is closed, as well as any device file descriptors,
1356 * the group is free.
1357 */
1358static void vfio_group_try_dissolve_container(struct vfio_group *group)
1359{
1360        if (0 == atomic_dec_if_positive(&group->container_users))
1361                __vfio_group_unset_container(group);
1362}
1363
1364static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1365{
1366        struct fd f;
1367        struct vfio_container *container;
1368        struct vfio_iommu_driver *driver;
1369        int ret = 0;
1370
1371        if (atomic_read(&group->container_users))
1372                return -EINVAL;
1373
1374        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1375                return -EPERM;
1376
1377        f = fdget(container_fd);
1378        if (!f.file)
1379                return -EBADF;
1380
1381        /* Sanity check, is this really our fd? */
1382        if (f.file->f_op != &vfio_fops) {
1383                fdput(f);
1384                return -EINVAL;
1385        }
1386
1387        container = f.file->private_data;
1388        WARN_ON(!container); /* fget ensures we don't race vfio_release */
1389
1390        down_write(&container->group_lock);
1391
1392        /* Real groups and fake groups cannot mix */
1393        if (!list_empty(&container->group_list) &&
1394            container->noiommu != group->noiommu) {
1395                ret = -EPERM;
1396                goto unlock_out;
1397        }
1398
1399        driver = container->iommu_driver;
1400        if (driver) {
1401                ret = driver->ops->attach_group(container->iommu_data,
1402                                                group->iommu_group);
1403                if (ret)
1404                        goto unlock_out;
1405        }
1406
1407        group->container = container;
1408        container->noiommu = group->noiommu;
1409        list_add(&group->container_next, &container->group_list);
1410
1411        /* Get a reference on the container and mark a user within the group */
1412        vfio_container_get(container);
1413        atomic_inc(&group->container_users);
1414
1415unlock_out:
1416        up_write(&container->group_lock);
1417        fdput(f);
1418        return ret;
1419}
1420
1421static bool vfio_group_viable(struct vfio_group *group)
1422{
1423        return (iommu_group_for_each_dev(group->iommu_group,
1424                                         group, vfio_dev_viable) == 0);
1425}
1426
1427static int vfio_group_add_container_user(struct vfio_group *group)
1428{
1429        if (!atomic_inc_not_zero(&group->container_users))
1430                return -EINVAL;
1431
1432        if (group->noiommu) {
1433                atomic_dec(&group->container_users);
1434                return -EPERM;
1435        }
1436        if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1437                atomic_dec(&group->container_users);
1438                return -EINVAL;
1439        }
1440
1441        return 0;
1442}
1443
1444static const struct file_operations vfio_device_fops;
1445
1446static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1447{
1448        struct vfio_device *device;
1449        struct file *filep;
1450        int ret;
1451
1452        if (0 == atomic_read(&group->container_users) ||
1453            !group->container->iommu_driver || !vfio_group_viable(group))
1454                return -EINVAL;
1455
1456        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1457                return -EPERM;
1458
1459        device = vfio_device_get_from_name(group, buf);
1460        if (!device)
1461                return -ENODEV;
1462
1463        ret = device->ops->open(device->device_data);
1464        if (ret) {
1465                vfio_device_put(device);
1466                return ret;
1467        }
1468
1469        /*
1470         * We can't use anon_inode_getfd() because we need to modify
1471         * the f_mode flags directly to allow more than just ioctls
1472         */
1473        ret = get_unused_fd_flags(O_CLOEXEC);
1474        if (ret < 0) {
1475                device->ops->release(device->device_data);
1476                vfio_device_put(device);
1477                return ret;
1478        }
1479
1480        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1481                                   device, O_RDWR);
1482        if (IS_ERR(filep)) {
1483                put_unused_fd(ret);
1484                ret = PTR_ERR(filep);
1485                device->ops->release(device->device_data);
1486                vfio_device_put(device);
1487                return ret;
1488        }
1489
1490        /*
1491         * TODO: add an anon_inode interface to do this.
1492         * Appears to be missing by lack of need rather than
1493         * explicitly prevented.  Now there's need.
1494         */
1495        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1496
1497        atomic_inc(&group->container_users);
1498
1499        fd_install(ret, filep);
1500
1501        if (group->noiommu)
1502                dev_warn(device->dev, "vfio-noiommu device opened by user "
1503                         "(%s:%d)\n", current->comm, task_pid_nr(current));
1504
1505        return ret;
1506}
1507
1508static long vfio_group_fops_unl_ioctl(struct file *filep,
1509                                      unsigned int cmd, unsigned long arg)
1510{
1511        struct vfio_group *group = filep->private_data;
1512        long ret = -ENOTTY;
1513
1514        switch (cmd) {
1515        case VFIO_GROUP_GET_STATUS:
1516        {
1517                struct vfio_group_status status;
1518                unsigned long minsz;
1519
1520                minsz = offsetofend(struct vfio_group_status, flags);
1521
1522                if (copy_from_user(&status, (void __user *)arg, minsz))
1523                        return -EFAULT;
1524
1525                if (status.argsz < minsz)
1526                        return -EINVAL;
1527
1528                status.flags = 0;
1529
1530                if (vfio_group_viable(group))
1531                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1532
1533                if (group->container)
1534                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1535
1536                if (copy_to_user((void __user *)arg, &status, minsz))
1537                        return -EFAULT;
1538
1539                ret = 0;
1540                break;
1541        }
1542        case VFIO_GROUP_SET_CONTAINER:
1543        {
1544                int fd;
1545
1546                if (get_user(fd, (int __user *)arg))
1547                        return -EFAULT;
1548
1549                if (fd < 0)
1550                        return -EINVAL;
1551
1552                ret = vfio_group_set_container(group, fd);
1553                break;
1554        }
1555        case VFIO_GROUP_UNSET_CONTAINER:
1556                ret = vfio_group_unset_container(group);
1557                break;
1558        case VFIO_GROUP_GET_DEVICE_FD:
1559        {
1560                char *buf;
1561
1562                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1563                if (IS_ERR(buf))
1564                        return PTR_ERR(buf);
1565
1566                ret = vfio_group_get_device_fd(group, buf);
1567                kfree(buf);
1568                break;
1569        }
1570        }
1571
1572        return ret;
1573}
1574
1575#ifdef CONFIG_COMPAT
1576static long vfio_group_fops_compat_ioctl(struct file *filep,
1577                                         unsigned int cmd, unsigned long arg)
1578{
1579        arg = (unsigned long)compat_ptr(arg);
1580        return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1581}
1582#endif  /* CONFIG_COMPAT */
1583
1584static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1585{
1586        struct vfio_group *group;
1587        int opened;
1588
1589        group = vfio_group_get_from_minor(iminor(inode));
1590        if (!group)
1591                return -ENODEV;
1592
1593        if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1594                vfio_group_put(group);
1595                return -EPERM;
1596        }
1597
1598        /* Do we need multiple instances of the group open?  Seems not. */
1599        opened = atomic_cmpxchg(&group->opened, 0, 1);
1600        if (opened) {
1601                vfio_group_put(group);
1602                return -EBUSY;
1603        }
1604
1605        /* Is something still in use from a previous open? */
1606        if (group->container) {
1607                atomic_dec(&group->opened);
1608                vfio_group_put(group);
1609                return -EBUSY;
1610        }
1611
1612        /* Warn if previous user didn't cleanup and re-init to drop them */
1613        if (WARN_ON(group->notifier.head))
1614                BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1615
1616        filep->private_data = group;
1617
1618        return 0;
1619}
1620
1621static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1622{
1623        struct vfio_group *group = filep->private_data;
1624
1625        filep->private_data = NULL;
1626
1627        vfio_group_try_dissolve_container(group);
1628
1629        atomic_dec(&group->opened);
1630
1631        vfio_group_put(group);
1632
1633        return 0;
1634}
1635
1636static const struct file_operations vfio_group_fops = {
1637        .owner          = THIS_MODULE,
1638        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1639#ifdef CONFIG_COMPAT
1640        .compat_ioctl   = vfio_group_fops_compat_ioctl,
1641#endif
1642        .open           = vfio_group_fops_open,
1643        .release        = vfio_group_fops_release,
1644};
1645
1646/**
1647 * VFIO Device fd
1648 */
1649static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1650{
1651        struct vfio_device *device = filep->private_data;
1652
1653        device->ops->release(device->device_data);
1654
1655        vfio_group_try_dissolve_container(device->group);
1656
1657        vfio_device_put(device);
1658
1659        return 0;
1660}
1661
1662static long vfio_device_fops_unl_ioctl(struct file *filep,
1663                                       unsigned int cmd, unsigned long arg)
1664{
1665        struct vfio_device *device = filep->private_data;
1666
1667        if (unlikely(!device->ops->ioctl))
1668                return -EINVAL;
1669
1670        return device->ops->ioctl(device->device_data, cmd, arg);
1671}
1672
1673static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1674                                     size_t count, loff_t *ppos)
1675{
1676        struct vfio_device *device = filep->private_data;
1677
1678        if (unlikely(!device->ops->read))
1679                return -EINVAL;
1680
1681        return device->ops->read(device->device_data, buf, count, ppos);
1682}
1683
1684static ssize_t vfio_device_fops_write(struct file *filep,
1685                                      const char __user *buf,
1686                                      size_t count, loff_t *ppos)
1687{
1688        struct vfio_device *device = filep->private_data;
1689
1690        if (unlikely(!device->ops->write))
1691                return -EINVAL;
1692
1693        return device->ops->write(device->device_data, buf, count, ppos);
1694}
1695
1696static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1697{
1698        struct vfio_device *device = filep->private_data;
1699
1700        if (unlikely(!device->ops->mmap))
1701                return -EINVAL;
1702
1703        return device->ops->mmap(device->device_data, vma);
1704}
1705
1706#ifdef CONFIG_COMPAT
1707static long vfio_device_fops_compat_ioctl(struct file *filep,
1708                                          unsigned int cmd, unsigned long arg)
1709{
1710        arg = (unsigned long)compat_ptr(arg);
1711        return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1712}
1713#endif  /* CONFIG_COMPAT */
1714
1715static const struct file_operations vfio_device_fops = {
1716        .owner          = THIS_MODULE,
1717        .release        = vfio_device_fops_release,
1718        .read           = vfio_device_fops_read,
1719        .write          = vfio_device_fops_write,
1720        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1721#ifdef CONFIG_COMPAT
1722        .compat_ioctl   = vfio_device_fops_compat_ioctl,
1723#endif
1724        .mmap           = vfio_device_fops_mmap,
1725};
1726
1727/**
1728 * External user API, exported by symbols to be linked dynamically.
1729 *
1730 * The protocol includes:
1731 *  1. do normal VFIO init operation:
1732 *      - opening a new container;
1733 *      - attaching group(s) to it;
1734 *      - setting an IOMMU driver for a container.
1735 * When IOMMU is set for a container, all groups in it are
1736 * considered ready to use by an external user.
1737 *
1738 * 2. User space passes a group fd to an external user.
1739 * The external user calls vfio_group_get_external_user()
1740 * to verify that:
1741 *      - the group is initialized;
1742 *      - IOMMU is set for it.
1743 * If both checks passed, vfio_group_get_external_user()
1744 * increments the container user counter to prevent
1745 * the VFIO group from disposal before KVM exits.
1746 *
1747 * 3. The external user calls vfio_external_user_iommu_id()
1748 * to know an IOMMU ID.
1749 *
1750 * 4. When the external KVM finishes, it calls
1751 * vfio_group_put_external_user() to release the VFIO group.
1752 * This call decrements the container user counter.
1753 */
1754struct vfio_group *vfio_group_get_external_user(struct file *filep)
1755{
1756        struct vfio_group *group = filep->private_data;
1757        int ret;
1758
1759        if (filep->f_op != &vfio_group_fops)
1760                return ERR_PTR(-EINVAL);
1761
1762        ret = vfio_group_add_container_user(group);
1763        if (ret)
1764                return ERR_PTR(ret);
1765
1766        vfio_group_get(group);
1767
1768        return group;
1769}
1770EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1771
1772void vfio_group_put_external_user(struct vfio_group *group)
1773{
1774        vfio_group_try_dissolve_container(group);
1775        vfio_group_put(group);
1776}
1777EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1778
1779bool vfio_external_group_match_file(struct vfio_group *test_group,
1780                                    struct file *filep)
1781{
1782        struct vfio_group *group = filep->private_data;
1783
1784        return (filep->f_op == &vfio_group_fops) && (group == test_group);
1785}
1786EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1787
1788int vfio_external_user_iommu_id(struct vfio_group *group)
1789{
1790        return iommu_group_id(group->iommu_group);
1791}
1792EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1793
1794long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1795{
1796        return vfio_ioctl_check_extension(group->container, arg);
1797}
1798EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1799
1800/**
1801 * Sub-module support
1802 */
1803/*
1804 * Helper for managing a buffer of info chain capabilities, allocate or
1805 * reallocate a buffer with additional @size, filling in @id and @version
1806 * of the capability.  A pointer to the new capability is returned.
1807 *
1808 * NB. The chain is based at the head of the buffer, so new entries are
1809 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1810 * next offsets prior to copying to the user buffer.
1811 */
1812struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1813                                               size_t size, u16 id, u16 version)
1814{
1815        void *buf;
1816        struct vfio_info_cap_header *header, *tmp;
1817
1818        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1819        if (!buf) {
1820                kfree(caps->buf);
1821                caps->size = 0;
1822                return ERR_PTR(-ENOMEM);
1823        }
1824
1825        caps->buf = buf;
1826        header = buf + caps->size;
1827
1828        /* Eventually copied to user buffer, zero */
1829        memset(header, 0, size);
1830
1831        header->id = id;
1832        header->version = version;
1833
1834        /* Add to the end of the capability chain */
1835        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1836                ; /* nothing */
1837
1838        tmp->next = caps->size;
1839        caps->size += size;
1840
1841        return header;
1842}
1843EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1844
1845void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1846{
1847        struct vfio_info_cap_header *tmp;
1848        void *buf = (void *)caps->buf;
1849
1850        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1851                tmp->next += offset;
1852}
1853EXPORT_SYMBOL(vfio_info_cap_shift);
1854
1855int vfio_info_add_capability(struct vfio_info_cap *caps,
1856                             struct vfio_info_cap_header *cap, size_t size)
1857{
1858        struct vfio_info_cap_header *header;
1859
1860        header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1861        if (IS_ERR(header))
1862                return PTR_ERR(header);
1863
1864        memcpy(header + 1, cap + 1, size - sizeof(*header));
1865
1866        return 0;
1867}
1868EXPORT_SYMBOL(vfio_info_add_capability);
1869
1870int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1871                                       int max_irq_type, size_t *data_size)
1872{
1873        unsigned long minsz;
1874        size_t size;
1875
1876        minsz = offsetofend(struct vfio_irq_set, count);
1877
1878        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1879            (hdr->count >= (U32_MAX - hdr->start)) ||
1880            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1881                                VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1882                return -EINVAL;
1883
1884        if (data_size)
1885                *data_size = 0;
1886
1887        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1888                return -EINVAL;
1889
1890        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1891        case VFIO_IRQ_SET_DATA_NONE:
1892                size = 0;
1893                break;
1894        case VFIO_IRQ_SET_DATA_BOOL:
1895                size = sizeof(uint8_t);
1896                break;
1897        case VFIO_IRQ_SET_DATA_EVENTFD:
1898                size = sizeof(int32_t);
1899                break;
1900        default:
1901                return -EINVAL;
1902        }
1903
1904        if (size) {
1905                if (hdr->argsz - minsz < hdr->count * size)
1906                        return -EINVAL;
1907
1908                if (!data_size)
1909                        return -EINVAL;
1910
1911                *data_size = hdr->count * size;
1912        }
1913
1914        return 0;
1915}
1916EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1917
1918/*
1919 * Pin a set of guest PFNs and return their associated host PFNs for local
1920 * domain only.
1921 * @dev [in]     : device
1922 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1923 * @npage [in]   : count of elements in user_pfn array.  This count should not
1924 *                 be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1925 * @prot [in]    : protection flags
1926 * @phys_pfn[out]: array of host PFNs
1927 * Return error or number of pages pinned.
1928 */
1929int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1930                   int prot, unsigned long *phys_pfn)
1931{
1932        struct vfio_container *container;
1933        struct vfio_group *group;
1934        struct vfio_iommu_driver *driver;
1935        int ret;
1936
1937        if (!dev || !user_pfn || !phys_pfn || !npage)
1938                return -EINVAL;
1939
1940        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1941                return -E2BIG;
1942
1943        group = vfio_group_get_from_dev(dev);
1944        if (!group)
1945                return -ENODEV;
1946
1947        ret = vfio_group_add_container_user(group);
1948        if (ret)
1949                goto err_pin_pages;
1950
1951        container = group->container;
1952        driver = container->iommu_driver;
1953        if (likely(driver && driver->ops->pin_pages))
1954                ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1955                                             npage, prot, phys_pfn);
1956        else
1957                ret = -ENOTTY;
1958
1959        vfio_group_try_dissolve_container(group);
1960
1961err_pin_pages:
1962        vfio_group_put(group);
1963        return ret;
1964}
1965EXPORT_SYMBOL(vfio_pin_pages);
1966
1967/*
1968 * Unpin set of host PFNs for local domain only.
1969 * @dev [in]     : device
1970 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1971 *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1972 * @npage [in]   : count of elements in user_pfn array.  This count should not
1973 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1974 * Return error or number of pages unpinned.
1975 */
1976int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1977{
1978        struct vfio_container *container;
1979        struct vfio_group *group;
1980        struct vfio_iommu_driver *driver;
1981        int ret;
1982
1983        if (!dev || !user_pfn || !npage)
1984                return -EINVAL;
1985
1986        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1987                return -E2BIG;
1988
1989        group = vfio_group_get_from_dev(dev);
1990        if (!group)
1991                return -ENODEV;
1992
1993        ret = vfio_group_add_container_user(group);
1994        if (ret)
1995                goto err_unpin_pages;
1996
1997        container = group->container;
1998        driver = container->iommu_driver;
1999        if (likely(driver && driver->ops->unpin_pages))
2000                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2001                                               npage);
2002        else
2003                ret = -ENOTTY;
2004
2005        vfio_group_try_dissolve_container(group);
2006
2007err_unpin_pages:
2008        vfio_group_put(group);
2009        return ret;
2010}
2011EXPORT_SYMBOL(vfio_unpin_pages);
2012
2013static int vfio_register_iommu_notifier(struct vfio_group *group,
2014                                        unsigned long *events,
2015                                        struct notifier_block *nb)
2016{
2017        struct vfio_container *container;
2018        struct vfio_iommu_driver *driver;
2019        int ret;
2020
2021        ret = vfio_group_add_container_user(group);
2022        if (ret)
2023                return -EINVAL;
2024
2025        container = group->container;
2026        driver = container->iommu_driver;
2027        if (likely(driver && driver->ops->register_notifier))
2028                ret = driver->ops->register_notifier(container->iommu_data,
2029                                                     events, nb);
2030        else
2031                ret = -ENOTTY;
2032
2033        vfio_group_try_dissolve_container(group);
2034
2035        return ret;
2036}
2037
2038static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2039                                          struct notifier_block *nb)
2040{
2041        struct vfio_container *container;
2042        struct vfio_iommu_driver *driver;
2043        int ret;
2044
2045        ret = vfio_group_add_container_user(group);
2046        if (ret)
2047                return -EINVAL;
2048
2049        container = group->container;
2050        driver = container->iommu_driver;
2051        if (likely(driver && driver->ops->unregister_notifier))
2052                ret = driver->ops->unregister_notifier(container->iommu_data,
2053                                                       nb);
2054        else
2055                ret = -ENOTTY;
2056
2057        vfio_group_try_dissolve_container(group);
2058
2059        return ret;
2060}
2061
2062void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2063{
2064        group->kvm = kvm;
2065        blocking_notifier_call_chain(&group->notifier,
2066                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2067}
2068EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2069
2070static int vfio_register_group_notifier(struct vfio_group *group,
2071                                        unsigned long *events,
2072                                        struct notifier_block *nb)
2073{
2074        int ret;
2075        bool set_kvm = false;
2076
2077        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2078                set_kvm = true;
2079
2080        /* clear known events */
2081        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2082
2083        /* refuse to continue if still events remaining */
2084        if (*events)
2085                return -EINVAL;
2086
2087        ret = vfio_group_add_container_user(group);
2088        if (ret)
2089                return -EINVAL;
2090
2091        ret = blocking_notifier_chain_register(&group->notifier, nb);
2092
2093        /*
2094         * The attaching of kvm and vfio_group might already happen, so
2095         * here we replay once upon registration.
2096         */
2097        if (!ret && set_kvm && group->kvm)
2098                blocking_notifier_call_chain(&group->notifier,
2099                                        VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2100
2101        vfio_group_try_dissolve_container(group);
2102
2103        return ret;
2104}
2105
2106static int vfio_unregister_group_notifier(struct vfio_group *group,
2107                                         struct notifier_block *nb)
2108{
2109        int ret;
2110
2111        ret = vfio_group_add_container_user(group);
2112        if (ret)
2113                return -EINVAL;
2114
2115        ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2116
2117        vfio_group_try_dissolve_container(group);
2118
2119        return ret;
2120}
2121
2122int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2123                           unsigned long *events, struct notifier_block *nb)
2124{
2125        struct vfio_group *group;
2126        int ret;
2127
2128        if (!dev || !nb || !events || (*events == 0))
2129                return -EINVAL;
2130
2131        group = vfio_group_get_from_dev(dev);
2132        if (!group)
2133                return -ENODEV;
2134
2135        switch (type) {
2136        case VFIO_IOMMU_NOTIFY:
2137                ret = vfio_register_iommu_notifier(group, events, nb);
2138                break;
2139        case VFIO_GROUP_NOTIFY:
2140                ret = vfio_register_group_notifier(group, events, nb);
2141                break;
2142        default:
2143                ret = -EINVAL;
2144        }
2145
2146        vfio_group_put(group);
2147        return ret;
2148}
2149EXPORT_SYMBOL(vfio_register_notifier);
2150
2151int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2152                             struct notifier_block *nb)
2153{
2154        struct vfio_group *group;
2155        int ret;
2156
2157        if (!dev || !nb)
2158                return -EINVAL;
2159
2160        group = vfio_group_get_from_dev(dev);
2161        if (!group)
2162                return -ENODEV;
2163
2164        switch (type) {
2165        case VFIO_IOMMU_NOTIFY:
2166                ret = vfio_unregister_iommu_notifier(group, nb);
2167                break;
2168        case VFIO_GROUP_NOTIFY:
2169                ret = vfio_unregister_group_notifier(group, nb);
2170                break;
2171        default:
2172                ret = -EINVAL;
2173        }
2174
2175        vfio_group_put(group);
2176        return ret;
2177}
2178EXPORT_SYMBOL(vfio_unregister_notifier);
2179
2180/**
2181 * Module/class support
2182 */
2183static char *vfio_devnode(struct device *dev, umode_t *mode)
2184{
2185        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2186}
2187
2188static struct miscdevice vfio_dev = {
2189        .minor = VFIO_MINOR,
2190        .name = "vfio",
2191        .fops = &vfio_fops,
2192        .nodename = "vfio/vfio",
2193        .mode = S_IRUGO | S_IWUGO,
2194};
2195
2196static int __init vfio_init(void)
2197{
2198        int ret;
2199
2200        idr_init(&vfio.group_idr);
2201        mutex_init(&vfio.group_lock);
2202        mutex_init(&vfio.iommu_drivers_lock);
2203        INIT_LIST_HEAD(&vfio.group_list);
2204        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2205        init_waitqueue_head(&vfio.release_q);
2206
2207        ret = misc_register(&vfio_dev);
2208        if (ret) {
2209                pr_err("vfio: misc device register failed\n");
2210                return ret;
2211        }
2212
2213        /* /dev/vfio/$GROUP */
2214        vfio.class = class_create(THIS_MODULE, "vfio");
2215        if (IS_ERR(vfio.class)) {
2216                ret = PTR_ERR(vfio.class);
2217                goto err_class;
2218        }
2219
2220        vfio.class->devnode = vfio_devnode;
2221
2222        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2223        if (ret)
2224                goto err_alloc_chrdev;
2225
2226        cdev_init(&vfio.group_cdev, &vfio_group_fops);
2227        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2228        if (ret)
2229                goto err_cdev_add;
2230
2231        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2232
2233#ifdef CONFIG_VFIO_NOIOMMU
2234        vfio_register_iommu_driver(&vfio_noiommu_ops);
2235#endif
2236        return 0;
2237
2238err_cdev_add:
2239        unregister_chrdev_region(vfio.group_devt, MINORMASK);
2240err_alloc_chrdev:
2241        class_destroy(vfio.class);
2242        vfio.class = NULL;
2243err_class:
2244        misc_deregister(&vfio_dev);
2245        return ret;
2246}
2247
2248static void __exit vfio_cleanup(void)
2249{
2250        WARN_ON(!list_empty(&vfio.group_list));
2251
2252#ifdef CONFIG_VFIO_NOIOMMU
2253        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2254#endif
2255        idr_destroy(&vfio.group_idr);
2256        cdev_del(&vfio.group_cdev);
2257        unregister_chrdev_region(vfio.group_devt, MINORMASK);
2258        class_destroy(vfio.class);
2259        vfio.class = NULL;
2260        misc_deregister(&vfio_dev);
2261}
2262
2263module_init(vfio_init);
2264module_exit(vfio_cleanup);
2265
2266MODULE_VERSION(DRIVER_VERSION);
2267MODULE_LICENSE("GPL v2");
2268MODULE_AUTHOR(DRIVER_AUTHOR);
2269MODULE_DESCRIPTION(DRIVER_DESC);
2270MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2271MODULE_ALIAS("devname:vfio/vfio");
2272MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2273