linux/drivers/vfio/vfio.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>

#define DRIVER_VERSION  "0.3"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "VFIO - User Level meta-driver"

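/*
 * Illustrative userspace flow (a hedged sketch, not part of this driver):
 * the container, group and device file descriptors below are the three
 * objects this file implements.  VFIO_TYPE1_IOMMU, the group number "26"
 * and the device name "0000:06:0d.0" are assumptions for the example.
 *
 *      int container, group, device;
 *
 *      container = open("/dev/vfio/vfio", O_RDWR);
 *      if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *              return -1;      (unknown API version)
 *
 *      group = open("/dev/vfio/26", O_RDWR);
 *      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *      ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *      device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */
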
static struct vfio {
        struct class                    *class;
        struct list_head                iommu_drivers_list;
        struct mutex                    iommu_drivers_lock;
        struct list_head                group_list;
        struct idr                      group_idr;
        struct mutex                    group_lock;
        struct cdev                     group_cdev;
        dev_t                           group_devt;
        wait_queue_head_t               release_q;
} vfio;

struct vfio_iommu_driver {
        const struct vfio_iommu_driver_ops      *ops;
        struct list_head                        vfio_next;
};

struct vfio_container {
        struct kref                     kref;
        struct list_head                group_list;
        struct rw_semaphore             group_lock;
        struct vfio_iommu_driver        *iommu_driver;
        void                            *iommu_data;
        bool                            noiommu;
};

struct vfio_unbound_dev {
        struct device                   *dev;
        struct list_head                unbound_next;
};

struct vfio_group {
        struct kref                     kref;
        int                             minor;
        atomic_t                        container_users;
        struct iommu_group              *iommu_group;
        struct vfio_container           *container;
        struct list_head                device_list;
        struct mutex                    device_lock;
        struct device                   *dev;
        struct notifier_block           nb;
        struct list_head                vfio_next;
        struct list_head                container_next;
        struct list_head                unbound_list;
        struct mutex                    unbound_lock;
        atomic_t                        opened;
        wait_queue_head_t               container_q;
        bool                            noiommu;
        struct kvm                      *kvm;
        struct blocking_notifier_head   notifier;
};

struct vfio_device {
        struct kref                     kref;
        struct device                   *dev;
        const struct vfio_device_ops    *ops;
        struct vfio_group               *group;
        struct list_head                group_next;
        void                            *device_data;
};

#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
                   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions; any use case other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
        struct iommu_group *group;
        int __maybe_unused ret;

        group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
        /*
         * With noiommu enabled, an IOMMU group will be created for a device
         * that doesn't already have one and doesn't have iommu_ops on its
         * bus.  We set iommudata simply to be able to identify these groups
         * as special use and for reclamation later.
         */
        if (group || !noiommu || iommu_present(dev->bus))
                return group;

        group = iommu_group_alloc();
        if (IS_ERR(group))
                return NULL;

        iommu_group_set_name(group, "vfio-noiommu");
        iommu_group_set_iommudata(group, &noiommu, NULL);
        ret = iommu_group_add_device(group, dev);
        if (ret) {
                iommu_group_put(group);
                return NULL;
        }

        /*
         * Where to taint?  At this point we've added an IOMMU group for a
         * device that is not backed by iommu_ops, therefore any iommu_
         * callback using iommu_ops can legitimately Oops.  So, while we may
         * be about to give a DMA capable device to a user without IOMMU
         * protection, which is clearly taint-worthy, let's go ahead and do
         * it here.
         */
        add_taint(TAINT_USER, LOCKDEP_STILL_OK);
        dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

        return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);

void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
        if (iommu_group_get_iommudata(group) == &noiommu)
                iommu_group_remove_device(dev);
#endif

        iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
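
/*
 * Illustrative pairing in a VFIO bus driver (a hedged sketch; my_probe(),
 * my_remove(), my_ops and my_data are hypothetical names, not part of
 * this file):
 *
 *      static int my_probe(struct device *dev)
 *      {
 *              struct iommu_group *group = vfio_iommu_group_get(dev);
 *              int ret;
 *
 *              if (!group)
 *                      return -EINVAL;
 *
 *              ret = vfio_add_group_dev(dev, &my_ops, my_data);
 *              if (ret)
 *                      vfio_iommu_group_put(group, dev);
 *              return ret;
 *      }
 *
 *      static void my_remove(struct device *dev)
 *      {
 *              vfio_del_group_dev(dev);
 *              vfio_iommu_group_put(dev->iommu_group, dev);
 *      }
 */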

#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
        if (arg != VFIO_NOIOMMU_IOMMU)
                return ERR_PTR(-EINVAL);
        if (!capable(CAP_SYS_RAWIO))
                return ERR_PTR(-EPERM);

        return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
                               unsigned int cmd, unsigned long arg)
{
        if (cmd == VFIO_CHECK_EXTENSION)
                return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

        return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
                                     struct iommu_group *iommu_group)
{
        return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
                                      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
        .name = "vfio-noiommu",
        .owner = THIS_MODULE,
        .open = vfio_noiommu_open,
        .release = vfio_noiommu_release,
        .ioctl = vfio_noiommu_ioctl,
        .attach_group = vfio_noiommu_attach_group,
        .detach_group = vfio_noiommu_detach_group,
};
#endif


/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver, *tmp;

        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
        if (!driver)
                return -ENOMEM;

        driver->ops = ops;

        mutex_lock(&vfio.iommu_drivers_lock);

        /* Check for duplicates */
        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
                if (tmp->ops == ops) {
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return -EINVAL;
                }
        }

        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

        mutex_unlock(&vfio.iommu_drivers_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

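/*
 * Illustrative backend registration (a hedged sketch; "my_iommu_ops" is
 * hypothetical, but in-tree backends such as vfio_iommu_type1 follow this
 * pattern from their module init/exit):
 *
 *      static int __init my_init(void)
 *      {
 *              return vfio_register_iommu_driver(&my_iommu_ops);
 *      }
 *
 *      static void __exit my_exit(void)
 *      {
 *              vfio_unregister_iommu_driver(&my_iommu_ops);
 *      }
 */
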
void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
        struct vfio_iommu_driver *driver;

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
                if (driver->ops == ops) {
                        list_del(&driver->vfio_next);
                        mutex_unlock(&vfio.iommu_drivers_lock);
                        kfree(driver);
                        return;
                }
        }
        mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);

/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
        return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
        idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);

/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * they're freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
        kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
        struct vfio_container *container;
        container = container_of(kref, struct vfio_container, kref);

        kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
        kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
        mutex_unlock(&vfio.group_lock);
        /*
         * Unregister outside of lock.  A spurious callback is harmless now
         * that the group is no longer in vfio.group_list.
         */
        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
        kfree(group);
}

/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
        struct vfio_group *group, *tmp;
        struct device *dev;
        int ret, minor;

        group = kzalloc(sizeof(*group), GFP_KERNEL);
        if (!group)
                return ERR_PTR(-ENOMEM);

        kref_init(&group->kref);
        INIT_LIST_HEAD(&group->device_list);
        mutex_init(&group->device_lock);
        INIT_LIST_HEAD(&group->unbound_list);
        mutex_init(&group->unbound_lock);
        atomic_set(&group->container_users, 0);
        atomic_set(&group->opened, 0);
        init_waitqueue_head(&group->container_q);
        group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
        group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
        BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

        group->nb.notifier_call = vfio_iommu_group_notifier;

        /*
         * Blocking notifiers acquire a rwsem around registering and hold
         * it around the callback.  Therefore, we need to register outside
         * of vfio.group_lock to avoid A-B/B-A contention.  Our callback
         * won't do anything unless it can find the group in
         * vfio.group_list, so there's no harm in registering early.
         */
        ret = iommu_group_register_notifier(iommu_group, &group->nb);
        if (ret) {
                kfree(group);
                return ERR_PTR(ret);
        }

        mutex_lock(&vfio.group_lock);

        /* Did we race creating this group? */
        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
                if (tmp->iommu_group == iommu_group) {
                        vfio_group_get(tmp);
                        vfio_group_unlock_and_free(group);
                        return tmp;
                }
        }

        minor = vfio_alloc_group_minor(group);
        if (minor < 0) {
                vfio_group_unlock_and_free(group);
                return ERR_PTR(minor);
        }

        dev = device_create(vfio.class, NULL,
                            MKDEV(MAJOR(vfio.group_devt), minor),
                            group, "%s%d", group->noiommu ? "noiommu-" : "",
                            iommu_group_id(iommu_group));
        if (IS_ERR(dev)) {
                vfio_free_group_minor(minor);
                vfio_group_unlock_and_free(group);
                return ERR_CAST(dev);
        }

        group->minor = minor;
        group->dev = dev;

        list_add(&group->vfio_next, &vfio.group_list);

        mutex_unlock(&vfio.group_lock);

        return group;
}

/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
        struct vfio_group *group = container_of(kref, struct vfio_group, kref);
        struct vfio_unbound_dev *unbound, *tmp;
        struct iommu_group *iommu_group = group->iommu_group;

        WARN_ON(!list_empty(&group->device_list));
        WARN_ON(group->notifier.head);

        list_for_each_entry_safe(unbound, tmp,
                                 &group->unbound_list, unbound_next) {
                list_del(&unbound->unbound_next);
                kfree(unbound);
        }

        device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
        list_del(&group->vfio_next);
        vfio_free_group_minor(group->minor);
        vfio_group_unlock_and_free(group);
        iommu_group_put(iommu_group);
}

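/*
 * kref_put_mutex() takes vfio.group_lock only when the refcount is about
 * to reach zero, so vfio_group_release() always runs with the lock held
 * and can safely unlink the group from vfio.group_list before
 * vfio_group_unlock_and_free() drops the lock.
 */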
static void vfio_group_put(struct vfio_group *group)
{
        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

struct vfio_group_put_work {
        struct work_struct work;
        struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
        struct vfio_group_put_work *do_work;

        do_work = container_of(work, struct vfio_group_put_work, work);

        vfio_group_put(do_work->group);
        kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
        struct vfio_group_put_work *do_work;

        do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
        if (WARN_ON(!do_work))
                return;

        INIT_WORK(&do_work->work, vfio_group_put_bg);
        do_work->group = group;
        schedule_work(&do_work->work);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
        kref_get(&group->kref);
}

/*
 * Not really a try as we will sleep on the mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
        struct vfio_group *target = group;

        mutex_lock(&vfio.group_lock);
        list_for_each_entry(group, &vfio.group_list, vfio_next) {
                if (group == target) {
                        vfio_group_get(group);
                        mutex_unlock(&vfio.group_lock);
                        return group;
                }
        }
        mutex_unlock(&vfio.group_lock);

        return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
        struct vfio_group *group;

        mutex_lock(&vfio.group_lock);
        list_for_each_entry(group, &vfio.group_list, vfio_next) {
                if (group->iommu_group == iommu_group) {
                        vfio_group_get(group);
                        mutex_unlock(&vfio.group_lock);
                        return group;
                }
        }
        mutex_unlock(&vfio.group_lock);

        return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
        struct vfio_group *group;

        mutex_lock(&vfio.group_lock);
        group = idr_find(&vfio.group_idr, minor);
        if (!group) {
                mutex_unlock(&vfio.group_lock);
                return NULL;
        }
        vfio_group_get(group);
        mutex_unlock(&vfio.group_lock);

        return group;
}

static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
        struct iommu_group *iommu_group;
        struct vfio_group *group;

        iommu_group = iommu_group_get(dev);
        if (!iommu_group)
                return NULL;

        group = vfio_group_get_from_iommu(iommu_group);
        iommu_group_put(iommu_group);

        return group;
}

/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
                                             struct device *dev,
                                             const struct vfio_device_ops *ops,
                                             void *device_data)
{
        struct vfio_device *device;

        device = kzalloc(sizeof(*device), GFP_KERNEL);
        if (!device)
                return ERR_PTR(-ENOMEM);

        kref_init(&device->kref);
        device->dev = dev;
        device->group = group;
        device->ops = ops;
        device->device_data = device_data;
        dev_set_drvdata(dev, device);

        /* No need to get group_lock, caller has group reference */
        vfio_group_get(group);

        mutex_lock(&group->device_lock);
        list_add(&device->group_next, &group->device_list);
        mutex_unlock(&group->device_lock);

        return device;
}

static void vfio_device_release(struct kref *kref)
{
        struct vfio_device *device = container_of(kref,
                                                  struct vfio_device, kref);
        struct vfio_group *group = device->group;

        list_del(&device->group_next);
        mutex_unlock(&group->device_lock);

        dev_set_drvdata(device->dev, NULL);

        kfree(device);

        /* vfio_del_group_dev may be waiting for this device */
        wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
        struct vfio_group *group = device->group;
        kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
        vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
        vfio_group_get(device->group);
        kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
                                                 struct device *dev)
{
        struct vfio_device *device;

        mutex_lock(&group->device_lock);
        list_for_each_entry(device, &group->device_list, group_next) {
                if (device->dev == dev) {
                        vfio_device_get(device);
                        mutex_unlock(&group->device_lock);
                        return device;
                }
        }
        mutex_unlock(&group->device_lock);
        return NULL;
}

/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
        if (dev_is_pci(dev)) {
                struct pci_dev *pdev = to_pci_dev(dev);

                if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
                        return true;
        }

        return match_string(vfio_driver_whitelist,
                            ARRAY_SIZE(vfio_driver_whitelist),
                            drv->name) >= 0;
}

/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
        struct vfio_group *group = data;
        struct vfio_device *device;
        struct device_driver *drv = READ_ONCE(dev->driver);
        struct vfio_unbound_dev *unbound;
        int ret = -EINVAL;

        mutex_lock(&group->unbound_lock);
        list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
                if (dev == unbound->dev) {
                        ret = 0;
                        break;
                }
        }
        mutex_unlock(&group->unbound_lock);

        if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
                return 0;

        device = vfio_group_get_device(group, dev);
        if (device) {
                vfio_device_put(device);
                return 0;
        }

        return ret;
}

/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
        struct vfio_device *device;

        /* Do we already know about it?  We shouldn't */
        device = vfio_group_get_device(group, dev);
        if (WARN_ON_ONCE(device)) {
                vfio_device_put(device);
                return 0;
        }

        /* Nothing to do for idle groups */
        if (!atomic_read(&group->container_users))
                return 0;

        /* TODO Prevent device auto probing */
        dev_WARN(dev, "Device added to live group %d!\n",
                 iommu_group_id(group->iommu_group));

        return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
        /* We don't care what happens when the group isn't in use */
        if (!atomic_read(&group->container_users))
                return 0;

        return vfio_dev_viable(dev, group);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
                                     unsigned long action, void *data)
{
        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
        struct device *dev = data;
        struct vfio_unbound_dev *unbound;

        /*
         * Need to go through a group_lock lookup to get a reference or we
         * risk racing a group being removed.  Ignore spurious notifies.
         */
        group = vfio_group_try_get(group);
        if (!group)
                return NOTIFY_OK;

        switch (action) {
        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
                vfio_group_nb_add_dev(group, dev);
                break;
        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
                /*
                 * Nothing to do here.  If the device is in use, then the
                 * vfio sub-driver should block the remove callback until
                 * it is unused.  If the device is unused or attached to a
                 * stub driver, then it should be released and we don't
                 * care that it will be going away.
                 */
                break;
        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
                dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
                        iommu_group_id(group->iommu_group));
                break;
        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
                dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
                        iommu_group_id(group->iommu_group), dev->driver->name);
                BUG_ON(vfio_group_nb_verify(group, dev));
                break;
        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
                dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
                        __func__, iommu_group_id(group->iommu_group),
                        dev->driver->name);
                break;
        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
                dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
                        iommu_group_id(group->iommu_group));
                /*
                 * XXX An unbound device in a live group is ok, but we'd
                 * really like to avoid the above BUG_ON by preventing other
                 * drivers from binding to it.  Once that occurs, we have to
                 * stop the system to maintain isolation.  At a minimum, we'd
                 * want a toggle to disable driver auto probe for this device.
                 */

                mutex_lock(&group->unbound_lock);
                list_for_each_entry(unbound,
                                    &group->unbound_list, unbound_next) {
                        if (dev == unbound->dev) {
                                list_del(&unbound->unbound_next);
                                kfree(unbound);
                                break;
                        }
                }
                mutex_unlock(&group->unbound_lock);
                break;
        }

        /*
         * If we're the last reference to the group, the group will be
         * released, which includes unregistering the iommu group notifier.
         * We hold a read-lock on that notifier list, unregistering needs
         * a write-lock... deadlock.  Release our reference asynchronously
         * to avoid that situation.
         */
        vfio_group_schedule_put(group);
        return NOTIFY_OK;
}

/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
                       const struct vfio_device_ops *ops, void *device_data)
{
        struct iommu_group *iommu_group;
        struct vfio_group *group;
        struct vfio_device *device;

        iommu_group = iommu_group_get(dev);
        if (!iommu_group)
                return -EINVAL;

        group = vfio_group_get_from_iommu(iommu_group);
        if (!group) {
                group = vfio_create_group(iommu_group);
                if (IS_ERR(group)) {
                        iommu_group_put(iommu_group);
                        return PTR_ERR(group);
                }
        } else {
                /*
                 * A found vfio_group already holds a reference to the
                 * iommu_group.  A created vfio_group keeps the reference.
                 */
                iommu_group_put(iommu_group);
        }

        device = vfio_group_get_device(group, dev);
        if (device) {
                dev_WARN(dev, "Device already exists on group %d\n",
                         iommu_group_id(iommu_group));
                vfio_device_put(device);
                vfio_group_put(group);
                return -EBUSY;
        }

        device = vfio_group_create_device(group, dev, ops, device_data);
        if (IS_ERR(device)) {
                vfio_group_put(group);
                return PTR_ERR(device);
        }

        /*
         * Drop all but the vfio_device reference.  The vfio_device holds
         * a reference to the vfio_group, which holds a reference to the
         * iommu_group.
         */
        vfio_group_put(group);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);

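/*
 * Minimal vfio_device_ops sketch (hedged; the callback signatures are
 * inferred from the call sites in this file, the my_* names are
 * hypothetical, and the .name member is assumed from include/linux/vfio.h):
 *
 *      static int my_open(void *device_data) { return 0; }
 *      static void my_release(void *device_data) { }
 *      static void my_request(void *device_data, unsigned int count) { }
 *
 *      static const struct vfio_device_ops my_ops = {
 *              .name = "my-vfio-bus-driver",
 *              .open = my_open,
 *              .release = my_release,
 *              .request = my_request,
 *      };
 *
 *      err = vfio_add_group_dev(dev, &my_ops, my_data);
 */
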
/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
        struct vfio_group *group;
        struct vfio_device *device;

        group = vfio_group_get_from_dev(dev);
        if (!group)
                return NULL;

        device = vfio_group_get_device(group, dev);
        vfio_group_put(group);

        return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
                                                     char *buf)
{
        struct vfio_device *it, *device = NULL;

        mutex_lock(&group->device_lock);
        list_for_each_entry(it, &group->device_list, group_next) {
                if (!strcmp(dev_name(it->dev), buf)) {
                        device = it;
                        vfio_device_get(device);
                        break;
                }
        }
        mutex_unlock(&group->device_lock);

        return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
        return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device hold references, so
 * this blocks until the user releases the device; the bus driver's
 * request callback is used to ask for that release.
 */
void *vfio_del_group_dev(struct device *dev)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        struct vfio_device *device = dev_get_drvdata(dev);
        struct vfio_group *group = device->group;
        void *device_data = device->device_data;
        struct vfio_unbound_dev *unbound;
        unsigned int i = 0;
        bool interrupted = false;

        /*
         * The group exists so long as we have a device reference.  Get
         * a group reference and use it to scan for the device going away.
         */
        vfio_group_get(group);

        /*
         * When the device is removed from the group, the group suddenly
         * becomes non-viable; the device has a driver (until the unbind
         * completes), but it's not present in the group.  This is bad news
         * for any external users that need to re-acquire a group reference
         * in order to match and release their existing reference.  To
         * solve this, we track such devices on the unbound_list to bridge
         * the gap until they're fully unbound.
         */
        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
        if (unbound) {
                unbound->dev = dev;
                mutex_lock(&group->unbound_lock);
                list_add(&unbound->unbound_next, &group->unbound_list);
                mutex_unlock(&group->unbound_lock);
        }
        WARN_ON(!unbound);

        vfio_device_put(device);

        /*
         * If the device is still present in the group after the above
         * 'put', then it is in use and we need to request it from the
         * bus driver.  The driver may in turn need to request the
         * device from the user.  We send the request at an arbitrary
         * interval with a counter to allow the driver to take escalating
         * measures to release the device if it has the ability to do so.
         */
        add_wait_queue(&vfio.release_q, &wait);

        do {
                device = vfio_group_get_device(group, dev);
                if (!device)
                        break;

                if (device->ops->request)
                        device->ops->request(device_data, i++);

                vfio_device_put(device);

                if (interrupted) {
                        wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
                } else {
                        wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
                        if (signal_pending(current)) {
                                interrupted = true;
                                dev_warn(dev,
                                         "Device is currently in use, task \"%s\" (%d) blocked until device is released\n",
                                         current->comm, task_pid_nr(current));
                        }
                }

        } while (1);

        remove_wait_queue(&vfio.release_q, &wait);
        /*
         * In order to support multiple devices per group, devices can be
         * plucked from the group while other devices in the group are still
         * in use.  The container persists with this group and those remaining
         * devices still attached.  If the user creates an isolation violation
         * by binding this device to another driver while the group is still in
         * use, that's their fault.  However, in the case of removing the last,
         * or potentially the only, device in the group there can be no other
         * in-use devices in the group.  The user has done their due diligence
         * and we should lay no claims to those devices.  In order to do that,
         * we need to make sure the group is detached from the container.
         * Without this stall, we're potentially racing with a user process
         * that may attempt to immediately bind this device to another driver.
         */
        if (list_empty(&group->device_list))
                wait_event(group->container_q, !group->container);

        vfio_group_put(group);

        return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);

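/*
 * A hedged sketch of what a bus driver's ->request callback might do with
 * the escalating count (vfio-pci, for instance, signals an eventfd to ask
 * the user to release the device; "my_device" and "req_trigger" are
 * hypothetical):
 *
 *      static void my_request(void *device_data, unsigned int count)
 *      {
 *              struct my_device *mdev = device_data;
 *
 *              if (mdev->req_trigger)
 *                      eventfd_signal(mdev->req_trigger, 1);
 *              else if (count > 10)
 *                      dev_notice(mdev->dev, "no way to notify user\n");
 *      }
 */
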
/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
                                       unsigned long arg)
{
        struct vfio_iommu_driver *driver;
        long ret = 0;

        down_read(&container->group_lock);

        driver = container->iommu_driver;

        switch (arg) {
                /* No base extensions yet */
        default:
                /*
                 * If no driver is set, poll all registered drivers for
                 * extensions and return the first positive result.  If
                 * a driver is already set, further queries will be passed
                 * only to that driver.
                 */
                if (!driver) {
                        mutex_lock(&vfio.iommu_drivers_lock);
                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
                                            vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
                                if (!list_empty(&container->group_list) &&
                                    (container->noiommu !=
                                     (driver->ops == &vfio_noiommu_ops)))
                                        continue;
#endif

                                if (!try_module_get(driver->ops->owner))
                                        continue;

                                ret = driver->ops->ioctl(NULL,
                                                         VFIO_CHECK_EXTENSION,
                                                         arg);
                                module_put(driver->ops->owner);
                                if (ret > 0)
                                        break;
                        }
                        mutex_unlock(&vfio.iommu_drivers_lock);
                } else
                        ret = driver->ops->ioctl(container->iommu_data,
                                                 VFIO_CHECK_EXTENSION, arg);
        }

        up_read(&container->group_lock);

        return ret;
}

/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
                                          struct vfio_iommu_driver *driver,
                                          void *data)
{
        struct vfio_group *group;
        int ret = -ENODEV;

        list_for_each_entry(group, &container->group_list, container_next) {
                ret = driver->ops->attach_group(data, group->iommu_group);
                if (ret)
                        goto unwind;
        }

        return ret;

unwind:
        list_for_each_entry_continue_reverse(group, &container->group_list,
                                             container_next) {
                driver->ops->detach_group(data, group->iommu_group);
        }

        return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
                                 unsigned long arg)
{
        struct vfio_iommu_driver *driver;
        long ret = -ENODEV;

        down_write(&container->group_lock);

        /*
         * The container is designed to be an unprivileged interface while
         * the group can be assigned to specific users.  Therefore, only by
         * adding a group to a container does the user get the privilege of
         * enabling the iommu, which may allocate finite resources.  There
         * is no unset_iommu, but by removing all the groups from a container,
         * the container is deprivileged and returns to an unset state.
         */
        if (list_empty(&container->group_list) || container->iommu_driver) {
                up_write(&container->group_lock);
                return -EINVAL;
        }

        mutex_lock(&vfio.iommu_drivers_lock);
        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
                void *data;

#ifdef CONFIG_VFIO_NOIOMMU
                /*
                 * Only noiommu containers can use vfio-noiommu and noiommu
                 * containers can only use vfio-noiommu.
                 */
                if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
                        continue;
#endif

                if (!try_module_get(driver->ops->owner))
                        continue;

                /*
                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
                 * so test which iommu driver reported support for this
                 * extension and call open on them.  We also pass them the
                 * magic, allowing a single driver to support multiple
                 * interfaces if they'd like.
                 */
                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
                        module_put(driver->ops->owner);
                        continue;
                }

                data = driver->ops->open(arg);
                if (IS_ERR(data)) {
                        ret = PTR_ERR(data);
                        module_put(driver->ops->owner);
                        continue;
                }

                ret = __vfio_container_attach_groups(container, driver, data);
                if (ret) {
                        driver->ops->release(data);
                        module_put(driver->ops->owner);
                        continue;
                }

                container->iommu_driver = driver;
                container->iommu_data = data;
                break;
        }

        mutex_unlock(&vfio.iommu_drivers_lock);
        up_write(&container->group_lock);

        return ret;
}

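/*
 * Illustrative ordering from userspace (a hedged sketch; VFIO_TYPE1_IOMMU
 * is an assumption, any extension a registered backend reports works):
 *
 *      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *      if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *              ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */
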
static long vfio_fops_unl_ioctl(struct file *filep,
                                unsigned int cmd, unsigned long arg)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        void *data;
        long ret = -EINVAL;

        if (!container)
                return ret;

        switch (cmd) {
        case VFIO_GET_API_VERSION:
                ret = VFIO_API_VERSION;
                break;
        case VFIO_CHECK_EXTENSION:
                ret = vfio_ioctl_check_extension(container, arg);
                break;
        case VFIO_SET_IOMMU:
                ret = vfio_ioctl_set_iommu(container, arg);
                break;
        default:
                driver = container->iommu_driver;
                data = container->iommu_data;

                if (driver) /* passthrough all unrecognized ioctls */
                        ret = driver->ops->ioctl(data, cmd, arg);
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
                                   unsigned int cmd, unsigned long arg)
{
        arg = (unsigned long)compat_ptr(arg);
        return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif  /* CONFIG_COMPAT */

static int vfio_fops_open(struct inode *inode, struct file *filep)
{
        struct vfio_container *container;

        container = kzalloc(sizeof(*container), GFP_KERNEL);
        if (!container)
                return -ENOMEM;

        INIT_LIST_HEAD(&container->group_list);
        init_rwsem(&container->group_lock);
        kref_init(&container->kref);

        filep->private_data = container;

        return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_container *container = filep->private_data;

        filep->private_data = NULL;

        vfio_container_put(container);

        return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
                              size_t count, loff_t *ppos)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        ssize_t ret = -EINVAL;

        driver = container->iommu_driver;
        if (likely(driver && driver->ops->read))
                ret = driver->ops->read(container->iommu_data,
                                        buf, count, ppos);

        return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
                               size_t count, loff_t *ppos)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        ssize_t ret = -EINVAL;

        driver = container->iommu_driver;
        if (likely(driver && driver->ops->write))
                ret = driver->ops->write(container->iommu_data,
                                         buf, count, ppos);

        return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct vfio_container *container = filep->private_data;
        struct vfio_iommu_driver *driver;
        int ret = -EINVAL;

        driver = container->iommu_driver;
        if (likely(driver && driver->ops->mmap))
                ret = driver->ops->mmap(container->iommu_data, vma);

        return ret;
}

static const struct file_operations vfio_fops = {
        .owner          = THIS_MODULE,
        .open           = vfio_fops_open,
        .release        = vfio_fops_release,
        .read           = vfio_fops_read,
        .write          = vfio_fops_write,
        .unlocked_ioctl = vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = vfio_fops_compat_ioctl,
#endif
        .mmap           = vfio_fops_mmap,
};

/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
        struct vfio_container *container = group->container;
        struct vfio_iommu_driver *driver;

        down_write(&container->group_lock);

        driver = container->iommu_driver;
        if (driver)
                driver->ops->detach_group(container->iommu_data,
                                          group->iommu_group);

        group->container = NULL;
        wake_up(&group->container_q);
        list_del(&group->container_next);

        /* Detaching the last group deprivileges a container, remove iommu */
        if (driver && list_empty(&container->group_list)) {
                driver->ops->release(container->iommu_data);
                module_put(driver->ops->owner);
                container->iommu_driver = NULL;
                container->iommu_data = NULL;
        }

        up_write(&container->group_lock);

        vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know the group still exists; therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
        int users = atomic_cmpxchg(&group->container_users, 1, 0);

        if (!users)
                return -EINVAL;
        if (users != 1)
                return -EBUSY;

        __vfio_group_unset_container(group);

        return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
        if (0 == atomic_dec_if_positive(&group->container_users))
                __vfio_group_unset_container(group);
}

static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
        struct fd f;
        struct vfio_container *container;
        struct vfio_iommu_driver *driver;
        int ret = 0;

        if (atomic_read(&group->container_users))
                return -EINVAL;

        if (group->noiommu && !capable(CAP_SYS_RAWIO))
                return -EPERM;

        f = fdget(container_fd);
        if (!f.file)
                return -EBADF;

        /* Sanity check, is this really our fd? */
        if (f.file->f_op != &vfio_fops) {
                fdput(f);
                return -EINVAL;
        }

        container = f.file->private_data;
        WARN_ON(!container); /* fget ensures we don't race vfio_release */

        down_write(&container->group_lock);

        /* Real groups and fake groups cannot mix */
        if (!list_empty(&container->group_list) &&
            container->noiommu != group->noiommu) {
                ret = -EPERM;
                goto unlock_out;
        }

        driver = container->iommu_driver;
        if (driver) {
                ret = driver->ops->attach_group(container->iommu_data,
                                                group->iommu_group);
                if (ret)
                        goto unlock_out;
        }

        group->container = container;
        container->noiommu = group->noiommu;
        list_add(&group->container_next, &container->group_list);

        /* Get a reference on the container and mark a user within the group */
        vfio_container_get(container);
        atomic_inc(&group->container_users);

unlock_out:
        up_write(&container->group_lock);
        fdput(f);
        return ret;
}

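/*
 * Illustrative userspace call (a hedged sketch; the group number is an
 * assumption).  Note the ioctl argument is a pointer to the container fd,
 * not the fd value itself:
 *
 *      int container = open("/dev/vfio/vfio", O_RDWR);
 *      int group = open("/dev/vfio/26", O_RDWR);
 *
 *      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 */
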
static bool vfio_group_viable(struct vfio_group *group)
{
        return (iommu_group_for_each_dev(group->iommu_group,
                                         group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
        if (!atomic_inc_not_zero(&group->container_users))
                return -EINVAL;

        if (group->noiommu) {
                atomic_dec(&group->container_users);
                return -EPERM;
        }
        if (!group->container->iommu_driver || !vfio_group_viable(group)) {
                atomic_dec(&group->container_users);
                return -EINVAL;
        }

        return 0;
}

static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
        struct vfio_device *device;
        struct file *filep;
        int ret;

        if (0 == atomic_read(&group->container_users) ||
            !group->container->iommu_driver || !vfio_group_viable(group))
                return -EINVAL;

        if (group->noiommu && !capable(CAP_SYS_RAWIO))
                return -EPERM;

        device = vfio_device_get_from_name(group, buf);
        if (!device)
                return -ENODEV;

        ret = device->ops->open(device->device_data);
        if (ret) {
                vfio_device_put(device);
                return ret;
        }

        /*
         * We can't use anon_inode_getfd() because we need to modify
         * the f_mode flags directly to allow more than just ioctls
         */
        ret = get_unused_fd_flags(O_CLOEXEC);
        if (ret < 0) {
                device->ops->release(device->device_data);
                vfio_device_put(device);
                return ret;
        }

        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
                                   device, O_RDWR);
        if (IS_ERR(filep)) {
                put_unused_fd(ret);
                ret = PTR_ERR(filep);
                device->ops->release(device->device_data);
                vfio_device_put(device);
                return ret;
        }

        /*
         * TODO: add an anon_inode interface to do this.
         * Appears to be missing by lack of need rather than
         * explicitly prevented.  Now there's need.
         */
        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

        atomic_inc(&group->container_users);

        fd_install(ret, filep);

        if (group->noiommu)
                dev_warn(device->dev, "vfio-noiommu device opened by user (%s:%d)\n",
                         current->comm, task_pid_nr(current));

        return ret;
}

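/*
 * Illustrative userspace call (a hedged sketch; the device name is an
 * assumption).  The returned fd accepts read/write/mmap/ioctl, routed to
 * the bus driver's vfio_device_ops:
 *
 *      int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */
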
1492static long vfio_group_fops_unl_ioctl(struct file *filep,
1493                                      unsigned int cmd, unsigned long arg)
1494{
1495        struct vfio_group *group = filep->private_data;
1496        long ret = -ENOTTY;
1497
1498        switch (cmd) {
1499        case VFIO_GROUP_GET_STATUS:
1500        {
1501                struct vfio_group_status status;
1502                unsigned long minsz;
1503
1504                minsz = offsetofend(struct vfio_group_status, flags);
1505
1506                if (copy_from_user(&status, (void __user *)arg, minsz))
1507                        return -EFAULT;
1508
1509                if (status.argsz < minsz)
1510                        return -EINVAL;
1511
1512                status.flags = 0;
1513
1514                if (vfio_group_viable(group))
1515                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1516
1517                if (group->container)
1518                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1519
1520                if (copy_to_user((void __user *)arg, &status, minsz))
1521                        return -EFAULT;
1522
1523                ret = 0;
1524                break;
1525        }
1526        case VFIO_GROUP_SET_CONTAINER:
1527        {
1528                int fd;
1529
1530                if (get_user(fd, (int __user *)arg))
1531                        return -EFAULT;
1532
1533                if (fd < 0)
1534                        return -EINVAL;
1535
1536                ret = vfio_group_set_container(group, fd);
1537                break;
1538        }
1539        case VFIO_GROUP_UNSET_CONTAINER:
1540                ret = vfio_group_unset_container(group);
1541                break;
1542        case VFIO_GROUP_GET_DEVICE_FD:
1543        {
1544                char *buf;
1545
1546                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1547                if (IS_ERR(buf))
1548                        return PTR_ERR(buf);
1549
1550                ret = vfio_group_get_device_fd(group, buf);
1551                kfree(buf);
1552                break;
1553        }
1554        }
1555
1556        return ret;
1557}
1558
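/*
 * Illustrative sketch (not part of this file): the user-space side of the
 * ioctls handled above, following Documentation/vfio.txt.  The group
 * number and device name are example values.
 *
 *	int container, group, device;
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	group = open("/dev/vfio/26", O_RDWR);
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;	// a group device isn't bound to a vfio driver
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);  // container ioctl
 *
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */
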
1559#ifdef CONFIG_COMPAT
1560static long vfio_group_fops_compat_ioctl(struct file *filep,
1561                                         unsigned int cmd, unsigned long arg)
1562{
1563        arg = (unsigned long)compat_ptr(arg);
1564        return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1565}
1566#endif  /* CONFIG_COMPAT */
1567
1568static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1569{
1570        struct vfio_group *group;
1571        int opened;
1572
1573        group = vfio_group_get_from_minor(iminor(inode));
1574        if (!group)
1575                return -ENODEV;
1576
1577        if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1578                vfio_group_put(group);
1579                return -EPERM;
1580        }
1581
1582        /* Do we need multiple instances of the group open?  Seems not. */
1583        opened = atomic_cmpxchg(&group->opened, 0, 1);
1584        if (opened) {
1585                vfio_group_put(group);
1586                return -EBUSY;
1587        }
1588
1589        /* Is something still in use from a previous open? */
1590        if (group->container) {
1591                atomic_dec(&group->opened);
1592                vfio_group_put(group);
1593                return -EBUSY;
1594        }
1595
1596        /* Warn if the previous user didn't clean up; re-init to drop them */
1597        if (WARN_ON(group->notifier.head))
1598                BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1599
1600        filep->private_data = group;
1601
1602        return 0;
1603}
1604
1605static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1606{
1607        struct vfio_group *group = filep->private_data;
1608
1609        filep->private_data = NULL;
1610
1611        vfio_group_try_dissolve_container(group);
1612
1613        atomic_dec(&group->opened);
1614
1615        vfio_group_put(group);
1616
1617        return 0;
1618}
1619
1620static const struct file_operations vfio_group_fops = {
1621        .owner          = THIS_MODULE,
1622        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1623#ifdef CONFIG_COMPAT
1624        .compat_ioctl   = vfio_group_fops_compat_ioctl,
1625#endif
1626        .open           = vfio_group_fops_open,
1627        .release        = vfio_group_fops_release,
1628};
1629
1630/**
1631 * VFIO Device fd
1632 */
1633static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1634{
1635        struct vfio_device *device = filep->private_data;
1636
1637        device->ops->release(device->device_data);
1638
1639        vfio_group_try_dissolve_container(device->group);
1640
1641        vfio_device_put(device);
1642
1643        return 0;
1644}
1645
1646static long vfio_device_fops_unl_ioctl(struct file *filep,
1647                                       unsigned int cmd, unsigned long arg)
1648{
1649        struct vfio_device *device = filep->private_data;
1650
1651        if (unlikely(!device->ops->ioctl))
1652                return -EINVAL;
1653
1654        return device->ops->ioctl(device->device_data, cmd, arg);
1655}
1656
1657static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1658                                     size_t count, loff_t *ppos)
1659{
1660        struct vfio_device *device = filep->private_data;
1661
1662        if (unlikely(!device->ops->read))
1663                return -EINVAL;
1664
1665        return device->ops->read(device->device_data, buf, count, ppos);
1666}
1667
1668static ssize_t vfio_device_fops_write(struct file *filep,
1669                                      const char __user *buf,
1670                                      size_t count, loff_t *ppos)
1671{
1672        struct vfio_device *device = filep->private_data;
1673
1674        if (unlikely(!device->ops->write))
1675                return -EINVAL;
1676
1677        return device->ops->write(device->device_data, buf, count, ppos);
1678}
1679
1680static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1681{
1682        struct vfio_device *device = filep->private_data;
1683
1684        if (unlikely(!device->ops->mmap))
1685                return -EINVAL;
1686
1687        return device->ops->mmap(device->device_data, vma);
1688}
1689
1690#ifdef CONFIG_COMPAT
1691static long vfio_device_fops_compat_ioctl(struct file *filep,
1692                                          unsigned int cmd, unsigned long arg)
1693{
1694        arg = (unsigned long)compat_ptr(arg);
1695        return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1696}
1697#endif  /* CONFIG_COMPAT */
1698
1699static const struct file_operations vfio_device_fops = {
1700        .owner          = THIS_MODULE,
1701        .release        = vfio_device_fops_release,
1702        .read           = vfio_device_fops_read,
1703        .write          = vfio_device_fops_write,
1704        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1705#ifdef CONFIG_COMPAT
1706        .compat_ioctl   = vfio_device_fops_compat_ioctl,
1707#endif
1708        .mmap           = vfio_device_fops_mmap,
1709};
1710
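/*
 * Illustrative sketch (not part of this file): user space drives the fd
 * returned by VFIO_GROUP_GET_DEVICE_FD through the fops above.  The
 * FMODE_PREAD/FMODE_PWRITE bits set in vfio_group_get_device_fd() are
 * what make the pread() below legal; the region index assumes vfio-pci.
 *
 *	struct vfio_region_info reg = {
 *		.argsz = sizeof(reg),
 *		.index = VFIO_PCI_BAR0_REGION_INDEX,
 *	};
 *	char buf[4];
 *
 *	ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);  // -> ops->ioctl
 *	pread(device, buf, sizeof(buf), reg.offset);       // -> ops->read
 */
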
1711/**
1712 * External user API, exported as symbols used by other modules (e.g. KVM).
1713 *
1714 * The protocol is:
1715 *  1. User space performs the normal VFIO initialization:
1716 *      - opening a new container;
1717 *      - attaching group(s) to it;
1718 *      - setting an IOMMU driver for the container.
1719 * Once an IOMMU is set for a container, all groups in it are
1720 * considered ready for use by an external user.
1721 *
1722 * 2. User space passes a group fd to the external user.
1723 * The external user calls vfio_group_get_external_user()
1724 * to verify that:
1725 *      - the group is initialized;
1726 *      - an IOMMU is set for it.
1727 * If both checks pass, vfio_group_get_external_user()
1728 * increments the container user counter to prevent the VFIO
1729 * group from being disposed of before the external user exits.
1730 *
1731 * 3. The external user calls vfio_external_user_iommu_id()
1732 * to obtain the IOMMU group ID.
1733 *
1734 * 4. When the external user (e.g. KVM) is finished, it calls
1735 * vfio_group_put_external_user() to release the VFIO group.
1736 * This call decrements the container user counter.
1737 */
1738struct vfio_group *vfio_group_get_external_user(struct file *filep)
1739{
1740        struct vfio_group *group = filep->private_data;
1741        int ret;
1742
1743        if (filep->f_op != &vfio_group_fops)
1744                return ERR_PTR(-EINVAL);
1745
1746        ret = vfio_group_add_container_user(group);
1747        if (ret)
1748                return ERR_PTR(ret);
1749
1750        vfio_group_get(group);
1751
1752        return group;
1753}
1754EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1755
1756void vfio_group_put_external_user(struct vfio_group *group)
1757{
1758        vfio_group_try_dissolve_container(group);
1759        vfio_group_put(group);
1760}
1761EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1762
1763bool vfio_external_group_match_file(struct vfio_group *test_group,
1764                                    struct file *filep)
1765{
1766        struct vfio_group *group = filep->private_data;
1767
1768        return (filep->f_op == &vfio_group_fops) && (group == test_group);
1769}
1770EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1771
1772int vfio_external_user_iommu_id(struct vfio_group *group)
1773{
1774        return iommu_group_id(group->iommu_group);
1775}
1776EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1777
1778long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1779{
1780        return vfio_ioctl_check_extension(group->container, arg);
1781}
1782EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1783
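/*
 * Illustrative sketch (not part of this file) of the protocol above from
 * the external user's side, modeled on KVM's kvm-vfio device (which
 * resolves these exports via symbol_get()); group_fd comes from user space:
 *
 *	struct vfio_group *grp;
 *	struct fd f = fdget(group_fd);
 *
 *	if (!f.file)
 *		return -EBADF;
 *	grp = vfio_group_get_external_user(f.file);
 *	fdput(f);			// the group reference outlives the fd
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *
 *	id = vfio_external_user_iommu_id(grp);
 *	// ... use the group ...
 *	vfio_group_put_external_user(grp);
 */
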
1784/**
1785 * Sub-module support
1786 */
1787/*
1788 * Helper for managing an info capability chain buffer: allocates (or
1789 * reallocates) the buffer to add @size bytes and fills in the @id and
1790 * @version of the new capability, returning a pointer to it.
1791 *
1792 * NB. The chain is based at the head of the buffer, so new entries are
1793 * added to the tail; vfio_info_cap_shift() must be called to fix up the
1794 * next offsets prior to copying to the user buffer.
1795 */
1796struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1797                                               size_t size, u16 id, u16 version)
1798{
1799        void *buf;
1800        struct vfio_info_cap_header *header, *tmp;
1801
1802        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1803        if (!buf) {
1804                kfree(caps->buf);
1805                caps->size = 0;
1806                return ERR_PTR(-ENOMEM);
1807        }
1808
1809        caps->buf = buf;
1810        header = buf + caps->size;
1811
1812        /* Eventually copied to the user buffer, so zero it */
1813        memset(header, 0, size);
1814
1815        header->id = id;
1816        header->version = version;
1817
1818        /* Add to the end of the capability chain */
1819        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1820                ; /* nothing */
1821
1822        tmp->next = caps->size;
1823        caps->size += size;
1824
1825        return header;
1826}
1827EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1828
1829void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1830{
1831        struct vfio_info_cap_header *tmp;
1832        void *buf = (void *)caps->buf;
1833
1834        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1835                tmp->next += offset;
1836}
1837EXPORT_SYMBOL(vfio_info_cap_shift);
1838
1839int vfio_info_add_capability(struct vfio_info_cap *caps,
1840                             struct vfio_info_cap_header *cap, size_t size)
1841{
1842        struct vfio_info_cap_header *header;
1843
1844        header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1845        if (IS_ERR(header))
1846                return PTR_ERR(header);
1847
1848        memcpy(header + 1, cap + 1, size - sizeof(*header));
1849
1850        return 0;
1851}
1852EXPORT_SYMBOL(vfio_info_add_capability);
1853
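/*
 * Illustrative sketch (not part of this file), modeled on vfio-pci's use
 * of these helpers: build a capability chain while filling in a *_info
 * ioctl, then shift and copy it out behind the fixed struct.  MY_CAP_ID
 * and payload_size are hypothetical.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, sizeof(*header) + payload_size,
 *				   MY_CAP_ID, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	// fill the payload at (void *)(header + 1) ...
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));  // fix next offsets
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			ret = -EFAULT;
 *		kfree(caps.buf);
 *	}
 */
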
1854int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1855                                       int max_irq_type, size_t *data_size)
1856{
1857        unsigned long minsz;
1858        size_t size;
1859
1860        minsz = offsetofend(struct vfio_irq_set, count);
1861
1862        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1863            (hdr->count >= (U32_MAX - hdr->start)) ||
1864            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1865                                VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1866                return -EINVAL;
1867
1868        if (data_size)
1869                *data_size = 0;
1870
1871        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1872                return -EINVAL;
1873
1874        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1875        case VFIO_IRQ_SET_DATA_NONE:
1876                size = 0;
1877                break;
1878        case VFIO_IRQ_SET_DATA_BOOL:
1879                size = sizeof(uint8_t);
1880                break;
1881        case VFIO_IRQ_SET_DATA_EVENTFD:
1882                size = sizeof(int32_t);
1883                break;
1884        default:
1885                return -EINVAL;
1886        }
1887
1888        if (size) {
1889                if (hdr->argsz - minsz < hdr->count * size)
1890                        return -EINVAL;
1891
1892                if (!data_size)
1893                        return -EINVAL;
1894
1895                *data_size = hdr->count * size;
1896        }
1897
1898        return 0;
1899}
1900EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1901
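/*
 * Illustrative sketch (not part of this file) of a caller, modeled on
 * vfio-pci's VFIO_DEVICE_SET_IRQS handler: validate the header, then pull
 * in the variable-length data this helper sized; max_irqs stands in for
 * the per-index IRQ count.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, max_irqs,
 *						 VFIO_PCI_NUM_IRQS,
 *						 &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */
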
1902/*
1903 * Pin a set of guest PFNs and return their associated host PFNs, for the
1904 * local domain only.
1905 * @dev [in]     : device
1906 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1907 * @npage [in]   : count of elements in user_pfn array.  This count should not
1908 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1909 * @prot [in]    : protection flags
1910 * @phys_pfn[out]: array of host PFNs
1911 * Return error or number of pages pinned.
1912 */
1913int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1914                   int prot, unsigned long *phys_pfn)
1915{
1916        struct vfio_container *container;
1917        struct vfio_group *group;
1918        struct vfio_iommu_driver *driver;
1919        int ret;
1920
1921        if (!dev || !user_pfn || !phys_pfn || !npage)
1922                return -EINVAL;
1923
1924        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1925                return -E2BIG;
1926
1927        group = vfio_group_get_from_dev(dev);
1928        if (!group)
1929                return -ENODEV;
1930
1931        ret = vfio_group_add_container_user(group);
1932        if (ret)
1933                goto err_pin_pages;
1934
1935        container = group->container;
1936        driver = container->iommu_driver;
1937        if (likely(driver && driver->ops->pin_pages))
1938                ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1939                                             npage, prot, phys_pfn);
1940        else
1941                ret = -ENOTTY;
1942
1943        vfio_group_try_dissolve_container(group);
1944
1945err_pin_pages:
1946        vfio_group_put(group);
1947        return ret;
1948}
1949EXPORT_SYMBOL(vfio_pin_pages);
1950
1951/*
1952 * Unpin a set of previously pinned guest PFNs for the local domain only.
1953 * @dev [in]     : device
1954 * @user_pfn [in]: array of user/guest PFNs to be unpinned, as previously
1955 *                 passed to vfio_pin_pages().
1956 * @npage [in]   : count of elements in user_pfn array.  This count should not
1957 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1958 * Return error or number of pages unpinned.
1959 */
1960int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1961{
1962        struct vfio_container *container;
1963        struct vfio_group *group;
1964        struct vfio_iommu_driver *driver;
1965        int ret;
1966
1967        if (!dev || !user_pfn || !npage)
1968                return -EINVAL;
1969
1970        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1971                return -E2BIG;
1972
1973        group = vfio_group_get_from_dev(dev);
1974        if (!group)
1975                return -ENODEV;
1976
1977        ret = vfio_group_add_container_user(group);
1978        if (ret)
1979                goto err_unpin_pages;
1980
1981        container = group->container;
1982        driver = container->iommu_driver;
1983        if (likely(driver && driver->ops->unpin_pages))
1984                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1985                                               npage);
1986        else
1987                ret = -ENOTTY;
1988
1989        vfio_group_try_dissolve_container(group);
1990
1991err_unpin_pages:
1992        vfio_group_put(group);
1993        return ret;
1994}
1995EXPORT_SYMBOL(vfio_unpin_pages);
1996
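/*
 * Illustrative sketch (not part of this file), modeled on mdev vendor
 * drivers such as i915/gvt: pin one guest page for DMA, use it, unpin it.
 * gpa and mdev are caller-provided.
 *
 *	unsigned long gfn = gpa >> PAGE_SHIFT;
 *	unsigned long pfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	// ... access the host page, e.g. via pfn_to_page(pfn) ...
 *
 *	vfio_unpin_pages(mdev_dev(mdev), &gfn, 1);
 */
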
1997static int vfio_register_iommu_notifier(struct vfio_group *group,
1998                                        unsigned long *events,
1999                                        struct notifier_block *nb)
2000{
2001        struct vfio_container *container;
2002        struct vfio_iommu_driver *driver;
2003        int ret;
2004
2005        ret = vfio_group_add_container_user(group);
2006        if (ret)
2007                return -EINVAL;
2008
2009        container = group->container;
2010        driver = container->iommu_driver;
2011        if (likely(driver && driver->ops->register_notifier))
2012                ret = driver->ops->register_notifier(container->iommu_data,
2013                                                     events, nb);
2014        else
2015                ret = -ENOTTY;
2016
2017        vfio_group_try_dissolve_container(group);
2018
2019        return ret;
2020}
2021
2022static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2023                                          struct notifier_block *nb)
2024{
2025        struct vfio_container *container;
2026        struct vfio_iommu_driver *driver;
2027        int ret;
2028
2029        ret = vfio_group_add_container_user(group);
2030        if (ret)
2031                return -EINVAL;
2032
2033        container = group->container;
2034        driver = container->iommu_driver;
2035        if (likely(driver && driver->ops->unregister_notifier))
2036                ret = driver->ops->unregister_notifier(container->iommu_data,
2037                                                       nb);
2038        else
2039                ret = -ENOTTY;
2040
2041        vfio_group_try_dissolve_container(group);
2042
2043        return ret;
2044}
2045
2046void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2047{
2048        group->kvm = kvm;
2049        blocking_notifier_call_chain(&group->notifier,
2050                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2051}
2052EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2053
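/*
 * Illustrative sketch (not part of this file): a vendor driver can learn
 * about the kvm set above by registering a group notifier, as i915/gvt
 * does; the callback name and dev are hypothetical.
 *
 *	static int my_group_notifier(struct notifier_block *nb,
 *				     unsigned long action, void *data)
 *	{
 *		if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
 *			struct kvm *kvm = data;	// NULL when kvm detaches
 *			// stash or clear the kvm reference ...
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
 *	struct notifier_block nb = { .notifier_call = my_group_notifier };
 *
 *	vfio_register_notifier(dev, VFIO_GROUP_NOTIFY, &events, &nb);
 */
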
2054static int vfio_register_group_notifier(struct vfio_group *group,
2055                                        unsigned long *events,
2056                                        struct notifier_block *nb)
2057{
2058        int ret;
2059        bool set_kvm = false;
2060
2061        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2062                set_kvm = true;
2063
2064        /* clear known events */
2065        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2066
2067        /* refuse to continue if unsupported events remain */
2068        if (*events)
2069                return -EINVAL;
2070
2071        ret = vfio_group_add_container_user(group);
2072        if (ret)
2073                return -EINVAL;
2074
2075        ret = blocking_notifier_chain_register(&group->notifier, nb);
2076
2077        /*
2078         * A kvm may already have been attached to this vfio_group, so
2079         * replay the event once upon registration.
2080         */
2081        if (!ret && set_kvm && group->kvm)
2082                blocking_notifier_call_chain(&group->notifier,
2083                                        VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2084
2085        vfio_group_try_dissolve_container(group);
2086
2087        return ret;
2088}
2089
2090static int vfio_unregister_group_notifier(struct vfio_group *group,
2091                                         struct notifier_block *nb)
2092{
2093        int ret;
2094
2095        ret = vfio_group_add_container_user(group);
2096        if (ret)
2097                return -EINVAL;
2098
2099        ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2100
2101        vfio_group_try_dissolve_container(group);
2102
2103        return ret;
2104}
2105
2106int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2107                           unsigned long *events, struct notifier_block *nb)
2108{
2109        struct vfio_group *group;
2110        int ret;
2111
2112        if (!dev || !nb || !events || (*events == 0))
2113                return -EINVAL;
2114
2115        group = vfio_group_get_from_dev(dev);
2116        if (!group)
2117                return -ENODEV;
2118
2119        switch (type) {
2120        case VFIO_IOMMU_NOTIFY:
2121                ret = vfio_register_iommu_notifier(group, events, nb);
2122                break;
2123        case VFIO_GROUP_NOTIFY:
2124                ret = vfio_register_group_notifier(group, events, nb);
2125                break;
2126        default:
2127                ret = -EINVAL;
2128        }
2129
2130        vfio_group_put(group);
2131        return ret;
2132}
2133EXPORT_SYMBOL(vfio_register_notifier);
2134
2135int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2136                             struct notifier_block *nb)
2137{
2138        struct vfio_group *group;
2139        int ret;
2140
2141        if (!dev || !nb)
2142                return -EINVAL;
2143
2144        group = vfio_group_get_from_dev(dev);
2145        if (!group)
2146                return -ENODEV;
2147
2148        switch (type) {
2149        case VFIO_IOMMU_NOTIFY:
2150                ret = vfio_unregister_iommu_notifier(group, nb);
2151                break;
2152        case VFIO_GROUP_NOTIFY:
2153                ret = vfio_unregister_group_notifier(group, nb);
2154                break;
2155        default:
2156                ret = -EINVAL;
2157        }
2158
2159        vfio_group_put(group);
2160        return ret;
2161}
2162EXPORT_SYMBOL(vfio_unregister_notifier);
2163
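/*
 * Illustrative sketch (not part of this file) for the VFIO_IOMMU_NOTIFY
 * case, modeled on mdev drivers that must drop pinned pages when user
 * space unmaps an IOVA range; the callback name and dev are hypothetical.
 *
 *	static int my_iommu_notifier(struct notifier_block *nb,
 *				     unsigned long action, void *data)
 *	{
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *			struct vfio_iommu_type1_dma_unmap *unmap = data;
 *			// unpin anything in [iova, iova + size) ...
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *	struct notifier_block nb = { .notifier_call = my_iommu_notifier };
 *
 *	vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &nb);
 *	// ... vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &nb) on teardown
 */
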
2164/**
2165 * Module/class support
2166 */
2167static char *vfio_devnode(struct device *dev, umode_t *mode)
2168{
2169        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2170}
2171
2172static struct miscdevice vfio_dev = {
2173        .minor = VFIO_MINOR,
2174        .name = "vfio",
2175        .fops = &vfio_fops,
2176        .nodename = "vfio/vfio",
2177        .mode = S_IRUGO | S_IWUGO,
2178};
2179
2180static int __init vfio_init(void)
2181{
2182        int ret;
2183
2184        idr_init(&vfio.group_idr);
2185        mutex_init(&vfio.group_lock);
2186        mutex_init(&vfio.iommu_drivers_lock);
2187        INIT_LIST_HEAD(&vfio.group_list);
2188        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2189        init_waitqueue_head(&vfio.release_q);
2190
2191        ret = misc_register(&vfio_dev);
2192        if (ret) {
2193                pr_err("vfio: misc device register failed\n");
2194                return ret;
2195        }
2196
2197        /* /dev/vfio/$GROUP */
2198        vfio.class = class_create(THIS_MODULE, "vfio");
2199        if (IS_ERR(vfio.class)) {
2200                ret = PTR_ERR(vfio.class);
2201                goto err_class;
2202        }
2203
2204        vfio.class->devnode = vfio_devnode;
2205
2206        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2207        if (ret)
2208                goto err_alloc_chrdev;
2209
2210        cdev_init(&vfio.group_cdev, &vfio_group_fops);
2211        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2212        if (ret)
2213                goto err_cdev_add;
2214
2215        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2216
2217#ifdef CONFIG_VFIO_NOIOMMU
2218        vfio_register_iommu_driver(&vfio_noiommu_ops);
2219#endif
2220        return 0;
2221
2222err_cdev_add:
2223        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2224err_alloc_chrdev:
2225        class_destroy(vfio.class);
2226        vfio.class = NULL;
2227err_class:
2228        misc_deregister(&vfio_dev);
2229        return ret;
2230}
2231
2232static void __exit vfio_cleanup(void)
2233{
2234        WARN_ON(!list_empty(&vfio.group_list));
2235
2236#ifdef CONFIG_VFIO_NOIOMMU
2237        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2238#endif
2239        idr_destroy(&vfio.group_idr);
2240        cdev_del(&vfio.group_cdev);
2241        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2242        class_destroy(vfio.class);
2243        vfio.class = NULL;
2244        misc_deregister(&vfio_dev);
2245}
2246
2247module_init(vfio_init);
2248module_exit(vfio_cleanup);
2249
2250MODULE_VERSION(DRIVER_VERSION);
2251MODULE_LICENSE("GPL v2");
2252MODULE_AUTHOR(DRIVER_AUTHOR);
2253MODULE_DESCRIPTION(DRIVER_DESC);
2254MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2255MODULE_ALIAS("devname:vfio/vfio");
2256MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2257