linux/drivers/vfio/vfio.c
   1/*
   2 * VFIO core
   3 *
   4 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5 *     Author: Alex Williamson <alex.williamson@redhat.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio:
  12 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13 * Author: Tom Lyon, pugs@cisco.com
  14 */
  15
  16#include <linux/cdev.h>
  17#include <linux/compat.h>
  18#include <linux/device.h>
  19#include <linux/file.h>
  20#include <linux/anon_inodes.h>
  21#include <linux/fs.h>
  22#include <linux/idr.h>
  23#include <linux/iommu.h>
  24#include <linux/list.h>
  25#include <linux/miscdevice.h>
  26#include <linux/module.h>
  27#include <linux/mutex.h>
  28#include <linux/pci.h>
  29#include <linux/rwsem.h>
  30#include <linux/sched.h>
  31#include <linux/slab.h>
  32#include <linux/stat.h>
  33#include <linux/string.h>
  34#include <linux/uaccess.h>
  35#include <linux/vfio.h>
  36#include <linux/wait.h>
  37
  38#define DRIVER_VERSION  "0.3"
  39#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  40#define DRIVER_DESC     "VFIO - User Level meta-driver"
  41
  42static struct vfio {
  43        struct class                    *class;
  44        struct list_head                iommu_drivers_list;
  45        struct mutex                    iommu_drivers_lock;
  46        struct list_head                group_list;
  47        struct idr                      group_idr;
  48        struct mutex                    group_lock;
  49        struct cdev                     group_cdev;
  50        dev_t                           group_devt;
  51        wait_queue_head_t               release_q;
  52} vfio;
  53
  54struct vfio_iommu_driver {
  55        const struct vfio_iommu_driver_ops      *ops;
  56        struct list_head                        vfio_next;
  57};
  58
  59struct vfio_container {
  60        struct kref                     kref;
  61        struct list_head                group_list;
  62        struct rw_semaphore             group_lock;
  63        struct vfio_iommu_driver        *iommu_driver;
  64        void                            *iommu_data;
  65        bool                            noiommu;
  66};
  67
  68struct vfio_unbound_dev {
  69        struct device                   *dev;
  70        struct list_head                unbound_next;
  71};
  72
  73struct vfio_group {
  74        struct kref                     kref;
  75        int                             minor;
  76        atomic_t                        container_users;
  77        struct iommu_group              *iommu_group;
  78        struct vfio_container           *container;
  79        struct list_head                device_list;
  80        struct mutex                    device_lock;
  81        struct device                   *dev;
  82        struct notifier_block           nb;
  83        struct list_head                vfio_next;
  84        struct list_head                container_next;
  85        struct list_head                unbound_list;
  86        struct mutex                    unbound_lock;
  87        atomic_t                        opened;
  88        bool                            noiommu;
  89        struct kvm                      *kvm;
  90        struct blocking_notifier_head   notifier;
  91};
  92
  93struct vfio_device {
  94        struct kref                     kref;
  95        struct device                   *dev;
  96        const struct vfio_device_ops    *ops;
  97        struct vfio_group               *group;
  98        struct list_head                group_next;
  99        void                            *device_data;
 100};
 101
 102#ifdef CONFIG_VFIO_NOIOMMU
 103static bool noiommu __read_mostly;
 104module_param_named(enable_unsafe_noiommu_mode,
 105                   noiommu, bool, S_IRUGO | S_IWUSR);
 106MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 107#endif
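
/*
 * Example (editorial note, not part of the original file): no-IOMMU mode
 * is off by default and is normally flipped on at module load time or via
 * the module parameter in sysfs, e.g.:
 *
 *      modprobe vfio enable_unsafe_noiommu_mode=1
 *      echo Y > /sys/module/vfio/parameters/enable_unsafe_noiommu_mode
 */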
 108
 109/*
 110 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 111 * and remove functions, any use cases other than acquiring the first
 112 * reference for the purpose of calling vfio_add_group_dev() or removing
 113 * that symmetric reference after vfio_del_group_dev() should use the raw
 114 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 115 * removes the device from the dummy group and cannot be nested.
 116 */
 117struct iommu_group *vfio_iommu_group_get(struct device *dev)
 118{
 119        struct iommu_group *group;
 120        int __maybe_unused ret;
 121
 122        group = iommu_group_get(dev);
 123
 124#ifdef CONFIG_VFIO_NOIOMMU
 125        /*
 126         * With noiommu enabled, an IOMMU group will be created for a device
 127         * that doesn't already have one and doesn't have an iommu_ops on their
 128         * bus.  We set iommudata simply to be able to identify these groups
 129         * as special use and for reclamation later.
 130         */
 131        if (group || !noiommu || iommu_present(dev->bus))
 132                return group;
 133
 134        group = iommu_group_alloc();
 135        if (IS_ERR(group))
 136                return NULL;
 137
 138        iommu_group_set_name(group, "vfio-noiommu");
 139        iommu_group_set_iommudata(group, &noiommu, NULL);
 140        ret = iommu_group_add_device(group, dev);
 141        iommu_group_put(group);
 142        if (ret)
 143                return NULL;
 144
 145        /*
 146         * Where to taint?  At this point we've added an IOMMU group for a
 147         * device that is not backed by iommu_ops, therefore any iommu_
 148         * callback using iommu_ops can legitimately Oops.  So, while we may
 149         * be about to give a DMA capable device to a user without IOMMU
 150         * protection, which is clearly taint-worthy, let's go ahead and do
 151         * it here.
 152         */
 153        add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 154        dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 155#endif
 156
 157        return group;
 158}
 159EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 160
 161void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 162{
 163#ifdef CONFIG_VFIO_NOIOMMU
 164        if (iommu_group_get_iommudata(group) == &noiommu)
 165                iommu_group_remove_device(dev);
 166#endif
 167
 168        iommu_group_put(group);
 169}
 170EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
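
/*
 * Example (editorial sketch, not part of the original file): a VFIO bus
 * driver takes its IOMMU group reference with vfio_iommu_group_get() in
 * probe and drops it symmetrically in remove, mirroring what vfio-pci
 * does.  The my_* names are hypothetical; only the vfio_* calls are real.
 *
 *      static int my_vfio_probe(struct my_dev *mdev)
 *      {
 *              struct iommu_group *group;
 *              int ret;
 *
 *              group = vfio_iommu_group_get(&mdev->dev);
 *              if (!group)
 *                      return -EINVAL;
 *
 *              ret = vfio_add_group_dev(&mdev->dev, &my_vfio_ops, mdev);
 *              if (ret)
 *                      vfio_iommu_group_put(group, &mdev->dev);
 *              return ret;
 *      }
 *
 *      static void my_vfio_remove(struct my_dev *mdev)
 *      {
 *              vfio_del_group_dev(&mdev->dev);
 *              vfio_iommu_group_put(mdev->dev.iommu_group, &mdev->dev);
 *      }
 */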
 171
 172#ifdef CONFIG_VFIO_NOIOMMU
 173static void *vfio_noiommu_open(unsigned long arg)
 174{
 175        if (arg != VFIO_NOIOMMU_IOMMU)
 176                return ERR_PTR(-EINVAL);
 177        if (!capable(CAP_SYS_RAWIO))
 178                return ERR_PTR(-EPERM);
 179
 180        return NULL;
 181}
 182
 183static void vfio_noiommu_release(void *iommu_data)
 184{
 185}
 186
 187static long vfio_noiommu_ioctl(void *iommu_data,
 188                               unsigned int cmd, unsigned long arg)
 189{
 190        if (cmd == VFIO_CHECK_EXTENSION)
 191                return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 192
 193        return -ENOTTY;
 194}
 195
 196static int vfio_noiommu_attach_group(void *iommu_data,
 197                                     struct iommu_group *iommu_group)
 198{
 199        return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 200}
 201
 202static void vfio_noiommu_detach_group(void *iommu_data,
 203                                      struct iommu_group *iommu_group)
 204{
 205}
 206
 207static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 208        .name = "vfio-noiommu",
 209        .owner = THIS_MODULE,
 210        .open = vfio_noiommu_open,
 211        .release = vfio_noiommu_release,
 212        .ioctl = vfio_noiommu_ioctl,
 213        .attach_group = vfio_noiommu_attach_group,
 214        .detach_group = vfio_noiommu_detach_group,
 215};
 216#endif
 217
 218
 219/**
 220 * IOMMU driver registration
 221 */
 222int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 223{
 224        struct vfio_iommu_driver *driver, *tmp;
 225
 226        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 227        if (!driver)
 228                return -ENOMEM;
 229
 230        driver->ops = ops;
 231
 232        mutex_lock(&vfio.iommu_drivers_lock);
 233
 234        /* Check for duplicates */
 235        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 236                if (tmp->ops == ops) {
 237                        mutex_unlock(&vfio.iommu_drivers_lock);
 238                        kfree(driver);
 239                        return -EINVAL;
 240                }
 241        }
 242
 243        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 244
 245        mutex_unlock(&vfio.iommu_drivers_lock);
 246
 247        return 0;
 248}
 249EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 250
 251void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 252{
 253        struct vfio_iommu_driver *driver;
 254
 255        mutex_lock(&vfio.iommu_drivers_lock);
 256        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 257                if (driver->ops == ops) {
 258                        list_del(&driver->vfio_next);
 259                        mutex_unlock(&vfio.iommu_drivers_lock);
 260                        kfree(driver);
 261                        return;
 262                }
 263        }
 264        mutex_unlock(&vfio.iommu_drivers_lock);
 265}
 266EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
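
/*
 * Example (editorial sketch, not part of the original file): an IOMMU
 * backend registers its vfio_iommu_driver_ops at module load and
 * unregisters it on unload, as vfio_iommu_type1 does.  "my_iommu_ops" is
 * hypothetical.
 *
 *      static int __init my_iommu_init(void)
 *      {
 *              return vfio_register_iommu_driver(&my_iommu_ops);
 *      }
 *      module_init(my_iommu_init);
 *
 *      static void __exit my_iommu_exit(void)
 *      {
 *              vfio_unregister_iommu_driver(&my_iommu_ops);
 *      }
 *      module_exit(my_iommu_exit);
 */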
 267
 268/**
 269 * Group minor allocation/free - both called with vfio.group_lock held
 270 */
 271static int vfio_alloc_group_minor(struct vfio_group *group)
 272{
 273        return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 274}
 275
 276static void vfio_free_group_minor(int minor)
 277{
 278        idr_remove(&vfio.group_idr, minor);
 279}
 280
 281static int vfio_iommu_group_notifier(struct notifier_block *nb,
 282                                     unsigned long action, void *data);
 283static void vfio_group_get(struct vfio_group *group);
 284
 285/**
 286 * Container objects - containers are created when /dev/vfio/vfio is
 287 * opened, but their lifecycle extends until the last user is done, so
 288 * it's freed via kref.  Must support container/group/device being
 289 * closed in any order.
 290 */
 291static void vfio_container_get(struct vfio_container *container)
 292{
 293        kref_get(&container->kref);
 294}
 295
 296static void vfio_container_release(struct kref *kref)
 297{
 298        struct vfio_container *container;
 299        container = container_of(kref, struct vfio_container, kref);
 300
 301        kfree(container);
 302}
 303
 304static void vfio_container_put(struct vfio_container *container)
 305{
 306        kref_put(&container->kref, vfio_container_release);
 307}
 308
 309static void vfio_group_unlock_and_free(struct vfio_group *group)
 310{
 311        mutex_unlock(&vfio.group_lock);
 312        /*
 313         * Unregister outside of lock.  A spurious callback is harmless now
 314         * that the group is no longer in vfio.group_list.
 315         */
 316        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 317        kfree(group);
 318}
 319
 320/**
 321 * Group objects - create, release, get, put, search
 322 */
 323static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 324{
 325        struct vfio_group *group, *tmp;
 326        struct device *dev;
 327        int ret, minor;
 328
 329        group = kzalloc(sizeof(*group), GFP_KERNEL);
 330        if (!group)
 331                return ERR_PTR(-ENOMEM);
 332
 333        kref_init(&group->kref);
 334        INIT_LIST_HEAD(&group->device_list);
 335        mutex_init(&group->device_lock);
 336        INIT_LIST_HEAD(&group->unbound_list);
 337        mutex_init(&group->unbound_lock);
 338        atomic_set(&group->container_users, 0);
 339        atomic_set(&group->opened, 0);
 340        group->iommu_group = iommu_group;
 341#ifdef CONFIG_VFIO_NOIOMMU
 342        group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 343#endif
 344        BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 345
 346        group->nb.notifier_call = vfio_iommu_group_notifier;
 347
 348        /*
 349         * blocking notifiers acquire a rwsem around registering and hold
 350         * it around callback.  Therefore, need to register outside of
 351         * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 352         * do anything unless it can find the group in vfio.group_list, so
 353         * no harm in registering early.
 354         */
 355        ret = iommu_group_register_notifier(iommu_group, &group->nb);
 356        if (ret) {
 357                kfree(group);
 358                return ERR_PTR(ret);
 359        }
 360
 361        mutex_lock(&vfio.group_lock);
 362
 363        /* Did we race creating this group? */
 364        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 365                if (tmp->iommu_group == iommu_group) {
 366                        vfio_group_get(tmp);
 367                        vfio_group_unlock_and_free(group);
 368                        return tmp;
 369                }
 370        }
 371
 372        minor = vfio_alloc_group_minor(group);
 373        if (minor < 0) {
 374                vfio_group_unlock_and_free(group);
 375                return ERR_PTR(minor);
 376        }
 377
 378        dev = device_create(vfio.class, NULL,
 379                            MKDEV(MAJOR(vfio.group_devt), minor),
 380                            group, "%s%d", group->noiommu ? "noiommu-" : "",
 381                            iommu_group_id(iommu_group));
 382        if (IS_ERR(dev)) {
 383                vfio_free_group_minor(minor);
 384                vfio_group_unlock_and_free(group);
 385                return ERR_CAST(dev);
 386        }
 387
 388        group->minor = minor;
 389        group->dev = dev;
 390
 391        list_add(&group->vfio_next, &vfio.group_list);
 392
 393        mutex_unlock(&vfio.group_lock);
 394
 395        return group;
 396}
 397
 398/* called with vfio.group_lock held */
 399static void vfio_group_release(struct kref *kref)
 400{
 401        struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 402        struct vfio_unbound_dev *unbound, *tmp;
 403        struct iommu_group *iommu_group = group->iommu_group;
 404
 405        WARN_ON(!list_empty(&group->device_list));
 406        WARN_ON(group->notifier.head);
 407
 408        list_for_each_entry_safe(unbound, tmp,
 409                                 &group->unbound_list, unbound_next) {
 410                list_del(&unbound->unbound_next);
 411                kfree(unbound);
 412        }
 413
 414        device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 415        list_del(&group->vfio_next);
 416        vfio_free_group_minor(group->minor);
 417        vfio_group_unlock_and_free(group);
 418        iommu_group_put(iommu_group);
 419}
 420
 421static void vfio_group_put(struct vfio_group *group)
 422{
 423        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 424}
 425
 426struct vfio_group_put_work {
 427        struct work_struct work;
 428        struct vfio_group *group;
 429};
 430
 431static void vfio_group_put_bg(struct work_struct *work)
 432{
 433        struct vfio_group_put_work *do_work;
 434
 435        do_work = container_of(work, struct vfio_group_put_work, work);
 436
 437        vfio_group_put(do_work->group);
 438        kfree(do_work);
 439}
 440
 441static void vfio_group_schedule_put(struct vfio_group *group)
 442{
 443        struct vfio_group_put_work *do_work;
 444
 445        do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 446        if (WARN_ON(!do_work))
 447                return;
 448
 449        INIT_WORK(&do_work->work, vfio_group_put_bg);
 450        do_work->group = group;
 451        schedule_work(&do_work->work);
 452}
 453
 454/* Assume group_lock or group reference is held */
 455static void vfio_group_get(struct vfio_group *group)
 456{
 457        kref_get(&group->kref);
 458}
 459
 460/*
 461 * Not really a try as we will sleep for mutex, but we need to make
 462 * sure the group pointer is valid under lock and get a reference.
 463 */
 464static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 465{
 466        struct vfio_group *target = group;
 467
 468        mutex_lock(&vfio.group_lock);
 469        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 470                if (group == target) {
 471                        vfio_group_get(group);
 472                        mutex_unlock(&vfio.group_lock);
 473                        return group;
 474                }
 475        }
 476        mutex_unlock(&vfio.group_lock);
 477
 478        return NULL;
 479}
 480
 481static
 482struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 483{
 484        struct vfio_group *group;
 485
 486        mutex_lock(&vfio.group_lock);
 487        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 488                if (group->iommu_group == iommu_group) {
 489                        vfio_group_get(group);
 490                        mutex_unlock(&vfio.group_lock);
 491                        return group;
 492                }
 493        }
 494        mutex_unlock(&vfio.group_lock);
 495
 496        return NULL;
 497}
 498
 499static struct vfio_group *vfio_group_get_from_minor(int minor)
 500{
 501        struct vfio_group *group;
 502
 503        mutex_lock(&vfio.group_lock);
 504        group = idr_find(&vfio.group_idr, minor);
 505        if (!group) {
 506                mutex_unlock(&vfio.group_lock);
 507                return NULL;
 508        }
 509        vfio_group_get(group);
 510        mutex_unlock(&vfio.group_lock);
 511
 512        return group;
 513}
 514
 515static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 516{
 517        struct iommu_group *iommu_group;
 518        struct vfio_group *group;
 519
 520        iommu_group = iommu_group_get(dev);
 521        if (!iommu_group)
 522                return NULL;
 523
 524        group = vfio_group_get_from_iommu(iommu_group);
 525        iommu_group_put(iommu_group);
 526
 527        return group;
 528}
 529
 530/**
 531 * Device objects - create, release, get, put, search
 532 */
 533static
 534struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 535                                             struct device *dev,
 536                                             const struct vfio_device_ops *ops,
 537                                             void *device_data)
 538{
 539        struct vfio_device *device;
 540
 541        device = kzalloc(sizeof(*device), GFP_KERNEL);
 542        if (!device)
 543                return ERR_PTR(-ENOMEM);
 544
 545        kref_init(&device->kref);
 546        device->dev = dev;
 547        device->group = group;
 548        device->ops = ops;
 549        device->device_data = device_data;
 550        dev_set_drvdata(dev, device);
 551
 552        /* No need to get group_lock, caller has group reference */
 553        vfio_group_get(group);
 554
 555        mutex_lock(&group->device_lock);
 556        list_add(&device->group_next, &group->device_list);
 557        mutex_unlock(&group->device_lock);
 558
 559        return device;
 560}
 561
 562static void vfio_device_release(struct kref *kref)
 563{
 564        struct vfio_device *device = container_of(kref,
 565                                                  struct vfio_device, kref);
 566        struct vfio_group *group = device->group;
 567
 568        list_del(&device->group_next);
 569        mutex_unlock(&group->device_lock);
 570
 571        dev_set_drvdata(device->dev, NULL);
 572
 573        kfree(device);
 574
 575        /* vfio_del_group_dev may be waiting for this device */
 576        wake_up(&vfio.release_q);
 577}
 578
 579/* Device reference always implies a group reference */
 580void vfio_device_put(struct vfio_device *device)
 581{
 582        struct vfio_group *group = device->group;
 583        kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 584        vfio_group_put(group);
 585}
 586EXPORT_SYMBOL_GPL(vfio_device_put);
 587
 588static void vfio_device_get(struct vfio_device *device)
 589{
 590        vfio_group_get(device->group);
 591        kref_get(&device->kref);
 592}
 593
 594static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 595                                                 struct device *dev)
 596{
 597        struct vfio_device *device;
 598
 599        mutex_lock(&group->device_lock);
 600        list_for_each_entry(device, &group->device_list, group_next) {
 601                if (device->dev == dev) {
 602                        vfio_device_get(device);
 603                        mutex_unlock(&group->device_lock);
 604                        return device;
 605                }
 606        }
 607        mutex_unlock(&group->device_lock);
 608        return NULL;
 609}
 610
 611/*
 612 * Some drivers, like pci-stub, are only used to prevent other drivers from
 613 * claiming a device and are therefore perfectly legitimate for a user owned
 614 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 615 * of the device, but it does prevent the user from having direct access to
 616 * the device, which is useful in some circumstances.
 617 *
 618 * We also assume that we can include PCI interconnect devices, ie. bridges.
 619 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 620 * then all of the downstream devices will be part of the same IOMMU group as
 621 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 622 * breaks anything, it only does so for user owned devices downstream.  Note
 623 * that error notification via MSI can be affected for platforms that handle
 624 * MSI within the same IOVA space as DMA.
 625 */
 626static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 627
 628static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 629{
 630        int i;
 631
 632        if (dev_is_pci(dev)) {
 633                struct pci_dev *pdev = to_pci_dev(dev);
 634
 635                if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 636                        return true;
 637        }
 638
 639        for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
 640                if (!strcmp(drv->name, vfio_driver_whitelist[i]))
 641                        return true;
 642        }
 643
 644        return false;
 645}
 646
 647/*
 648 * A vfio group is viable for use by userspace if all devices are in
 649 * one of the following states:
 650 *  - driver-less
 651 *  - bound to a vfio driver
 652 *  - bound to a whitelisted driver
 653 *  - a PCI interconnect device
 654 *
 655 * We use two methods to determine whether a device is bound to a vfio
 656 * driver.  The first is to test whether the device exists in the vfio
 657 * group.  The second is to test if the device exists on the group
 658 * unbound_list, indicating it's in the middle of transitioning from
 659 * a vfio driver to driver-less.
 660 */
 661static int vfio_dev_viable(struct device *dev, void *data)
 662{
 663        struct vfio_group *group = data;
 664        struct vfio_device *device;
 665        struct device_driver *drv = ACCESS_ONCE(dev->driver);
 666        struct vfio_unbound_dev *unbound;
 667        int ret = -EINVAL;
 668
 669        mutex_lock(&group->unbound_lock);
 670        list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 671                if (dev == unbound->dev) {
 672                        ret = 0;
 673                        break;
 674                }
 675        }
 676        mutex_unlock(&group->unbound_lock);
 677
 678        if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 679                return 0;
 680
 681        device = vfio_group_get_device(group, dev);
 682        if (device) {
 683                vfio_device_put(device);
 684                return 0;
 685        }
 686
 687        return ret;
 688}
 689
 690/**
 691 * Async device support
 692 */
 693static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 694{
 695        struct vfio_device *device;
 696
 697        /* Do we already know about it?  We shouldn't */
 698        device = vfio_group_get_device(group, dev);
 699        if (WARN_ON_ONCE(device)) {
 700                vfio_device_put(device);
 701                return 0;
 702        }
 703
 704        /* Nothing to do for idle groups */
 705        if (!atomic_read(&group->container_users))
 706                return 0;
 707
 708        /* TODO Prevent device auto probing */
 709        WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
 710             iommu_group_id(group->iommu_group));
 711
 712        return 0;
 713}
 714
 715static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 716{
 717        /* We don't care what happens when the group isn't in use */
 718        if (!atomic_read(&group->container_users))
 719                return 0;
 720
 721        return vfio_dev_viable(dev, group);
 722}
 723
 724static int vfio_iommu_group_notifier(struct notifier_block *nb,
 725                                     unsigned long action, void *data)
 726{
 727        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 728        struct device *dev = data;
 729        struct vfio_unbound_dev *unbound;
 730
 731        /*
 732         * Need to go through a group_lock lookup to get a reference or we
 733         * risk racing a group being removed.  Ignore spurious notifies.
 734         */
 735        group = vfio_group_try_get(group);
 736        if (!group)
 737                return NOTIFY_OK;
 738
 739        switch (action) {
 740        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 741                vfio_group_nb_add_dev(group, dev);
 742                break;
 743        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 744                /*
 745                 * Nothing to do here.  If the device is in use, then the
 746                 * vfio sub-driver should block the remove callback until
 747                 * it is unused.  If the device is unused or attached to a
 748                 * stub driver, then it should be released and we don't
 749                 * care that it will be going away.
 750                 */
 751                break;
 752        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 753                pr_debug("%s: Device %s, group %d binding to driver\n",
 754                         __func__, dev_name(dev),
 755                         iommu_group_id(group->iommu_group));
 756                break;
 757        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 758                pr_debug("%s: Device %s, group %d bound to driver %s\n",
 759                         __func__, dev_name(dev),
 760                         iommu_group_id(group->iommu_group), dev->driver->name);
 761                BUG_ON(vfio_group_nb_verify(group, dev));
 762                break;
 763        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 764                pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
 765                         __func__, dev_name(dev),
 766                         iommu_group_id(group->iommu_group), dev->driver->name);
 767                break;
 768        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 769                pr_debug("%s: Device %s, group %d unbound from driver\n",
 770                         __func__, dev_name(dev),
 771                         iommu_group_id(group->iommu_group));
 772                /*
 773                 * XXX An unbound device in a live group is ok, but we'd
 774                 * really like to avoid the above BUG_ON by preventing other
 775                 * drivers from binding to it.  Once that occurs, we have to
 776                 * stop the system to maintain isolation.  At a minimum, we'd
 777                 * want a toggle to disable driver auto probe for this device.
 778                 */
 779
 780                mutex_lock(&group->unbound_lock);
 781                list_for_each_entry(unbound,
 782                                    &group->unbound_list, unbound_next) {
 783                        if (dev == unbound->dev) {
 784                                list_del(&unbound->unbound_next);
 785                                kfree(unbound);
 786                                break;
 787                        }
 788                }
 789                mutex_unlock(&group->unbound_lock);
 790                break;
 791        }
 792
 793        /*
 794         * If we're the last reference to the group, the group will be
 795         * released, which includes unregistering the iommu group notifier.
 796         * We hold a read-lock on that notifier list, unregistering needs
 797         * a write-lock... deadlock.  Release our reference asynchronously
 798         * to avoid that situation.
 799         */
 800        vfio_group_schedule_put(group);
 801        return NOTIFY_OK;
 802}
 803
 804/**
 805 * VFIO driver API
 806 */
 807int vfio_add_group_dev(struct device *dev,
 808                       const struct vfio_device_ops *ops, void *device_data)
 809{
 810        struct iommu_group *iommu_group;
 811        struct vfio_group *group;
 812        struct vfio_device *device;
 813
 814        iommu_group = iommu_group_get(dev);
 815        if (!iommu_group)
 816                return -EINVAL;
 817
 818        group = vfio_group_get_from_iommu(iommu_group);
 819        if (!group) {
 820                group = vfio_create_group(iommu_group);
 821                if (IS_ERR(group)) {
 822                        iommu_group_put(iommu_group);
 823                        return PTR_ERR(group);
 824                }
 825        } else {
 826                /*
 827                 * A found vfio_group already holds a reference to the
 828                 * iommu_group.  A created vfio_group keeps the reference.
 829                 */
 830                iommu_group_put(iommu_group);
 831        }
 832
 833        device = vfio_group_get_device(group, dev);
 834        if (device) {
 835                WARN(1, "Device %s already exists on group %d\n",
 836                     dev_name(dev), iommu_group_id(iommu_group));
 837                vfio_device_put(device);
 838                vfio_group_put(group);
 839                return -EBUSY;
 840        }
 841
 842        device = vfio_group_create_device(group, dev, ops, device_data);
 843        if (IS_ERR(device)) {
 844                vfio_group_put(group);
 845                return PTR_ERR(device);
 846        }
 847
 848        /*
 849         * Drop all but the vfio_device reference.  The vfio_device holds
 850         * a reference to the vfio_group, which holds a reference to the
 851         * iommu_group.
 852         */
 853        vfio_group_put(group);
 854
 855        return 0;
 856}
 857EXPORT_SYMBOL_GPL(vfio_add_group_dev);
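
/*
 * Example (editorial sketch, not part of the original file): the ops
 * passed to vfio_add_group_dev() supply the per-device callbacks that the
 * core forwards from the device file descriptor.  The my_vfio_* callbacks
 * are hypothetical; the field names are those of struct vfio_device_ops
 * in <linux/vfio.h>.
 *
 *      static const struct vfio_device_ops my_vfio_ops = {
 *              .name           = "vfio-mydev",
 *              .open           = my_vfio_open,
 *              .release        = my_vfio_release,
 *              .ioctl          = my_vfio_ioctl,
 *              .read           = my_vfio_read,
 *              .write          = my_vfio_write,
 *              .mmap           = my_vfio_mmap,
 *              .request        = my_vfio_request,
 *      };
 */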
 858
 859/**
 860 * Get a reference to the vfio_device for a device.  Even if the
 861 * caller thinks they own the device, they could be racing with a
 862 * release call path, so we can't trust drvdata for the shortcut.
 863 * Go the long way around, from the iommu_group to the vfio_group
 864 * to the vfio_device.
 865 */
 866struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 867{
 868        struct vfio_group *group;
 869        struct vfio_device *device;
 870
 871        group = vfio_group_get_from_dev(dev);
 872        if (!group)
 873                return NULL;
 874
 875        device = vfio_group_get_device(group, dev);
 876        vfio_group_put(group);
 877
 878        return device;
 879}
 880EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 881
 882static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 883                                                     char *buf)
 884{
 885        struct vfio_device *it, *device = NULL;
 886
 887        mutex_lock(&group->device_lock);
 888        list_for_each_entry(it, &group->device_list, group_next) {
 889                if (!strcmp(dev_name(it->dev), buf)) {
 890                        device = it;
 891                        vfio_device_get(device);
 892                        break;
 893                }
 894        }
 895        mutex_unlock(&group->device_lock);
 896
 897        return device;
 898}
 899
 900/*
 901 * Caller must hold a reference to the vfio_device
 902 */
 903void *vfio_device_data(struct vfio_device *device)
 904{
 905        return device->device_data;
 906}
 907EXPORT_SYMBOL_GPL(vfio_device_data);
 908
 909/* Given a referenced group, check if it contains the device */
 910static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
 911{
 912        struct vfio_device *device;
 913
 914        device = vfio_group_get_device(group, dev);
 915        if (!device)
 916                return false;
 917
 918        vfio_device_put(device);
 919        return true;
 920}
 921
 922/*
 923 * Decrement the device reference count and wait for the device to be
  924 * removed.  Open file descriptors for the device keep it in use. */
 925void *vfio_del_group_dev(struct device *dev)
 926{
 927        struct vfio_device *device = dev_get_drvdata(dev);
 928        struct vfio_group *group = device->group;
 929        void *device_data = device->device_data;
 930        struct vfio_unbound_dev *unbound;
 931        unsigned int i = 0;
 932        long ret;
 933        bool interrupted = false;
 934
 935        /*
 936         * The group exists so long as we have a device reference.  Get
 937         * a group reference and use it to scan for the device going away.
 938         */
 939        vfio_group_get(group);
 940
 941        /*
 942         * When the device is removed from the group, the group suddenly
 943         * becomes non-viable; the device has a driver (until the unbind
 944         * completes), but it's not present in the group.  This is bad news
 945         * for any external users that need to re-acquire a group reference
 946         * in order to match and release their existing reference.  To
 947         * solve this, we track such devices on the unbound_list to bridge
 948         * the gap until they're fully unbound.
 949         */
 950        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 951        if (unbound) {
 952                unbound->dev = dev;
 953                mutex_lock(&group->unbound_lock);
 954                list_add(&unbound->unbound_next, &group->unbound_list);
 955                mutex_unlock(&group->unbound_lock);
 956        }
 957        WARN_ON(!unbound);
 958
 959        vfio_device_put(device);
 960
 961        /*
 962         * If the device is still present in the group after the above
 963         * 'put', then it is in use and we need to request it from the
 964         * bus driver.  The driver may in turn need to request the
 965         * device from the user.  We send the request on an arbitrary
 966         * interval with counter to allow the driver to take escalating
 967         * measures to release the device if it has the ability to do so.
 968         */
 969        do {
 970                device = vfio_group_get_device(group, dev);
 971                if (!device)
 972                        break;
 973
 974                if (device->ops->request)
 975                        device->ops->request(device_data, i++);
 976
 977                vfio_device_put(device);
 978
 979                if (interrupted) {
 980                        ret = wait_event_timeout(vfio.release_q,
 981                                        !vfio_dev_present(group, dev), HZ * 10);
 982                } else {
 983                        ret = wait_event_interruptible_timeout(vfio.release_q,
 984                                        !vfio_dev_present(group, dev), HZ * 10);
 985                        if (ret == -ERESTARTSYS) {
 986                                interrupted = true;
 987                                dev_warn(dev,
 988                                         "Device is currently in use, task"
 989                                         " \"%s\" (%d) "
 990                                         "blocked until device is released",
 991                                         current->comm, task_pid_nr(current));
 992                        }
 993                }
 994        } while (ret <= 0);
 995
 996        vfio_group_put(group);
 997
 998        return device_data;
 999}
1000EXPORT_SYMBOL_GPL(vfio_del_group_dev);
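
/*
 * Example (editorial sketch, not part of the original file): the
 * ops->request callback used above lets a bus driver ask the user to give
 * the device back.  vfio-pci does this by signalling an eventfd that
 * userspace registered; a driver with no such channel can at least log
 * the escalating request count.  The my_* names are hypothetical.
 *
 *      static void my_vfio_request(void *device_data, unsigned int count)
 *      {
 *              struct my_dev *mdev = device_data;
 *
 *              if (mdev->req_trigger)
 *                      eventfd_signal(mdev->req_trigger, 1);
 *              else if (count == 0)
 *                      dev_notice(&mdev->dev,
 *                                 "please release the device\n");
 *      }
 */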
1001
1002/**
1003 * VFIO base fd, /dev/vfio/vfio
1004 */
1005static long vfio_ioctl_check_extension(struct vfio_container *container,
1006                                       unsigned long arg)
1007{
1008        struct vfio_iommu_driver *driver;
1009        long ret = 0;
1010
1011        down_read(&container->group_lock);
1012
1013        driver = container->iommu_driver;
1014
1015        switch (arg) {
1016                /* No base extensions yet */
1017        default:
1018                /*
1019                 * If no driver is set, poll all registered drivers for
1020                 * extensions and return the first positive result.  If
1021                 * a driver is already set, further queries will be passed
1022                 * only to that driver.
1023                 */
1024                if (!driver) {
1025                        mutex_lock(&vfio.iommu_drivers_lock);
1026                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
1027                                            vfio_next) {
1028
1029#ifdef CONFIG_VFIO_NOIOMMU
1030                                if (!list_empty(&container->group_list) &&
1031                                    (container->noiommu !=
1032                                     (driver->ops == &vfio_noiommu_ops)))
1033                                        continue;
1034#endif
1035
1036                                if (!try_module_get(driver->ops->owner))
1037                                        continue;
1038
1039                                ret = driver->ops->ioctl(NULL,
1040                                                         VFIO_CHECK_EXTENSION,
1041                                                         arg);
1042                                module_put(driver->ops->owner);
1043                                if (ret > 0)
1044                                        break;
1045                        }
1046                        mutex_unlock(&vfio.iommu_drivers_lock);
1047                } else
1048                        ret = driver->ops->ioctl(container->iommu_data,
1049                                                 VFIO_CHECK_EXTENSION, arg);
1050        }
1051
1052        up_read(&container->group_lock);
1053
1054        return ret;
1055}
1056
1057/* hold write lock on container->group_lock */
1058static int __vfio_container_attach_groups(struct vfio_container *container,
1059                                          struct vfio_iommu_driver *driver,
1060                                          void *data)
1061{
1062        struct vfio_group *group;
1063        int ret = -ENODEV;
1064
1065        list_for_each_entry(group, &container->group_list, container_next) {
1066                ret = driver->ops->attach_group(data, group->iommu_group);
1067                if (ret)
1068                        goto unwind;
1069        }
1070
1071        return ret;
1072
1073unwind:
1074        list_for_each_entry_continue_reverse(group, &container->group_list,
1075                                             container_next) {
1076                driver->ops->detach_group(data, group->iommu_group);
1077        }
1078
1079        return ret;
1080}
1081
1082static long vfio_ioctl_set_iommu(struct vfio_container *container,
1083                                 unsigned long arg)
1084{
1085        struct vfio_iommu_driver *driver;
1086        long ret = -ENODEV;
1087
1088        down_write(&container->group_lock);
1089
1090        /*
1091         * The container is designed to be an unprivileged interface while
1092         * the group can be assigned to specific users.  Therefore, only by
1093         * adding a group to a container does the user get the privilege of
1094         * enabling the iommu, which may allocate finite resources.  There
1095         * is no unset_iommu, but by removing all the groups from a container,
1096         * the container is deprivileged and returns to an unset state.
1097         */
1098        if (list_empty(&container->group_list) || container->iommu_driver) {
1099                up_write(&container->group_lock);
1100                return -EINVAL;
1101        }
1102
1103        mutex_lock(&vfio.iommu_drivers_lock);
1104        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1105                void *data;
1106
1107#ifdef CONFIG_VFIO_NOIOMMU
1108                /*
1109                 * Only noiommu containers can use vfio-noiommu and noiommu
1110                 * containers can only use vfio-noiommu.
1111                 */
1112                if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1113                        continue;
1114#endif
1115
1116                if (!try_module_get(driver->ops->owner))
1117                        continue;
1118
1119                /*
1120                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1121                 * so test which iommu driver reported support for this
1122                 * extension and call open on them.  We also pass them the
1123                 * magic, allowing a single driver to support multiple
1124                 * interfaces if they'd like.
1125                 */
1126                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1127                        module_put(driver->ops->owner);
1128                        continue;
1129                }
1130
1131                data = driver->ops->open(arg);
1132                if (IS_ERR(data)) {
1133                        ret = PTR_ERR(data);
1134                        module_put(driver->ops->owner);
1135                        continue;
1136                }
1137
1138                ret = __vfio_container_attach_groups(container, driver, data);
1139                if (ret) {
1140                        driver->ops->release(data);
1141                        module_put(driver->ops->owner);
1142                        continue;
1143                }
1144
1145                container->iommu_driver = driver;
1146                container->iommu_data = data;
1147                break;
1148        }
1149
1150        mutex_unlock(&vfio.iommu_drivers_lock);
1151        up_write(&container->group_lock);
1152
1153        return ret;
1154}
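
/*
 * Example (editorial sketch, not part of the original file): the
 * userspace side of the container/group ioctls above, following the
 * ordering required by vfio_ioctl_set_iommu() (a group must be added to
 * the container before VFIO_SET_IOMMU).  The group number and device
 * name are illustrative, per Documentation/vfio.txt.
 *
 *      int container, group, device;
 *      struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *      container = open("/dev/vfio/vfio", O_RDWR);
 *      if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *              err(1, "unknown VFIO API version");
 *      if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *              err(1, "no type1 IOMMU support");
 *
 *      group = open("/dev/vfio/26", O_RDWR);
 *      ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *      if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *              err(1, "group not viable, bind all its devices to vfio");
 *
 *      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *      ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *      device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */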
1155
1156static long vfio_fops_unl_ioctl(struct file *filep,
1157                                unsigned int cmd, unsigned long arg)
1158{
1159        struct vfio_container *container = filep->private_data;
1160        struct vfio_iommu_driver *driver;
1161        void *data;
1162        long ret = -EINVAL;
1163
1164        if (!container)
1165                return ret;
1166
1167        switch (cmd) {
1168        case VFIO_GET_API_VERSION:
1169                ret = VFIO_API_VERSION;
1170                break;
1171        case VFIO_CHECK_EXTENSION:
1172                ret = vfio_ioctl_check_extension(container, arg);
1173                break;
1174        case VFIO_SET_IOMMU:
1175                ret = vfio_ioctl_set_iommu(container, arg);
1176                break;
1177        default:
1178                driver = container->iommu_driver;
1179                data = container->iommu_data;
1180
1181                if (driver) /* passthrough all unrecognized ioctls */
1182                        ret = driver->ops->ioctl(data, cmd, arg);
1183        }
1184
1185        return ret;
1186}
1187
1188#ifdef CONFIG_COMPAT
1189static long vfio_fops_compat_ioctl(struct file *filep,
1190                                   unsigned int cmd, unsigned long arg)
1191{
1192        arg = (unsigned long)compat_ptr(arg);
1193        return vfio_fops_unl_ioctl(filep, cmd, arg);
1194}
1195#endif  /* CONFIG_COMPAT */
1196
1197static int vfio_fops_open(struct inode *inode, struct file *filep)
1198{
1199        struct vfio_container *container;
1200
1201        container = kzalloc(sizeof(*container), GFP_KERNEL);
1202        if (!container)
1203                return -ENOMEM;
1204
1205        INIT_LIST_HEAD(&container->group_list);
1206        init_rwsem(&container->group_lock);
1207        kref_init(&container->kref);
1208
1209        filep->private_data = container;
1210
1211        return 0;
1212}
1213
1214static int vfio_fops_release(struct inode *inode, struct file *filep)
1215{
1216        struct vfio_container *container = filep->private_data;
1217
1218        filep->private_data = NULL;
1219
1220        vfio_container_put(container);
1221
1222        return 0;
1223}
1224
1225/*
1226 * Once an iommu driver is set, we optionally pass read/write/mmap
1227 * on to the driver, allowing management interfaces beyond ioctl.
1228 */
1229static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1230                              size_t count, loff_t *ppos)
1231{
1232        struct vfio_container *container = filep->private_data;
1233        struct vfio_iommu_driver *driver;
1234        ssize_t ret = -EINVAL;
1235
1236        driver = container->iommu_driver;
1237        if (likely(driver && driver->ops->read))
1238                ret = driver->ops->read(container->iommu_data,
1239                                        buf, count, ppos);
1240
1241        return ret;
1242}
1243
1244static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1245                               size_t count, loff_t *ppos)
1246{
1247        struct vfio_container *container = filep->private_data;
1248        struct vfio_iommu_driver *driver;
1249        ssize_t ret = -EINVAL;
1250
1251        driver = container->iommu_driver;
1252        if (likely(driver && driver->ops->write))
1253                ret = driver->ops->write(container->iommu_data,
1254                                         buf, count, ppos);
1255
1256        return ret;
1257}
1258
1259static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1260{
1261        struct vfio_container *container = filep->private_data;
1262        struct vfio_iommu_driver *driver;
1263        int ret = -EINVAL;
1264
1265        driver = container->iommu_driver;
1266        if (likely(driver && driver->ops->mmap))
1267                ret = driver->ops->mmap(container->iommu_data, vma);
1268
1269        return ret;
1270}
1271
1272static const struct file_operations vfio_fops = {
1273        .owner          = THIS_MODULE,
1274        .open           = vfio_fops_open,
1275        .release        = vfio_fops_release,
1276        .read           = vfio_fops_read,
1277        .write          = vfio_fops_write,
1278        .unlocked_ioctl = vfio_fops_unl_ioctl,
1279#ifdef CONFIG_COMPAT
1280        .compat_ioctl   = vfio_fops_compat_ioctl,
1281#endif
1282        .mmap           = vfio_fops_mmap,
1283};
1284
1285/**
1286 * VFIO Group fd, /dev/vfio/$GROUP
1287 */
1288static void __vfio_group_unset_container(struct vfio_group *group)
1289{
1290        struct vfio_container *container = group->container;
1291        struct vfio_iommu_driver *driver;
1292
1293        down_write(&container->group_lock);
1294
1295        driver = container->iommu_driver;
1296        if (driver)
1297                driver->ops->detach_group(container->iommu_data,
1298                                          group->iommu_group);
1299
1300        group->container = NULL;
1301        list_del(&group->container_next);
1302
1303        /* Detaching the last group deprivileges a container, remove iommu */
1304        if (driver && list_empty(&container->group_list)) {
1305                driver->ops->release(container->iommu_data);
1306                module_put(driver->ops->owner);
1307                container->iommu_driver = NULL;
1308                container->iommu_data = NULL;
1309        }
1310
1311        up_write(&container->group_lock);
1312
1313        vfio_container_put(container);
1314}
1315
1316/*
1317 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1318 * if there was no container to unset.  Since the ioctl is called on
1319 * the group, we know that still exists, therefore the only valid
1320 * transition here is 1->0.
1321 */
1322static int vfio_group_unset_container(struct vfio_group *group)
1323{
1324        int users = atomic_cmpxchg(&group->container_users, 1, 0);
1325
1326        if (!users)
1327                return -EINVAL;
1328        if (users != 1)
1329                return -EBUSY;
1330
1331        __vfio_group_unset_container(group);
1332
1333        return 0;
1334}
1335
1336/*
1337 * When removing container users, anything that removes the last user
1338 * implicitly removes the group from the container.  That is, if the
1339 * group file descriptor is closed, as well as any device file descriptors,
1340 * the group is free.
1341 */
1342static void vfio_group_try_dissolve_container(struct vfio_group *group)
1343{
1344        if (0 == atomic_dec_if_positive(&group->container_users))
1345                __vfio_group_unset_container(group);
1346}
1347
1348static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1349{
1350        struct fd f;
1351        struct vfio_container *container;
1352        struct vfio_iommu_driver *driver;
1353        int ret = 0;
1354
1355        if (atomic_read(&group->container_users))
1356                return -EINVAL;
1357
1358        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1359                return -EPERM;
1360
1361        f = fdget(container_fd);
1362        if (!f.file)
1363                return -EBADF;
1364
1365        /* Sanity check, is this really our fd? */
1366        if (f.file->f_op != &vfio_fops) {
1367                fdput(f);
1368                return -EINVAL;
1369        }
1370
1371        container = f.file->private_data;
1372        WARN_ON(!container); /* fget ensures we don't race vfio_release */
1373
1374        down_write(&container->group_lock);
1375
1376        /* Real groups and fake groups cannot mix */
1377        if (!list_empty(&container->group_list) &&
1378            container->noiommu != group->noiommu) {
1379                ret = -EPERM;
1380                goto unlock_out;
1381        }
1382
1383        driver = container->iommu_driver;
1384        if (driver) {
1385                ret = driver->ops->attach_group(container->iommu_data,
1386                                                group->iommu_group);
1387                if (ret)
1388                        goto unlock_out;
1389        }
1390
1391        group->container = container;
1392        container->noiommu = group->noiommu;
1393        list_add(&group->container_next, &container->group_list);
1394
1395        /* Get a reference on the container and mark a user within the group */
1396        vfio_container_get(container);
1397        atomic_inc(&group->container_users);
1398
1399unlock_out:
1400        up_write(&container->group_lock);
1401        fdput(f);
1402        return ret;
1403}
1404
1405static bool vfio_group_viable(struct vfio_group *group)
1406{
1407        return (iommu_group_for_each_dev(group->iommu_group,
1408                                         group, vfio_dev_viable) == 0);
1409}
1410
1411static int vfio_group_add_container_user(struct vfio_group *group)
1412{
1413        if (!atomic_inc_not_zero(&group->container_users))
1414                return -EINVAL;
1415
1416        if (group->noiommu) {
1417                atomic_dec(&group->container_users);
1418                return -EPERM;
1419        }
1420        if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1421                atomic_dec(&group->container_users);
1422                return -EINVAL;
1423        }
1424
1425        return 0;
1426}
1427
1428static const struct file_operations vfio_device_fops;
1429
1430static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1431{
1432        struct vfio_device *device;
1433        struct file *filep;
1434        int ret;
1435
1436        if (0 == atomic_read(&group->container_users) ||
1437            !group->container->iommu_driver || !vfio_group_viable(group))
1438                return -EINVAL;
1439
1440        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1441                return -EPERM;
1442
1443        device = vfio_device_get_from_name(group, buf);
1444        if (!device)
1445                return -ENODEV;
1446
1447        ret = device->ops->open(device->device_data);
1448        if (ret) {
1449                vfio_device_put(device);
1450                return ret;
1451        }
1452
1453        /*
1454         * We can't use anon_inode_getfd() because we need to modify
1455         * the f_mode flags directly to allow more than just ioctls
1456         */
1457        ret = get_unused_fd_flags(O_CLOEXEC);
1458        if (ret < 0) {
1459                device->ops->release(device->device_data);
1460                vfio_device_put(device);
1461                return ret;
1462        }
1463
1464        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1465                                   device, O_RDWR);
1466        if (IS_ERR(filep)) {
1467                put_unused_fd(ret);
1468                ret = PTR_ERR(filep);
1469                device->ops->release(device->device_data);
1470                vfio_device_put(device);
1471                return ret;
1472        }
1473
1474        /*
1475         * TODO: add an anon_inode interface to do this.
1476         * Appears to be missing by lack of need rather than
1477         * explicitly prevented.  Now there's need.
1478         */
1479        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1480
1481        atomic_inc(&group->container_users);
1482
1483        fd_install(ret, filep);
1484
1485        if (group->noiommu)
1486                dev_warn(device->dev, "vfio-noiommu device opened by user "
1487                         "(%s:%d)\n", current->comm, task_pid_nr(current));
1488
1489        return ret;
1490}
1491
1492static long vfio_group_fops_unl_ioctl(struct file *filep,
1493                                      unsigned int cmd, unsigned long arg)
1494{
1495        struct vfio_group *group = filep->private_data;
1496        long ret = -ENOTTY;
1497
1498        switch (cmd) {
1499        case VFIO_GROUP_GET_STATUS:
1500        {
1501                struct vfio_group_status status;
1502                unsigned long minsz;
1503
1504                minsz = offsetofend(struct vfio_group_status, flags);
1505
1506                if (copy_from_user(&status, (void __user *)arg, minsz))
1507                        return -EFAULT;
1508
1509                if (status.argsz < minsz)
1510                        return -EINVAL;
1511
1512                status.flags = 0;
1513
1514                if (vfio_group_viable(group))
1515                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1516
1517                if (group->container)
1518                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1519
1520                if (copy_to_user((void __user *)arg, &status, minsz))
1521                        return -EFAULT;
1522
1523                ret = 0;
1524                break;
1525        }
1526        case VFIO_GROUP_SET_CONTAINER:
1527        {
1528                int fd;
1529
1530                if (get_user(fd, (int __user *)arg))
1531                        return -EFAULT;
1532
1533                if (fd < 0)
1534                        return -EINVAL;
1535
1536                ret = vfio_group_set_container(group, fd);
1537                break;
1538        }
1539        case VFIO_GROUP_UNSET_CONTAINER:
1540                ret = vfio_group_unset_container(group);
1541                break;
1542        case VFIO_GROUP_GET_DEVICE_FD:
1543        {
1544                char *buf;
1545
1546                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1547                if (IS_ERR(buf))
1548                        return PTR_ERR(buf);
1549
1550                ret = vfio_group_get_device_fd(group, buf);
1551                kfree(buf);
1552                break;
1553        }
1554        }
1555
1556        return ret;
1557}
1558
1559#ifdef CONFIG_COMPAT
1560static long vfio_group_fops_compat_ioctl(struct file *filep,
1561                                         unsigned int cmd, unsigned long arg)
1562{
1563        arg = (unsigned long)compat_ptr(arg);
1564        return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1565}
1566#endif  /* CONFIG_COMPAT */
1567
1568static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1569{
1570        struct vfio_group *group;
1571        int opened;
1572
1573        group = vfio_group_get_from_minor(iminor(inode));
1574        if (!group)
1575                return -ENODEV;
1576
1577        if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1578                vfio_group_put(group);
1579                return -EPERM;
1580        }
1581
1582        /* Do we need multiple instances of the group open?  Seems not. */
1583        opened = atomic_cmpxchg(&group->opened, 0, 1);
1584        if (opened) {
1585                vfio_group_put(group);
1586                return -EBUSY;
1587        }
1588
1589        /* Is something still in use from a previous open? */
1590        if (group->container) {
1591                atomic_dec(&group->opened);
1592                vfio_group_put(group);
1593                return -EBUSY;
1594        }
1595
1596        /* Warn if the previous user didn't clean up; re-init to drop them */
1597        if (WARN_ON(group->notifier.head))
1598                BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1599
1600        filep->private_data = group;
1601
1602        return 0;
1603}
1604
1605static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1606{
1607        struct vfio_group *group = filep->private_data;
1608
1609        filep->private_data = NULL;
1610
1611        vfio_group_try_dissolve_container(group);
1612
1613        atomic_dec(&group->opened);
1614
1615        vfio_group_put(group);
1616
1617        return 0;
1618}
1619
1620static const struct file_operations vfio_group_fops = {
1621        .owner          = THIS_MODULE,
1622        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1623#ifdef CONFIG_COMPAT
1624        .compat_ioctl   = vfio_group_fops_compat_ioctl,
1625#endif
1626        .open           = vfio_group_fops_open,
1627        .release        = vfio_group_fops_release,
1628};
1629
1630/**
1631 * VFIO Device fd
1632 */
1633static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1634{
1635        struct vfio_device *device = filep->private_data;
1636
1637        device->ops->release(device->device_data);
1638
1639        vfio_group_try_dissolve_container(device->group);
1640
1641        vfio_device_put(device);
1642
1643        return 0;
1644}
1645
1646static long vfio_device_fops_unl_ioctl(struct file *filep,
1647                                       unsigned int cmd, unsigned long arg)
1648{
1649        struct vfio_device *device = filep->private_data;
1650
1651        if (unlikely(!device->ops->ioctl))
1652                return -EINVAL;
1653
1654        return device->ops->ioctl(device->device_data, cmd, arg);
1655}
1656
1657static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1658                                     size_t count, loff_t *ppos)
1659{
1660        struct vfio_device *device = filep->private_data;
1661
1662        if (unlikely(!device->ops->read))
1663                return -EINVAL;
1664
1665        return device->ops->read(device->device_data, buf, count, ppos);
1666}
1667
1668static ssize_t vfio_device_fops_write(struct file *filep,
1669                                      const char __user *buf,
1670                                      size_t count, loff_t *ppos)
1671{
1672        struct vfio_device *device = filep->private_data;
1673
1674        if (unlikely(!device->ops->write))
1675                return -EINVAL;
1676
1677        return device->ops->write(device->device_data, buf, count, ppos);
1678}
1679
1680static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1681{
1682        struct vfio_device *device = filep->private_data;
1683
1684        if (unlikely(!device->ops->mmap))
1685                return -EINVAL;
1686
1687        return device->ops->mmap(device->device_data, vma);
1688}
1689
1690#ifdef CONFIG_COMPAT
1691static long vfio_device_fops_compat_ioctl(struct file *filep,
1692                                          unsigned int cmd, unsigned long arg)
1693{
1694        arg = (unsigned long)compat_ptr(arg);
1695        return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1696}
1697#endif  /* CONFIG_COMPAT */
1698
1699static const struct file_operations vfio_device_fops = {
1700        .owner          = THIS_MODULE,
1701        .release        = vfio_device_fops_release,
1702        .read           = vfio_device_fops_read,
1703        .write          = vfio_device_fops_write,
1704        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1705#ifdef CONFIG_COMPAT
1706        .compat_ioctl   = vfio_device_fops_compat_ioctl,
1707#endif
1708        .mmap           = vfio_device_fops_mmap,
1709};
1710
1711/**
1712 * External user API, exported as symbols to be linked dynamically.
1713 *
1714 * The protocol is:
1715 *  1. The normal VFIO init sequence is performed:
1716 *      - a new container is opened;
1717 *      - group(s) are attached to it;
1718 *      - an IOMMU driver is set for the container.
1719 * Once an IOMMU is set for a container, all groups in it are
1720 * considered ready for use by an external user.
1721 *
1722 * 2. User space passes a group fd to the external user.
1723 * The external user calls vfio_group_get_external_user()
1724 * to verify that:
1725 *      - the group is initialized;
1726 *      - an IOMMU is set for it.
1727 * If both checks pass, vfio_group_get_external_user()
1728 * increments the container user counter to prevent the VFIO
1729 * group from being disposed of before the external user exits.
1730 *
1731 * 3. The external user calls vfio_external_user_iommu_id()
1732 * to obtain the IOMMU group ID.
1733 *
1734 * 4. When the external user (e.g. KVM) is finished, it calls
1735 * vfio_group_put_external_user() to release the VFIO group.
1736 * This call decrements the container user counter.
1737 */
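/*
 * Illustrative sketch only (not called from this file): roughly how an
 * external user such as KVM's VFIO device would consume the helpers
 * below, given a group fd received from user space.  The surrounding
 * bookkeeping and error paths are the caller's and are abbreviated here.
 *
 *	struct vfio_group *grp;
 *	struct fd f = fdget(group_fd);
 *	int id;
 *
 *	if (!f.file)
 *		return -EBADF;
 *
 *	grp = vfio_group_get_external_user(f.file);
 *	fdput(f);
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *
 *	id = vfio_external_user_iommu_id(grp);
 *	... track the group/IOMMU group ID for the lifetime of the VM ...
 *
 *	vfio_group_put_external_user(grp);
 */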
1738struct vfio_group *vfio_group_get_external_user(struct file *filep)
1739{
1740        struct vfio_group *group = filep->private_data;
1741        int ret;
1742
1743        if (filep->f_op != &vfio_group_fops)
1744                return ERR_PTR(-EINVAL);
1745
1746        ret = vfio_group_add_container_user(group);
1747        if (ret)
1748                return ERR_PTR(ret);
1749
1750        vfio_group_get(group);
1751
1752        return group;
1753}
1754EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1755
1756void vfio_group_put_external_user(struct vfio_group *group)
1757{
1758        vfio_group_try_dissolve_container(group);
1759        vfio_group_put(group);
1760}
1761EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1762
1763bool vfio_external_group_match_file(struct vfio_group *test_group,
1764                                    struct file *filep)
1765{
1766        struct vfio_group *group = filep->private_data;
1767
1768        return (filep->f_op == &vfio_group_fops) && (group == test_group);
1769}
1770EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1771
1772int vfio_external_user_iommu_id(struct vfio_group *group)
1773{
1774        return iommu_group_id(group->iommu_group);
1775}
1776EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1777
1778long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1779{
1780        return vfio_ioctl_check_extension(group->container, arg);
1781}
1782EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1783
1784/**
1785 * Sub-module support
1786 */
1787/*
1788 * Helper for managing a buffer of info-chain capabilities: allocate or
1789 * reallocate the buffer with room for an additional @size bytes, filling
1790 * in @id and @version of the new capability.  A pointer to it is returned.
1791 *
1792 * NB. The chain is based at the head of the buffer, so new entries are
1793 * added to the tail; vfio_info_cap_shift() should be called to fix up
1794 * the next offsets prior to copying to the user buffer.
1795 */
1796struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1797                                               size_t size, u16 id, u16 version)
1798{
1799        void *buf;
1800        struct vfio_info_cap_header *header, *tmp;
1801
1802        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1803        if (!buf) {
1804                kfree(caps->buf);
1805                caps->size = 0;
1806                return ERR_PTR(-ENOMEM);
1807        }
1808
1809        caps->buf = buf;
1810        header = buf + caps->size;
1811
1812        /* Eventually copied to user buffer, zero */
1813        memset(header, 0, size);
1814
1815        header->id = id;
1816        header->version = version;
1817
1818        /* Add to the end of the capability chain */
1819        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1820                ; /* nothing */
1821
1822        tmp->next = caps->size;
1823        caps->size += size;
1824
1825        return header;
1826}
1827EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1828
1829void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1830{
1831        struct vfio_info_cap_header *tmp;
1832        void *buf = (void *)caps->buf;
1833
1834        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1835                tmp->next += offset;
1836}
1837EXPORT_SYMBOL(vfio_info_cap_shift);
1838
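/*
 * Illustrative sketch only (assumed caller, e.g. a bus driver handling
 * VFIO_DEVICE_GET_REGION_INFO): build the chain with vfio_info_cap_add(),
 * then shift the chained offsets past the fixed-size header before
 * copying out.  info, arg and ret come from the surrounding ioctl
 * handler and are assumed here.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_region_info_cap_type *cap;
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, sizeof(*cap),
 *				   VFIO_REGION_INFO_CAP_TYPE, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *
 *	cap = container_of(header, struct vfio_region_info_cap_type, header);
 *	cap->type = ...;
 *	cap->subtype = ...;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user((void __user *)arg + sizeof(info),
 *				 caps.buf, caps.size))
 *			ret = -EFAULT;
 *		kfree(caps.buf);
 *	}
 */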
1839static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1840{
1841        struct vfio_info_cap_header *header;
1842        struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1843        size_t size;
1844
1845        size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
1846        header = vfio_info_cap_add(caps, size,
1847                                   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1848        if (IS_ERR(header))
1849                return PTR_ERR(header);
1850
1851        sparse_cap = container_of(header,
1852                        struct vfio_region_info_cap_sparse_mmap, header);
1853        sparse_cap->nr_areas = sparse->nr_areas;
1854        memcpy(sparse_cap->areas, sparse->areas,
1855               sparse->nr_areas * sizeof(*sparse->areas));
1856        return 0;
1857}
1858
1859static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1860{
1861        struct vfio_info_cap_header *header;
1862        struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1863
1864        header = vfio_info_cap_add(caps, sizeof(*cap),
1865                                   VFIO_REGION_INFO_CAP_TYPE, 1);
1866        if (IS_ERR(header))
1867                return PTR_ERR(header);
1868
1869        type_cap = container_of(header, struct vfio_region_info_cap_type,
1870                                header);
1871        type_cap->type = cap->type;
1872        type_cap->subtype = cap->subtype;
1873        return 0;
1874}
1875
1876int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1877                             void *cap_type)
1878{
1879        int ret = -EINVAL;
1880
1881        if (!cap_type)
1882                return 0;
1883
1884        switch (cap_type_id) {
1885        case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1886                ret = sparse_mmap_cap(caps, cap_type);
1887                break;
1888
1889        case VFIO_REGION_INFO_CAP_TYPE:
1890                ret = region_type_cap(caps, cap_type);
1891                break;
1892        }
1893
1894        return ret;
1895}
1896EXPORT_SYMBOL(vfio_info_add_capability);
1897
1898int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1899                                       int max_irq_type, size_t *data_size)
1900{
1901        unsigned long minsz;
1902        size_t size;
1903
1904        minsz = offsetofend(struct vfio_irq_set, count);
1905
1906        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1907            (hdr->count >= (U32_MAX - hdr->start)) ||
1908            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1909                                VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1910                return -EINVAL;
1911
1912        if (data_size)
1913                *data_size = 0;
1914
1915        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1916                return -EINVAL;
1917
1918        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1919        case VFIO_IRQ_SET_DATA_NONE:
1920                size = 0;
1921                break;
1922        case VFIO_IRQ_SET_DATA_BOOL:
1923                size = sizeof(uint8_t);
1924                break;
1925        case VFIO_IRQ_SET_DATA_EVENTFD:
1926                size = sizeof(int32_t);
1927                break;
1928        default:
1929                return -EINVAL;
1930        }
1931
1932        if (size) {
1933                if (hdr->argsz - minsz < hdr->count * size)
1934                        return -EINVAL;
1935
1936                if (!data_size)
1937                        return -EINVAL;
1938
1939                *data_size = hdr->count * size;
1940        }
1941
1942        return 0;
1943}
1944EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
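/*
 * Illustrative sketch only (assumed caller, e.g. a bus driver handling
 * VFIO_DEVICE_SET_IRQS): validate the header, then copy in the variable
 * sized data array if one is expected.  minsz, arg, num_irqs,
 * num_irq_types and ret are placeholders from the surrounding handler.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 num_irq_types, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */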
1945
1946/*
1947 * Pin a set of guest PFNs and return their associated host PFNs for local
1948 * domain only.
1949 * @dev [in]     : device
1950 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1951 * @npage [in]   : count of elements in user_pfn array.  This count should not
1952 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1953 * @prot [in]    : protection flags (IOMMU_READ/IOMMU_WRITE)
1954 * @phys_pfn[out]: array of host PFNs
1955 * Return error or number of pages pinned.
1956 */
1957int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1958                   int prot, unsigned long *phys_pfn)
1959{
1960        struct vfio_container *container;
1961        struct vfio_group *group;
1962        struct vfio_iommu_driver *driver;
1963        int ret;
1964
1965        if (!dev || !user_pfn || !phys_pfn || !npage)
1966                return -EINVAL;
1967
1968        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1969                return -E2BIG;
1970
1971        group = vfio_group_get_from_dev(dev);
1972        if (!group)
1973                return -ENODEV;
1974
1975        ret = vfio_group_add_container_user(group);
1976        if (ret)
1977                goto err_pin_pages;
1978
1979        container = group->container;
1980        driver = container->iommu_driver;
1981        if (likely(driver && driver->ops->pin_pages))
1982                ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1983                                             npage, prot, phys_pfn);
1984        else
1985                ret = -ENOTTY;
1986
1987        vfio_group_try_dissolve_container(group);
1988
1989err_pin_pages:
1990        vfio_group_put(group);
1991        return ret;
1992}
1993EXPORT_SYMBOL(vfio_pin_pages);
1994
1995/*
1996 * Unpin a set of guest PFNs for the local domain only.
1997 * @dev [in]     : device
1998 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1999 *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2000 * @npage [in]   : count of elements in user_pfn array.  This count should not
2001 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2002 * Return error or number of pages unpinned.
2003 */
2004int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2005{
2006        struct vfio_container *container;
2007        struct vfio_group *group;
2008        struct vfio_iommu_driver *driver;
2009        int ret;
2010
2011        if (!dev || !user_pfn || !npage)
2012                return -EINVAL;
2013
2014        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2015                return -E2BIG;
2016
2017        group = vfio_group_get_from_dev(dev);
2018        if (!group)
2019                return -ENODEV;
2020
2021        ret = vfio_group_add_container_user(group);
2022        if (ret)
2023                goto err_unpin_pages;
2024
2025        container = group->container;
2026        driver = container->iommu_driver;
2027        if (likely(driver && driver->ops->unpin_pages))
2028                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2029                                               npage);
2030        else
2031                ret = -ENOTTY;
2032
2033        vfio_group_try_dissolve_container(group);
2034
2035err_unpin_pages:
2036        vfio_group_put(group);
2037        return ret;
2038}
2039EXPORT_SYMBOL(vfio_unpin_pages);
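/*
 * Illustrative sketch only (assumed caller, e.g. an mdev vendor driver):
 * translate and pin a single guest PFN before programming a DMA, then
 * unpin it when the mapping is no longer needed.  mdev, gpa and the use
 * of the returned host PFN are the caller's and are assumed here.
 *
 *	unsigned long gfn = gpa >> PAGE_SHIFT;
 *	unsigned long hpfn;
 *	int ret;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &gfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &hpfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	... hand hpfn << PAGE_SHIFT to the device, do the DMA ...
 *
 *	vfio_unpin_pages(mdev_dev(mdev), &gfn, 1);
 */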
2040
2041static int vfio_register_iommu_notifier(struct vfio_group *group,
2042                                        unsigned long *events,
2043                                        struct notifier_block *nb)
2044{
2045        struct vfio_container *container;
2046        struct vfio_iommu_driver *driver;
2047        int ret;
2048
2049        ret = vfio_group_add_container_user(group);
2050        if (ret)
2051                return -EINVAL;
2052
2053        container = group->container;
2054        driver = container->iommu_driver;
2055        if (likely(driver && driver->ops->register_notifier))
2056                ret = driver->ops->register_notifier(container->iommu_data,
2057                                                     events, nb);
2058        else
2059                ret = -ENOTTY;
2060
2061        vfio_group_try_dissolve_container(group);
2062
2063        return ret;
2064}
2065
2066static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2067                                          struct notifier_block *nb)
2068{
2069        struct vfio_container *container;
2070        struct vfio_iommu_driver *driver;
2071        int ret;
2072
2073        ret = vfio_group_add_container_user(group);
2074        if (ret)
2075                return -EINVAL;
2076
2077        container = group->container;
2078        driver = container->iommu_driver;
2079        if (likely(driver && driver->ops->unregister_notifier))
2080                ret = driver->ops->unregister_notifier(container->iommu_data,
2081                                                       nb);
2082        else
2083                ret = -ENOTTY;
2084
2085        vfio_group_try_dissolve_container(group);
2086
2087        return ret;
2088}
2089
2090void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2091{
2092        group->kvm = kvm;
2093        blocking_notifier_call_chain(&group->notifier,
2094                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2095}
2096EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2097
2098static int vfio_register_group_notifier(struct vfio_group *group,
2099                                        unsigned long *events,
2100                                        struct notifier_block *nb)
2101{
2102        int ret;
2103        bool set_kvm = false;
2104
2105        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2106                set_kvm = true;
2107
2108        /* clear known events */
2109        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2110
2111        /* refuse to continue if any unknown events remain */
2112        if (*events)
2113                return -EINVAL;
2114
2115        ret = vfio_group_add_container_user(group);
2116        if (ret)
2117                return -EINVAL;
2118
2119        ret = blocking_notifier_chain_register(&group->notifier, nb);
2120
2121        /*
2122         * The kvm pointer may already have been set for this group, so
2123         * replay the VFIO_GROUP_NOTIFY_SET_KVM event once on registration.
2124         */
2125        if (!ret && set_kvm && group->kvm)
2126                blocking_notifier_call_chain(&group->notifier,
2127                                        VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2128
2129        vfio_group_try_dissolve_container(group);
2130
2131        return ret;
2132}
2133
2134static int vfio_unregister_group_notifier(struct vfio_group *group,
2135                                         struct notifier_block *nb)
2136{
2137        int ret;
2138
2139        ret = vfio_group_add_container_user(group);
2140        if (ret)
2141                return -EINVAL;
2142
2143        ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2144
2145        vfio_group_try_dissolve_container(group);
2146
2147        return ret;
2148}
2149
2150int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2151                           unsigned long *events, struct notifier_block *nb)
2152{
2153        struct vfio_group *group;
2154        int ret;
2155
2156        if (!dev || !nb || !events || (*events == 0))
2157                return -EINVAL;
2158
2159        group = vfio_group_get_from_dev(dev);
2160        if (!group)
2161                return -ENODEV;
2162
2163        switch (type) {
2164        case VFIO_IOMMU_NOTIFY:
2165                ret = vfio_register_iommu_notifier(group, events, nb);
2166                break;
2167        case VFIO_GROUP_NOTIFY:
2168                ret = vfio_register_group_notifier(group, events, nb);
2169                break;
2170        default:
2171                ret = -EINVAL;
2172        }
2173
2174        vfio_group_put(group);
2175        return ret;
2176}
2177EXPORT_SYMBOL(vfio_register_notifier);
2178
2179int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2180                             struct notifier_block *nb)
2181{
2182        struct vfio_group *group;
2183        int ret;
2184
2185        if (!dev || !nb)
2186                return -EINVAL;
2187
2188        group = vfio_group_get_from_dev(dev);
2189        if (!group)
2190                return -ENODEV;
2191
2192        switch (type) {
2193        case VFIO_IOMMU_NOTIFY:
2194                ret = vfio_unregister_iommu_notifier(group, nb);
2195                break;
2196        case VFIO_GROUP_NOTIFY:
2197                ret = vfio_unregister_group_notifier(group, nb);
2198                break;
2199        default:
2200                ret = -EINVAL;
2201        }
2202
2203        vfio_group_put(group);
2204        return ret;
2205}
2206EXPORT_SYMBOL(vfio_unregister_notifier);
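/*
 * Illustrative sketch only (assumed caller): a vendor driver that pins
 * pages typically registers for DMA unmap notifications so it can drop
 * its pinnings when the user unmaps an IOVA range.  The callback data
 * for VFIO_IOMMU_NOTIFY_DMA_UNMAP is a struct vfio_iommu_type1_dma_unmap
 * describing the range being unmapped; dev and ret are assumed.
 *
 *	static int my_dma_unmap(struct notifier_block *nb,
 *				unsigned long action, void *data)
 *	{
 *		struct vfio_iommu_type1_dma_unmap *unmap = data;
 *
 *		... invalidate pinnings in [unmap->iova,
 *		    unmap->iova + unmap->size) ...
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *	struct notifier_block nb = { .notifier_call = my_dma_unmap };
 *
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &nb);
 *	...
 *	vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &nb);
 */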
2207
2208/**
2209 * Module/class support
2210 */
2211static char *vfio_devnode(struct device *dev, umode_t *mode)
2212{
2213        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2214}
2215
2216static struct miscdevice vfio_dev = {
2217        .minor = VFIO_MINOR,
2218        .name = "vfio",
2219        .fops = &vfio_fops,
2220        .nodename = "vfio/vfio",
2221        .mode = S_IRUGO | S_IWUGO,
2222};
2223
2224static int __init vfio_init(void)
2225{
2226        int ret;
2227
2228        idr_init(&vfio.group_idr);
2229        mutex_init(&vfio.group_lock);
2230        mutex_init(&vfio.iommu_drivers_lock);
2231        INIT_LIST_HEAD(&vfio.group_list);
2232        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2233        init_waitqueue_head(&vfio.release_q);
2234
2235        ret = misc_register(&vfio_dev);
2236        if (ret) {
2237                pr_err("vfio: misc device register failed\n");
2238                return ret;
2239        }
2240
2241        /* /dev/vfio/$GROUP */
2242        vfio.class = class_create(THIS_MODULE, "vfio");
2243        if (IS_ERR(vfio.class)) {
2244                ret = PTR_ERR(vfio.class);
2245                goto err_class;
2246        }
2247
2248        vfio.class->devnode = vfio_devnode;
2249
2250        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2251        if (ret)
2252                goto err_alloc_chrdev;
2253
2254        cdev_init(&vfio.group_cdev, &vfio_group_fops);
2255        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2256        if (ret)
2257                goto err_cdev_add;
2258
2259        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2260
2261#ifdef CONFIG_VFIO_NOIOMMU
2262        vfio_register_iommu_driver(&vfio_noiommu_ops);
2263#endif
2264        return 0;
2265
2266err_cdev_add:
2267        unregister_chrdev_region(vfio.group_devt, MINORMASK);
2268err_alloc_chrdev:
2269        class_destroy(vfio.class);
2270        vfio.class = NULL;
2271err_class:
2272        misc_deregister(&vfio_dev);
2273        return ret;
2274}
2275
2276static void __exit vfio_cleanup(void)
2277{
2278        WARN_ON(!list_empty(&vfio.group_list));
2279
2280#ifdef CONFIG_VFIO_NOIOMMU
2281        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2282#endif
2283        idr_destroy(&vfio.group_idr);
2284        cdev_del(&vfio.group_cdev);
2285        unregister_chrdev_region(vfio.group_devt, MINORMASK);
2286        class_destroy(vfio.class);
2287        vfio.class = NULL;
2288        misc_deregister(&vfio_dev);
2289}
2290
2291module_init(vfio_init);
2292module_exit(vfio_cleanup);
2293
2294MODULE_VERSION(DRIVER_VERSION);
2295MODULE_LICENSE("GPL v2");
2296MODULE_AUTHOR(DRIVER_AUTHOR);
2297MODULE_DESCRIPTION(DRIVER_DESC);
2298MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2299MODULE_ALIAS("devname:vfio/vfio");
2300MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2301