LXR linux/drivers/vfio/vfio.c

   1/*
   2 * VFIO core
   3 *
   4 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5 *     Author: Alex Williamson <alex.williamson@redhat.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio:
  12 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13 * Author: Tom Lyon, pugs@cisco.com
  14 */
  15
  16#include <linux/cdev.h>
  17#include <linux/compat.h>
  18#include <linux/device.h>
  19#include <linux/file.h>
  20#include <linux/anon_inodes.h>
  21#include <linux/fs.h>
  22#include <linux/idr.h>
  23#include <linux/iommu.h>
  24#include <linux/list.h>
  25#include <linux/miscdevice.h>
  26#include <linux/module.h>
  27#include <linux/mutex.h>
  28#include <linux/pci.h>
  29#include <linux/rwsem.h>
  30#include <linux/sched.h>
  31#include <linux/slab.h>
  32#include <linux/stat.h>
  33#include <linux/string.h>
  34#include <linux/uaccess.h>
  35#include <linux/vfio.h>
  36#include <linux/wait.h>
  37
  38#define DRIVER_VERSION  "0.3"
  39#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  40#define DRIVER_DESC     "VFIO - User Level meta-driver"
  41
  42static struct vfio {
  43        struct class                    *class;
  44        struct list_head                iommu_drivers_list;
  45        struct mutex                    iommu_drivers_lock;
  46        struct list_head                group_list;
  47        struct idr                      group_idr;
  48        struct mutex                    group_lock;
  49        struct cdev                     group_cdev;
  50        dev_t                           group_devt;
  51        wait_queue_head_t               release_q;
  52} vfio;
  53
  54struct vfio_iommu_driver {
  55        const struct vfio_iommu_driver_ops      *ops;
  56        struct list_head                        vfio_next;
  57};
  58
  59struct vfio_container {
  60        struct kref                     kref;
  61        struct list_head                group_list;
  62        struct rw_semaphore             group_lock;
  63        struct vfio_iommu_driver        *iommu_driver;
  64        void                            *iommu_data;
  65        bool                            noiommu;
  66};
  67
  68struct vfio_unbound_dev {
  69        struct device                   *dev;
  70        struct list_head                unbound_next;
  71};
  72
  73struct vfio_group {
  74        struct kref                     kref;
  75        int                             minor;
  76        atomic_t                        container_users;
  77        struct iommu_group              *iommu_group;
  78        struct vfio_container           *container;
  79        struct list_head                device_list;
  80        struct mutex                    device_lock;
  81        struct device                   *dev;
  82        struct notifier_block           nb;
  83        struct list_head                vfio_next;
  84        struct list_head                container_next;
  85        struct list_head                unbound_list;
  86        struct mutex                    unbound_lock;
  87        atomic_t                        opened;
  88        wait_queue_head_t               container_q;
  89        bool                            noiommu;
  90        struct kvm                      *kvm;
  91        struct blocking_notifier_head   notifier;
  92};
  93
  94struct vfio_device {
  95        struct kref                     kref;
  96        struct device                   *dev;
  97        const struct vfio_device_ops    *ops;
  98        struct vfio_group               *group;
  99        struct list_head                group_next;
 100        void                            *device_data;
 101};
 102
 103#ifdef CONFIG_VFIO_NOIOMMU
 104static bool noiommu __read_mostly;
 105module_param_named(enable_unsafe_noiommu_mode,
 106                   noiommu, bool, S_IRUGO | S_IWUSR);
 107MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 108#endif
 109
 110/*
 111 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 112 * and remove functions, any use cases other than acquiring the first
 113 * reference for the purpose of calling vfio_add_group_dev() or removing
 114 * that symmetric reference after vfio_del_group_dev() should use the raw
 115 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 116 * removes the device from the dummy group and cannot be nested.
 117 */
 118struct iommu_group *vfio_iommu_group_get(struct device *dev)
 119{
 120        struct iommu_group *group;
 121        int __maybe_unused ret;
 122
 123        group = iommu_group_get(dev);
 124
 125#ifdef CONFIG_VFIO_NOIOMMU
 126        /*
 127         * With noiommu enabled, an IOMMU group will be created for a device
 128         * that doesn't already have one and doesn't have an iommu_ops on their
 129         * bus.  We set iommudata simply to be able to identify these groups
 130         * as special use and for reclamation later.
 131         */
 132        if (group || !noiommu || iommu_present(dev->bus))
 133                return group;
 134
 135        group = iommu_group_alloc();
 136        if (IS_ERR(group))
 137                return NULL;
 138
 139        iommu_group_set_name(group, "vfio-noiommu");
 140        iommu_group_set_iommudata(group, &noiommu, NULL);
 141        ret = iommu_group_add_device(group, dev);
 142        if (ret) {
 143                iommu_group_put(group);
 144                return NULL;
 145        }
 146
 147        /*
 148         * Where to taint?  At this point we've added an IOMMU group for a
 149         * device that is not backed by iommu_ops, therefore any iommu_
 150         * callback using iommu_ops can legitimately Oops.  So, while we may
 151         * be about to give a DMA capable device to a user without IOMMU
 152         * protection, which is clearly taint-worthy, let's go ahead and do
 153         * it here.
 154         */
 155        add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 156        dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 157#endif
 158
 159        return group;
 160}
 161EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 162
 163void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 164{
 165#ifdef CONFIG_VFIO_NOIOMMU
 166        if (iommu_group_get_iommudata(group) == &noiommu)
 167                iommu_group_remove_device(dev);
 168#endif
 169
 170        iommu_group_put(group);
 171}
 172EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 173
 174#ifdef CONFIG_VFIO_NOIOMMU
 175static void *vfio_noiommu_open(unsigned long arg)
 176{
 177        if (arg != VFIO_NOIOMMU_IOMMU)
 178                return ERR_PTR(-EINVAL);
 179        if (!capable(CAP_SYS_RAWIO))
 180                return ERR_PTR(-EPERM);
 181
 182        return NULL;
 183}
 184
 185static void vfio_noiommu_release(void *iommu_data)
 186{
 187}
 188
 189static long vfio_noiommu_ioctl(void *iommu_data,
 190                               unsigned int cmd, unsigned long arg)
 191{
 192        if (cmd == VFIO_CHECK_EXTENSION)
 193                return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 194
 195        return -ENOTTY;
 196}
 197
 198static int vfio_noiommu_attach_group(void *iommu_data,
 199                                     struct iommu_group *iommu_group)
 200{
 201        return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 202}
 203
 204static void vfio_noiommu_detach_group(void *iommu_data,
 205                                      struct iommu_group *iommu_group)
 206{
 207}
 208
 209static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 210        .name = "vfio-noiommu",
 211        .owner = THIS_MODULE,
 212        .open = vfio_noiommu_open,
 213        .release = vfio_noiommu_release,
 214        .ioctl = vfio_noiommu_ioctl,
 215        .attach_group = vfio_noiommu_attach_group,
 216        .detach_group = vfio_noiommu_detach_group,
 217};
 218#endif
 219
 220
  /*
   * IOMMU driver registration
   */
 224int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 225{
 226        struct vfio_iommu_driver *driver, *tmp;
 227
 228        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 229        if (!driver)
 230                return -ENOMEM;
 231
 232        driver->ops = ops;
 233
 234        mutex_lock(&vfio.iommu_drivers_lock);
 235
 236        /* Check for duplicates */
 237        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 238                if (tmp->ops == ops) {
 239                        mutex_unlock(&vfio.iommu_drivers_lock);
 240                        kfree(driver);
 241                        return -EINVAL;
 242                }
 243        }
 244
 245        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 246
 247        mutex_unlock(&vfio.iommu_drivers_lock);
 248
 249        return 0;
 250}
 251EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 252
 253void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 254{
 255        struct vfio_iommu_driver *driver;
 256
 257        mutex_lock(&vfio.iommu_drivers_lock);
 258        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 259                if (driver->ops == ops) {
 260                        list_del(&driver->vfio_next);
 261                        mutex_unlock(&vfio.iommu_drivers_lock);
 262                        kfree(driver);
 263                        return;
 264                }
 265        }
 266        mutex_unlock(&vfio.iommu_drivers_lock);
 267}
 268EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 269
  /*
   * Group minor allocation/free - both called with vfio.group_lock held
   */
 273static int vfio_alloc_group_minor(struct vfio_group *group)
 274{
 275        return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 276}
 277
 278static void vfio_free_group_minor(int minor)
 279{
 280        idr_remove(&vfio.group_idr, minor);
 281}
 282
 283static int vfio_iommu_group_notifier(struct notifier_block *nb,
 284                                     unsigned long action, void *data);
 285static void vfio_group_get(struct vfio_group *group);
 286
  /*
   * Container objects - containers are created when /dev/vfio/vfio is
   * opened, but their lifecycle extends until the last user is done, so
   * they are freed via kref.  Must support container/group/device being
   * closed in any order.
   */
 293static void vfio_container_get(struct vfio_container *container)
 294{
 295        kref_get(&container->kref);
 296}
 297
 298static void vfio_container_release(struct kref *kref)
 299{
 300        struct vfio_container *container;
 301        container = container_of(kref, struct vfio_container, kref);
 302
 303        kfree(container);
 304}
 305
 306static void vfio_container_put(struct vfio_container *container)
 307{
 308        kref_put(&container->kref, vfio_container_release);
 309}
 310
 311static void vfio_group_unlock_and_free(struct vfio_group *group)
 312{
 313        mutex_unlock(&vfio.group_lock);
 314        /*
 315         * Unregister outside of lock.  A spurious callback is harmless now
 316         * that the group is no longer in vfio.group_list.
 317         */
 318        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 319        kfree(group);
 320}
 321
  /*
   * Group objects - create, release, get, put, search
   */
 325static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 326{
 327        struct vfio_group *group, *tmp;
 328        struct device *dev;
 329        int ret, minor;
 330
 331        group = kzalloc(sizeof(*group), GFP_KERNEL);
 332        if (!group)
 333                return ERR_PTR(-ENOMEM);
 334
 335        kref_init(&group->kref);
 336        INIT_LIST_HEAD(&group->device_list);
 337        mutex_init(&group->device_lock);
 338        INIT_LIST_HEAD(&group->unbound_list);
 339        mutex_init(&group->unbound_lock);
 340        atomic_set(&group->container_users, 0);
 341        atomic_set(&group->opened, 0);
 342        init_waitqueue_head(&group->container_q);
 343        group->iommu_group = iommu_group;
 344#ifdef CONFIG_VFIO_NOIOMMU
 345        group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 346#endif
 347        BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 348
 349        group->nb.notifier_call = vfio_iommu_group_notifier;
 350
 351        /*
 352         * blocking notifiers acquire a rwsem around registering and hold
 353         * it around callback.  Therefore, need to register outside of
 354         * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 355         * do anything unless it can find the group in vfio.group_list, so
 356         * no harm in registering early.
 357         */
 358        ret = iommu_group_register_notifier(iommu_group, &group->nb);
 359        if (ret) {
 360                kfree(group);
 361                return ERR_PTR(ret);
 362        }
 363
 364        mutex_lock(&vfio.group_lock);
 365
 366        /* Did we race creating this group? */
 367        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 368                if (tmp->iommu_group == iommu_group) {
 369                        vfio_group_get(tmp);
 370                        vfio_group_unlock_and_free(group);
 371                        return tmp;
 372                }
 373        }
 374
 375        minor = vfio_alloc_group_minor(group);
 376        if (minor < 0) {
 377                vfio_group_unlock_and_free(group);
 378                return ERR_PTR(minor);
 379        }
 380
 381        dev = device_create(vfio.class, NULL,
 382                            MKDEV(MAJOR(vfio.group_devt), minor),
 383                            group, "%s%d", group->noiommu ? "noiommu-" : "",
 384                            iommu_group_id(iommu_group));
 385        if (IS_ERR(dev)) {
 386                vfio_free_group_minor(minor);
 387                vfio_group_unlock_and_free(group);
 388                return ERR_CAST(dev);
 389        }
 390
 391        group->minor = minor;
 392        group->dev = dev;
 393
 394        list_add(&group->vfio_next, &vfio.group_list);
 395
 396        mutex_unlock(&vfio.group_lock);
 397
 398        return group;
 399}
 400
 401/* called with vfio.group_lock held */
 402static void vfio_group_release(struct kref *kref)
 403{
 404        struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 405        struct vfio_unbound_dev *unbound, *tmp;
 406        struct iommu_group *iommu_group = group->iommu_group;
 407
 408        WARN_ON(!list_empty(&group->device_list));
 409        WARN_ON(group->notifier.head);
 410
 411        list_for_each_entry_safe(unbound, tmp,
 412                                 &group->unbound_list, unbound_next) {
 413                list_del(&unbound->unbound_next);
 414                kfree(unbound);
 415        }
 416
 417        device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 418        list_del(&group->vfio_next);
 419        vfio_free_group_minor(group->minor);
 420        vfio_group_unlock_and_free(group);
 421        iommu_group_put(iommu_group);
 422}
 423
 424static void vfio_group_put(struct vfio_group *group)
 425{
 426        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 427}
 428
 429struct vfio_group_put_work {
 430        struct work_struct work;
 431        struct vfio_group *group;
 432};
 433
 434static void vfio_group_put_bg(struct work_struct *work)
 435{
 436        struct vfio_group_put_work *do_work;
 437
 438        do_work = container_of(work, struct vfio_group_put_work, work);
 439
 440        vfio_group_put(do_work->group);
 441        kfree(do_work);
 442}
 443
 444static void vfio_group_schedule_put(struct vfio_group *group)
 445{
 446        struct vfio_group_put_work *do_work;
 447
 448        do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 449        if (WARN_ON(!do_work))
 450                return;
 451
 452        INIT_WORK(&do_work->work, vfio_group_put_bg);
 453        do_work->group = group;
 454        schedule_work(&do_work->work);
 455}
 456
 457/* Assume group_lock or group reference is held */
 458static void vfio_group_get(struct vfio_group *group)
 459{
 460        kref_get(&group->kref);
 461}
 462
 463/*
 464 * Not really a try as we will sleep for mutex, but we need to make
 465 * sure the group pointer is valid under lock and get a reference.
 466 */
 467static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 468{
 469        struct vfio_group *target = group;
 470
 471        mutex_lock(&vfio.group_lock);
 472        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 473                if (group == target) {
 474                        vfio_group_get(group);
 475                        mutex_unlock(&vfio.group_lock);
 476                        return group;
 477                }
 478        }
 479        mutex_unlock(&vfio.group_lock);
 480
 481        return NULL;
 482}
 483
 484static
 485struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 486{
 487        struct vfio_group *group;
 488
 489        mutex_lock(&vfio.group_lock);
 490        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 491                if (group->iommu_group == iommu_group) {
 492                        vfio_group_get(group);
 493                        mutex_unlock(&vfio.group_lock);
 494                        return group;
 495                }
 496        }
 497        mutex_unlock(&vfio.group_lock);
 498
 499        return NULL;
 500}
 501
 502static struct vfio_group *vfio_group_get_from_minor(int minor)
 503{
 504        struct vfio_group *group;
 505
 506        mutex_lock(&vfio.group_lock);
 507        group = idr_find(&vfio.group_idr, minor);
 508        if (!group) {
 509                mutex_unlock(&vfio.group_lock);
 510                return NULL;
 511        }
 512        vfio_group_get(group);
 513        mutex_unlock(&vfio.group_lock);
 514
 515        return group;
 516}
 517
 518static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 519{
 520        struct iommu_group *iommu_group;
 521        struct vfio_group *group;
 522
 523        iommu_group = iommu_group_get(dev);
 524        if (!iommu_group)
 525                return NULL;
 526
 527        group = vfio_group_get_from_iommu(iommu_group);
 528        iommu_group_put(iommu_group);
 529
 530        return group;
 531}
 532
  /*
   * Device objects - create, release, get, put, search
   */
 536static
 537struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 538                                             struct device *dev,
 539                                             const struct vfio_device_ops *ops,
 540                                             void *device_data)
 541{
 542        struct vfio_device *device;
 543
 544        device = kzalloc(sizeof(*device), GFP_KERNEL);
 545        if (!device)
 546                return ERR_PTR(-ENOMEM);
 547
 548        kref_init(&device->kref);
 549        device->dev = dev;
 550        device->group = group;
 551        device->ops = ops;
 552        device->device_data = device_data;
 553        dev_set_drvdata(dev, device);
 554
 555        /* No need to get group_lock, caller has group reference */
 556        vfio_group_get(group);
 557
 558        mutex_lock(&group->device_lock);
 559        list_add(&device->group_next, &group->device_list);
 560        mutex_unlock(&group->device_lock);
 561
 562        return device;
 563}
 564
 565static void vfio_device_release(struct kref *kref)
 566{
 567        struct vfio_device *device = container_of(kref,
 568                                                  struct vfio_device, kref);
 569        struct vfio_group *group = device->group;
 570
 571        list_del(&device->group_next);
 572        mutex_unlock(&group->device_lock);
 573
 574        dev_set_drvdata(device->dev, NULL);
 575
 576        kfree(device);
 577
 578        /* vfio_del_group_dev may be waiting for this device */
 579        wake_up(&vfio.release_q);
 580}
 581
 582/* Device reference always implies a group reference */
 583void vfio_device_put(struct vfio_device *device)
 584{
 585        struct vfio_group *group = device->group;
 586        kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 587        vfio_group_put(group);
 588}
 589EXPORT_SYMBOL_GPL(vfio_device_put);
 590
 591static void vfio_device_get(struct vfio_device *device)
 592{
 593        vfio_group_get(device->group);
 594        kref_get(&device->kref);
 595}
 596
 597static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 598                                                 struct device *dev)
 599{
 600        struct vfio_device *device;
 601
 602        mutex_lock(&group->device_lock);
 603        list_for_each_entry(device, &group->device_list, group_next) {
 604                if (device->dev == dev) {
 605                        vfio_device_get(device);
 606                        mutex_unlock(&group->device_lock);
 607                        return device;
 608                }
 609        }
 610        mutex_unlock(&group->device_lock);
 611        return NULL;
 612}
 613
 614/*
 615 * Some drivers, like pci-stub, are only used to prevent other drivers from
 616 * claiming a device and are therefore perfectly legitimate for a user owned
 617 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 618 * of the device, but it does prevent the user from having direct access to
 619 * the device, which is useful in some circumstances.
 620 *
 621 * We also assume that we can include PCI interconnect devices, ie. bridges.
 622 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 623 * then all of the downstream devices will be part of the same IOMMU group as
 624 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 625 * breaks anything, it only does so for user owned devices downstream.  Note
 626 * that error notification via MSI can be affected for platforms that handle
 627 * MSI within the same IOVA space as DMA.
 628 */
 629static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 630
 631static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 632{
 633        int i;
 634
 635        if (dev_is_pci(dev)) {
 636                struct pci_dev *pdev = to_pci_dev(dev);
 637
 638                if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 639                        return true;
 640        }
 641
 642        for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
 643                if (!strcmp(drv->name, vfio_driver_whitelist[i]))
 644                        return true;
 645        }
 646
 647        return false;
 648}
 649
 650/*
 651 * A vfio group is viable for use by userspace if all devices are in
 652 * one of the following states:
 653 *  - driver-less
 654 *  - bound to a vfio driver
 655 *  - bound to a whitelisted driver
 656 *  - a PCI interconnect device
 657 *
 658 * We use two methods to determine whether a device is bound to a vfio
 659 * driver.  The first is to test whether the device exists in the vfio
 660 * group.  The second is to test if the device exists on the group
 661 * unbound_list, indicating it's in the middle of transitioning from
 662 * a vfio driver to driver-less.
 663 */
 664static int vfio_dev_viable(struct device *dev, void *data)
 665{
 666        struct vfio_group *group = data;
 667        struct vfio_device *device;
 668        struct device_driver *drv = ACCESS_ONCE(dev->driver);
 669        struct vfio_unbound_dev *unbound;
 670        int ret = -EINVAL;
 671
 672        mutex_lock(&group->unbound_lock);
 673        list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 674                if (dev == unbound->dev) {
 675                        ret = 0;
 676                        break;
 677                }
 678        }
 679        mutex_unlock(&group->unbound_lock);
 680
 681        if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 682                return 0;
 683
 684        device = vfio_group_get_device(group, dev);
 685        if (device) {
 686                vfio_device_put(device);
 687                return 0;
 688        }
 689
 690        return ret;
 691}
 692
  /*
   * Async device support
   */
 696static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 697{
 698        struct vfio_device *device;
 699
 700        /* Do we already know about it?  We shouldn't */
 701        device = vfio_group_get_device(group, dev);
 702        if (WARN_ON_ONCE(device)) {
 703                vfio_device_put(device);
 704                return 0;
 705        }
 706
 707        /* Nothing to do for idle groups */
 708        if (!atomic_read(&group->container_users))
 709                return 0;
 710
 711        /* TODO Prevent device auto probing */
 712        WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
 713             iommu_group_id(group->iommu_group));
 714
 715        return 0;
 716}
 717
 718static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 719{
 720        /* We don't care what happens when the group isn't in use */
 721        if (!atomic_read(&group->container_users))
 722                return 0;
 723
 724        return vfio_dev_viable(dev, group);
 725}
 726
 727static int vfio_iommu_group_notifier(struct notifier_block *nb,
 728                                     unsigned long action, void *data)
 729{
 730        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 731        struct device *dev = data;
 732        struct vfio_unbound_dev *unbound;
 733
 734        /*
 735         * Need to go through a group_lock lookup to get a reference or we
 736         * risk racing a group being removed.  Ignore spurious notifies.
 737         */
 738        group = vfio_group_try_get(group);
 739        if (!group)
 740                return NOTIFY_OK;
 741
 742        switch (action) {
 743        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 744                vfio_group_nb_add_dev(group, dev);
 745                break;
 746        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 747                /*
 748                 * Nothing to do here.  If the device is in use, then the
 749                 * vfio sub-driver should block the remove callback until
 750                 * it is unused.  If the device is unused or attached to a
 751                 * stub driver, then it should be released and we don't
 752                 * care that it will be going away.
 753                 */
 754                break;
 755        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 756                pr_debug("%s: Device %s, group %d binding to driver\n",
 757                         __func__, dev_name(dev),
 758                         iommu_group_id(group->iommu_group));
 759                break;
 760        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 761                pr_debug("%s: Device %s, group %d bound to driver %s\n",
 762                         __func__, dev_name(dev),
 763                         iommu_group_id(group->iommu_group), dev->driver->name);
 764                BUG_ON(vfio_group_nb_verify(group, dev));
 765                break;
 766        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 767                pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
 768                         __func__, dev_name(dev),
 769                         iommu_group_id(group->iommu_group), dev->driver->name);
 770                break;
 771        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 772                pr_debug("%s: Device %s, group %d unbound from driver\n",
 773                         __func__, dev_name(dev),
 774                         iommu_group_id(group->iommu_group));
 775                /*
 776                 * XXX An unbound device in a live group is ok, but we'd
 777                 * really like to avoid the above BUG_ON by preventing other
 778                 * drivers from binding to it.  Once that occurs, we have to
 779                 * stop the system to maintain isolation.  At a minimum, we'd
 780                 * want a toggle to disable driver auto probe for this device.
 781                 */
 782
 783                mutex_lock(&group->unbound_lock);
 784                list_for_each_entry(unbound,
 785                                    &group->unbound_list, unbound_next) {
 786                        if (dev == unbound->dev) {
 787                                list_del(&unbound->unbound_next);
 788                                kfree(unbound);
 789                                break;
 790                        }
 791                }
 792                mutex_unlock(&group->unbound_lock);
 793                break;
 794        }
 795
 796        /*
 797         * If we're the last reference to the group, the group will be
 798         * released, which includes unregistering the iommu group notifier.
 799         * We hold a read-lock on that notifier list, unregistering needs
 800         * a write-lock... deadlock.  Release our reference asynchronously
 801         * to avoid that situation.
 802         */
 803        vfio_group_schedule_put(group);
 804        return NOTIFY_OK;
 805}
 806
  /*
   * VFIO driver API
   */
 810int vfio_add_group_dev(struct device *dev,
 811                       const struct vfio_device_ops *ops, void *device_data)
 812{
 813        struct iommu_group *iommu_group;
 814        struct vfio_group *group;
 815        struct vfio_device *device;
 816
 817        iommu_group = iommu_group_get(dev);
 818        if (!iommu_group)
 819                return -EINVAL;
 820
 821        group = vfio_group_get_from_iommu(iommu_group);
 822        if (!group) {
 823                group = vfio_create_group(iommu_group);
 824                if (IS_ERR(group)) {
 825                        iommu_group_put(iommu_group);
 826                        return PTR_ERR(group);
 827                }
 828        } else {
 829                /*
 830                 * A found vfio_group already holds a reference to the
 831                 * iommu_group.  A created vfio_group keeps the reference.
 832                 */
 833                iommu_group_put(iommu_group);
 834        }
 835
 836        device = vfio_group_get_device(group, dev);
 837        if (device) {
 838                WARN(1, "Device %s already exists on group %d\n",
 839                     dev_name(dev), iommu_group_id(iommu_group));
 840                vfio_device_put(device);
 841                vfio_group_put(group);
 842                return -EBUSY;
 843        }
 844
 845        device = vfio_group_create_device(group, dev, ops, device_data);
 846        if (IS_ERR(device)) {
 847                vfio_group_put(group);
 848                return PTR_ERR(device);
 849        }
 850
 851        /*
 852         * Drop all but the vfio_device reference.  The vfio_device holds
 853         * a reference to the vfio_group, which holds a reference to the
 854         * iommu_group.
 855         */
 856        vfio_group_put(group);
 857
 858        return 0;
 859}
 860EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 861
 862/**
 863 * Get a reference to the vfio_device for a device.  Even if the
 864 * caller thinks they own the device, they could be racing with a
 865 * release call path, so we can't trust drvdata for the shortcut.
 866 * Go the long way around, from the iommu_group to the vfio_group
 867 * to the vfio_device.
 868 */
 869struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 870{
 871        struct vfio_group *group;
 872        struct vfio_device *device;
 873
 874        group = vfio_group_get_from_dev(dev);
 875        if (!group)
 876                return NULL;
 877
 878        device = vfio_group_get_device(group, dev);
 879        vfio_group_put(group);
 880
 881        return device;
 882}
 883EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 884
 885static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 886                                                     char *buf)
 887{
 888        struct vfio_device *it, *device = NULL;
 889
 890        mutex_lock(&group->device_lock);
 891        list_for_each_entry(it, &group->device_list, group_next) {
 892                if (!strcmp(dev_name(it->dev), buf)) {
 893                        device = it;
 894                        vfio_device_get(device);
 895                        break;
 896                }
 897        }
 898        mutex_unlock(&group->device_lock);
 899
 900        return device;
 901}
 902
 903/*
 904 * Caller must hold a reference to the vfio_device
 905 */
 906void *vfio_device_data(struct vfio_device *device)
 907{
 908        return device->device_data;
 909}
 910EXPORT_SYMBOL_GPL(vfio_device_data);
 911
 912/* Given a referenced group, check if it contains the device */
 913static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
 914{
 915        struct vfio_device *device;
 916
 917        device = vfio_group_get_device(group, dev);
 918        if (!device)
 919                return false;
 920
 921        vfio_device_put(device);
 922        return true;
 923}
 924
 925/*
 926 * Decrement the device reference count and wait for the device to be
 927 * removed.  Open file descriptors for the device... */
 928void *vfio_del_group_dev(struct device *dev)
 929{
 930        struct vfio_device *device = dev_get_drvdata(dev);
 931        struct vfio_group *group = device->group;
 932        void *device_data = device->device_data;
 933        struct vfio_unbound_dev *unbound;
 934        unsigned int i = 0;
 935        long ret;
 936        bool interrupted = false;
 937
 938        /*
 939         * The group exists so long as we have a device reference.  Get
 940         * a group reference and use it to scan for the device going away.
 941         */
 942        vfio_group_get(group);
 943
 944        /*
 945         * When the device is removed from the group, the group suddenly
 946         * becomes non-viable; the device has a driver (until the unbind
 947         * completes), but it's not present in the group.  This is bad news
 948         * for any external users that need to re-acquire a group reference
 949         * in order to match and release their existing reference.  To
 950         * solve this, we track such devices on the unbound_list to bridge
 951         * the gap until they're fully unbound.
 952         */
 953        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 954        if (unbound) {
 955                unbound->dev = dev;
 956                mutex_lock(&group->unbound_lock);
 957                list_add(&unbound->unbound_next, &group->unbound_list);
 958                mutex_unlock(&group->unbound_lock);
 959        }
 960        WARN_ON(!unbound);
 961
 962        vfio_device_put(device);
 963
 964        /*
 965         * If the device is still present in the group after the above
 966         * 'put', then it is in use and we need to request it from the
 967         * bus driver.  The driver may in turn need to request the
 968         * device from the user.  We send the request on an arbitrary
 969         * interval with counter to allow the driver to take escalating
 970         * measures to release the device if it has the ability to do so.
 971         */
 972        do {
 973                device = vfio_group_get_device(group, dev);
 974                if (!device)
 975                        break;
 976
 977                if (device->ops->request)
 978                        device->ops->request(device_data, i++);
 979
 980                vfio_device_put(device);
 981
 982                if (interrupted) {
 983                        ret = wait_event_timeout(vfio.release_q,
 984                                        !vfio_dev_present(group, dev), HZ * 10);
 985                } else {
 986                        ret = wait_event_interruptible_timeout(vfio.release_q,
 987                                        !vfio_dev_present(group, dev), HZ * 10);
 988                        if (ret == -ERESTARTSYS) {
 989                                interrupted = true;
 990                                dev_warn(dev,
 991                                         "Device is currently in use, task"
 992                                         " \"%s\" (%d) "
 993                                         "blocked until device is released",
 994                                         current->comm, task_pid_nr(current));
 995                        }
 996                }
 997        } while (ret <= 0);
 998
 999        /*
1000         * In order to support multiple devices per group, devices can be

1001         * plucked from the group while other devices in the group are still
1002         * in use.  The container persists with this group and those remaining
1003         * devices still attached.  If the user creates an isolation violation
1004         * by binding this device to another driver while the group is still in
1005         * use, that's their fault.  However, in the case of removing the last,
1006         * or potentially the only, device in the group there can be no other
1007         * in-use devices in the group.  The user has done their due diligence
1008         * and we should lay no claims to those devices.  In order to do that,
1009         * we need to make sure the group is detached from the container.
1010         * Without this stall, we're potentially racing with a user process
1011         * that may attempt to immediately bind this device to another driver.
1012         */
1013        if (list_empty(&group->device_list))
1014                wait_event(group->container_q, !group->container);
1015
1016        vfio_group_put(group);
1017
1018        return device_data;
1019}
1020EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1021
1022/**
1023 * VFIO base fd, /dev/vfio/vfio
1024 */
1025static long vfio_ioctl_check_extension(struct vfio_container *container,
1026                                       unsigned long arg)
1027{
1028        struct vfio_iommu_driver *driver;
1029        long ret = 0;
1030
1031        down_read(&container->group_lock);
1032
1033        driver = container->iommu_driver;
1034
1035        switch (arg) {
1036                /* No base extensions yet */
1037        default:
1038                /*
1039                 * If no driver is set, poll all registered drivers for
1040                 * extensions and return the first positive result.  If
1041                 * a driver is already set, further queries will be passed
1042                 * only to that driver.
1043                 */
1044                if (!driver) {
1045                        mutex_lock(&vfio.iommu_drivers_lock);
1046                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
1047                                            vfio_next) {
1048
1049#ifdef CONFIG_VFIO_NOIOMMU
1050                                if (!list_empty(&container->group_list) &&
1051                                    (container->noiommu !=
1052                                     (driver->ops == &vfio_noiommu_ops)))
1053                                        continue;
1054#endif
1055
1056                                if (!try_module_get(driver->ops->owner))
1057                                        continue;
1058
1059                                ret = driver->ops->ioctl(NULL,
1060                                                         VFIO_CHECK_EXTENSION,
1061                                                         arg);
1062                                module_put(driver->ops->owner);
1063                                if (ret > 0)
1064                                        break;
1065                        }
1066                        mutex_unlock(&vfio.iommu_drivers_lock);
1067                } else
1068                        ret = driver->ops->ioctl(container->iommu_data,
1069                                                 VFIO_CHECK_EXTENSION, arg);
1070        }
1071
1072        up_read(&container->group_lock);
1073
1074        return ret;
1075}
1076
1077/* hold write lock on container->group_lock */
1078static int __vfio_container_attach_groups(struct vfio_container *container,
1079                                          struct vfio_iommu_driver *driver,
1080                                          void *data)
1081{
1082        struct vfio_group *group;
1083        int ret = -ENODEV;
1084
1085        list_for_each_entry(group, &container->group_list, container_next) {
1086                ret = driver->ops->attach_group(data, group->iommu_group);
1087                if (ret)
1088                        goto unwind;
1089        }
1090
1091        return ret;
1092
1093unwind:
1094        list_for_each_entry_continue_reverse(group, &container->group_list,
1095                                             container_next) {
1096                driver->ops->detach_group(data, group->iommu_group);
1097        }
1098
1099        return ret;
1100}
1101
1102static long vfio_ioctl_set_iommu(struct vfio_container *container,
1103                                 unsigned long arg)
1104{
1105        struct vfio_iommu_driver *driver;
1106        long ret = -ENODEV;
1107
1108        down_write(&container->group_lock);
1109
1110        /*
1111         * The container is designed to be an unprivileged interface while
1112         * the group can be assigned to specific users.  Therefore, only by
1113         * adding a group to a container does the user get the privilege of
1114         * enabling the iommu, which may allocate finite resources.  There
1115         * is no unset_iommu, but by removing all the groups from a container,
1116         * the container is deprivileged and returns to an unset state.
1117         */
1118        if (list_empty(&container->group_list) || container->iommu_driver) {
1119                up_write(&container->group_lock);
1120                return -EINVAL;
1121        }
1122
1123        mutex_lock(&vfio.iommu_drivers_lock);
1124        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1125                void *data;
1126
1127#ifdef CONFIG_VFIO_NOIOMMU
1128                /*
1129                 * Only noiommu containers can use vfio-noiommu and noiommu
1130                 * containers can only use vfio-noiommu.
1131                 */
1132                if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1133                        continue;
1134#endif
1135
1136                if (!try_module_get(driver->ops->owner))
1137                        continue;
1138
1139                /*
1140                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1141                 * so test which iommu driver reported support for this
1142                 * extension and call open on them.  We also pass them the
1143                 * magic, allowing a single driver to support multiple
1144                 * interfaces if they'd like.
1145                 */
1146                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1147                        module_put(driver->ops->owner);
1148                        continue;
1149                }
1150
1151                data = driver->ops->open(arg);
1152                if (IS_ERR(data)) {
1153                        ret = PTR_ERR(data);
1154                        module_put(driver->ops->owner);
1155                        continue;
1156                }
1157
1158                ret = __vfio_container_attach_groups(container, driver, data);
1159                if (ret) {
1160                        driver->ops->release(data);
1161                        module_put(driver->ops->owner);
1162                        continue;
1163                }
1164
1165                container->iommu_driver = driver;
1166                container->iommu_data = data;
1167                break;
1168        }
1169
1170        mutex_unlock(&vfio.iommu_drivers_lock);
1171        up_write(&container->group_lock);
1172
1173        return ret;
1174}
1175
1176static long vfio_fops_unl_ioctl(struct file *filep,
1177                                unsigned int cmd, unsigned long arg)
1178{
1179        struct vfio_container *container = filep->private_data;
1180        struct vfio_iommu_driver *driver;
1181        void *data;
1182        long ret = -EINVAL;
1183
1184        if (!container)
1185                return ret;
1186
1187        switch (cmd) {
1188        case VFIO_GET_API_VERSION:
1189                ret = VFIO_API_VERSION;
1190                break;
1191        case VFIO_CHECK_EXTENSION:
1192                ret = vfio_ioctl_check_extension(container, arg);
1193                break;
1194        case VFIO_SET_IOMMU:
1195                ret = vfio_ioctl_set_iommu(container, arg);
1196                break;
1197        default:
1198                driver = container->iommu_driver;
1199                data = container->iommu_data;
1200
1201                if (driver) /* passthrough all unrecognized ioctls */
1202                        ret = driver->ops->ioctl(data, cmd, arg);
1203        }
1204
1205        return ret;
1206}
1207
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl for the container fd: convert the argument with
 * compat_ptr() and reuse the native handler.
 */
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif	/* CONFIG_COMPAT */
1216
1217static int vfio_fops_open(struct inode *inode, struct file *filep)
1218{
1219        struct vfio_container *container;
1220
1221        container = kzalloc(sizeof(*container), GFP_KERNEL);
1222        if (!container)
1223                return -ENOMEM;
1224
1225        INIT_LIST_HEAD(&container->group_list);
1226        init_rwsem(&container->group_lock);
1227        kref_init(&container->kref);
1228
1229        filep->private_data = container;
1230
1231        return 0;
1232}
1233
1234static int vfio_fops_release(struct inode *inode, struct file *filep)
1235{
1236        struct vfio_container *container = filep->private_data;
1237
1238        filep->private_data = NULL;
1239
1240        vfio_container_put(container);
1241
1242        return 0;
1243}
1244
1245/*
1246 * Once an iommu driver is set, we optionally pass read/write/mmap
1247 * on to the driver, allowing management interfaces beyond ioctl.
1248 */
1249static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1250                              size_t count, loff_t *ppos)
1251{
1252        struct vfio_container *container = filep->private_data;
1253        struct vfio_iommu_driver *driver;
1254        ssize_t ret = -EINVAL;
1255
1256        driver = container->iommu_driver;
1257        if (likely(driver && driver->ops->read))
1258                ret = driver->ops->read(container->iommu_data,
1259                                        buf, count, ppos);
1260
1261        return ret;
1262}
1263
1264static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1265                               size_t count, loff_t *ppos)
1266{
1267        struct vfio_container *container = filep->private_data;
1268        struct vfio_iommu_driver *driver;
1269        ssize_t ret = -EINVAL;
1270
1271        driver = container->iommu_driver;
1272        if (likely(driver && driver->ops->write))
1273                ret = driver->ops->write(container->iommu_data,
1274                                         buf, count, ppos);
1275
1276        return ret;
1277}
1278
1279static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1280{
1281        struct vfio_container *container = filep->private_data;
1282        struct vfio_iommu_driver *driver;
1283        int ret = -EINVAL;
1284
1285        driver = container->iommu_driver;
1286        if (likely(driver && driver->ops->mmap))
1287                ret = driver->ops->mmap(container->iommu_data, vma);
1288
1289        return ret;
1290}
1291
1292static const struct file_operations vfio_fops = {
1293        .owner          = THIS_MODULE,
1294        .open           = vfio_fops_open,
1295        .release        = vfio_fops_release,
1296        .read           = vfio_fops_read,
1297        .write          = vfio_fops_write,
1298        .unlocked_ioctl = vfio_fops_unl_ioctl,
1299#ifdef CONFIG_COMPAT
1300        .compat_ioctl   = vfio_fops_compat_ioctl,
1301#endif
1302        .mmap           = vfio_fops_mmap,
1303};
1304
1305/**
1306 * VFIO Group fd, /dev/vfio/$GROUP
1307 */
1308static void __vfio_group_unset_container(struct vfio_group *group)
1309{
1310        struct vfio_container *container = group->container;
1311        struct vfio_iommu_driver *driver;
1312
1313        down_write(&container->group_lock);
1314
1315        driver = container->iommu_driver;
1316        if (driver)
1317                driver->ops->detach_group(container->iommu_data,
1318                                          group->iommu_group);
1319
1320        group->container = NULL;
1321        wake_up(&group->container_q);
1322        list_del(&group->container_next);
1323
1324        /* Detaching the last group deprivileges a container, remove iommu */
1325        if (driver && list_empty(&container->group_list)) {
1326                driver->ops->release(container->iommu_data);
1327                module_put(driver->ops->owner);
1328                container->iommu_driver = NULL;
1329                container->iommu_data = NULL;
1330        }
1331
1332        up_write(&container->group_lock);
1333
1334        vfio_container_put(container);
1335}
1336
1337/*
1338 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1339 * if there was no container to unset.  Since the ioctl is called on
1340 * the group, we know that still exists, therefore the only valid
1341 * transition here is 1->0.
1342 */
1343static int vfio_group_unset_container(struct vfio_group *group)
1344{
1345        int users = atomic_cmpxchg(&group->container_users, 1, 0);
1346
1347        if (!users)
1348                return -EINVAL;
1349        if (users != 1)
1350                return -EBUSY;
1351
1352        __vfio_group_unset_container(group);
1353
1354        return 0;
1355}
1356
1357/*
1358 * When removing container users, anything that removes the last user
1359 * implicitly removes the group from the container.  That is, if the
1360 * group file descriptor is closed, as well as any device file descriptors,
1361 * the group is free.
1362 */
1363static void vfio_group_try_dissolve_container(struct vfio_group *group)
1364{
1365        if (0 == atomic_dec_if_positive(&group->container_users))
1366                __vfio_group_unset_container(group);
1367}
1368
1369static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1370{
1371        struct fd f;
1372        struct vfio_container *container;
1373        struct vfio_iommu_driver *driver;
1374        int ret = 0;
1375
1376        if (atomic_read(&group->container_users))
1377                return -EINVAL;
1378
1379        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1380                return -EPERM;
1381
1382        f = fdget(container_fd);
1383        if (!f.file)
1384                return -EBADF;
1385
1386        /* Sanity check, is this really our fd? */
1387        if (f.file->f_op != &vfio_fops) {
1388                fdput(f);
1389                return -EINVAL;
1390        }
1391
1392        container = f.file->private_data;
1393        WARN_ON(!container); /* fget ensures we don't race vfio_release */
1394
1395        down_write(&container->group_lock);
1396
1397        /* Real groups and fake groups cannot mix */
1398        if (!list_empty(&container->group_list) &&
1399            container->noiommu != group->noiommu) {
1400                ret = -EPERM;
1401                goto unlock_out;
1402        }
1403
1404        driver = container->iommu_driver;
1405        if (driver) {
1406                ret = driver->ops->attach_group(container->iommu_data,
1407                                                group->iommu_group);
1408                if (ret)
1409                        goto unlock_out;
1410        }
1411
1412        group->container = container;
1413        container->noiommu = group->noiommu;
1414        list_add(&group->container_next, &container->group_list);
1415
1416        /* Get a reference on the container and mark a user within the group */
1417        vfio_container_get(container);
1418        atomic_inc(&group->container_users);
1419
1420unlock_out:
1421        up_write(&container->group_lock);
1422        fdput(f);
1423        return ret;
1424}
1425
1426static bool vfio_group_viable(struct vfio_group *group)
1427{
1428        return (iommu_group_for_each_dev(group->iommu_group,
1429                                         group, vfio_dev_viable) == 0);
1430}
1431
1432static int vfio_group_add_container_user(struct vfio_group *group)
1433{
1434        if (!atomic_inc_not_zero(&group->container_users))
1435                return -EINVAL;
1436
1437        if (group->noiommu) {
1438                atomic_dec(&group->container_users);
1439                return -EPERM;
1440        }
1441        if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1442                atomic_dec(&group->container_users);
1443                return -EINVAL;
1444        }
1445
1446        return 0;
1447}
1448
1449static const struct file_operations vfio_device_fops;
1450
1451static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1452{
1453        struct vfio_device *device;
1454        struct file *filep;
1455        int ret;
1456
1457        if (0 == atomic_read(&group->container_users) ||
1458            !group->container->iommu_driver || !vfio_group_viable(group))
1459                return -EINVAL;
1460
1461        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1462                return -EPERM;
1463
1464        device = vfio_device_get_from_name(group, buf);
1465        if (!device)
1466                return -ENODEV;
1467
1468        ret = device->ops->open(device->device_data);
1469        if (ret) {
1470                vfio_device_put(device);
1471                return ret;
1472        }
1473
1474        /*
1475         * We can't use anon_inode_getfd() because we need to modify
1476         * the f_mode flags directly to allow more than just ioctls
1477         */
1478        ret = get_unused_fd_flags(O_CLOEXEC);
1479        if (ret < 0) {
1480                device->ops->release(device->device_data);
1481                vfio_device_put(device);
1482                return ret;
1483        }
1484
1485        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1486                                   device, O_RDWR);
1487        if (IS_ERR(filep)) {
1488                put_unused_fd(ret);
1489                ret = PTR_ERR(filep);
1490                device->ops->release(device->device_data);
1491                vfio_device_put(device);
1492                return ret;
1493        }
1494
1495        /*
1496         * TODO: add an anon_inode interface to do this.
1497         * Appears to be missing by lack of need rather than
1498         * explicitly prevented.  Now there's need.
1499         */
1500        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1501
1502        atomic_inc(&group->container_users);
1503
1504        fd_install(ret, filep);
1505
1506        if (group->noiommu)
1507                dev_warn(device->dev, "vfio-noiommu device opened by user "
1508                         "(%s:%d)\n", current->comm, task_pid_nr(current));
1509
1510        return ret;
1511}
1512
1513static long vfio_group_fops_unl_ioctl(struct file *filep,
1514                                      unsigned int cmd, unsigned long arg)
1515{
1516        struct vfio_group *group = filep->private_data;
1517        long ret = -ENOTTY;
1518
1519        switch (cmd) {
1520        case VFIO_GROUP_GET_STATUS:
1521        {
1522                struct vfio_group_status status;
1523                unsigned long minsz;
1524
1525                minsz = offsetofend(struct vfio_group_status, flags);
1526
1527                if (copy_from_user(&status, (void __user *)arg, minsz))
1528                        return -EFAULT;
1529
1530                if (status.argsz < minsz)
1531                        return -EINVAL;
1532
1533                status.flags = 0;
1534
1535                if (vfio_group_viable(group))
1536                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1537
1538                if (group->container)
1539                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1540
1541                if (copy_to_user((void __user *)arg, &status, minsz))
1542                        return -EFAULT;
1543
1544                ret = 0;
1545                break;
1546        }
1547        case VFIO_GROUP_SET_CONTAINER:
1548        {
1549                int fd;
1550
1551                if (get_user(fd, (int __user *)arg))
1552                        return -EFAULT;
1553
1554                if (fd < 0)
1555                        return -EINVAL;
1556
1557                ret = vfio_group_set_container(group, fd);
1558                break;
1559        }
1560        case VFIO_GROUP_UNSET_CONTAINER:
1561                ret = vfio_group_unset_container(group);
1562                break;
1563        case VFIO_GROUP_GET_DEVICE_FD:
1564        {
1565                char *buf;
1566
1567                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1568                if (IS_ERR(buf))
1569                        return PTR_ERR(buf);
1570
1571                ret = vfio_group_get_device_fd(group, buf);
1572                kfree(buf);
1573                break;
1574        }
1575        }
1576
1577        return ret;
1578}
1579
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl for the group fd: convert the argument with compat_ptr()
 * and reuse the native handler.
 */
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_group_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif	/* CONFIG_COMPAT */
1588
1589static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1590{
1591        struct vfio_group *group;
1592        int opened;
1593
1594        group = vfio_group_get_from_minor(iminor(inode));
1595        if (!group)
1596                return -ENODEV;
1597
1598        if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1599                vfio_group_put(group);
1600                return -EPERM;
1601        }
1602
1603        /* Do we need multiple instances of the group open?  Seems not. */
1604        opened = atomic_cmpxchg(&group->opened, 0, 1);
1605        if (opened) {
1606                vfio_group_put(group);
1607                return -EBUSY;
1608        }
1609
1610        /* Is something still in use from a previous open? */
1611        if (group->container) {
1612                atomic_dec(&group->opened);
1613                vfio_group_put(group);
1614                return -EBUSY;
1615        }
1616
1617        /* Warn if previous user didn't cleanup and re-init to drop them */
1618        if (WARN_ON(group->notifier.head))
1619                BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1620
1621        filep->private_data = group;
1622
1623        return 0;
1624}
1625
1626static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1627{
1628        struct vfio_group *group = filep->private_data;
1629
1630        filep->private_data = NULL;
1631
1632        vfio_group_try_dissolve_container(group);
1633
1634        atomic_dec(&group->opened);
1635
1636        vfio_group_put(group);
1637
1638        return 0;
1639}
1640
1641static const struct file_operations vfio_group_fops = {
1642        .owner          = THIS_MODULE,
1643        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1644#ifdef CONFIG_COMPAT
1645        .compat_ioctl   = vfio_group_fops_compat_ioctl,
1646#endif
1647        .open           = vfio_group_fops_open,
1648        .release        = vfio_group_fops_release,
1649};
1650
1651/**
1652 * VFIO Device fd
1653 */
1654static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1655{
1656        struct vfio_device *device = filep->private_data;
1657
1658        device->ops->release(device->device_data);
1659
1660        vfio_group_try_dissolve_container(device->group);
1661
1662        vfio_device_put(device);
1663
1664        return 0;
1665}
1666
1667static long vfio_device_fops_unl_ioctl(struct file *filep,
1668                                       unsigned int cmd, unsigned long arg)
1669{
1670        struct vfio_device *device = filep->private_data;
1671
1672        if (unlikely(!device->ops->ioctl))
1673                return -EINVAL;
1674
1675        return device->ops->ioctl(device->device_data, cmd, arg);
1676}
1677
1678static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1679                                     size_t count, loff_t *ppos)
1680{
1681        struct vfio_device *device = filep->private_data;
1682
1683        if (unlikely(!device->ops->read))
1684                return -EINVAL;
1685
1686        return device->ops->read(device->device_data, buf, count, ppos);
1687}
1688
1689static ssize_t vfio_device_fops_write(struct file *filep,
1690                                      const char __user *buf,
1691                                      size_t count, loff_t *ppos)
1692{
1693        struct vfio_device *device = filep->private_data;
1694
1695        if (unlikely(!device->ops->write))
1696                return -EINVAL;
1697
1698        return device->ops->write(device->device_data, buf, count, ppos);
1699}
1700
1701static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1702{
1703        struct vfio_device *device = filep->private_data;
1704
1705        if (unlikely(!device->ops->mmap))
1706                return -EINVAL;
1707
1708        return device->ops->mmap(device->device_data, vma);
1709}
1710
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl for a device fd: convert the argument with compat_ptr()
 * and reuse the native handler.
 */
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_device_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif	/* CONFIG_COMPAT */
1719
1720static const struct file_operations vfio_device_fops = {
1721        .owner          = THIS_MODULE,
1722        .release        = vfio_device_fops_release,
1723        .read           = vfio_device_fops_read,
1724        .write          = vfio_device_fops_write,
1725        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1726#ifdef CONFIG_COMPAT
1727        .compat_ioctl   = vfio_device_fops_compat_ioctl,
1728#endif
1729        .mmap           = vfio_device_fops_mmap,
1730};
1731
1732/**
1733 * External user API, exported by symbols to be linked dynamically.
1734 *
1735 * The protocol includes:
1736 *  1. do normal VFIO init operation:
1737 *      - opening a new container;
1738 *      - attaching group(s) to it;
1739 *      - setting an IOMMU driver for a container.
1740 * When IOMMU is set for a container, all groups in it are
1741 * considered ready to use by an external user.
1742 *
1743 * 2. User space passes a group fd to an external user.
1744 * The external user calls vfio_group_get_external_user()
1745 * to verify that:
1746 *      - the group is initialized;
1747 *      - IOMMU is set for it.
1748 * If both checks passed, vfio_group_get_external_user()
1749 * increments the container user counter to prevent
1750 * the VFIO group from disposal before KVM exits.
1751 *
1752 * 3. The external user calls vfio_external_user_iommu_id()
1753 * to know an IOMMU ID.
1754 *
1755 * 4. When the external KVM finishes, it calls
1756 * vfio_group_put_external_user() to release the VFIO group.
1757 * This call decrements the container user counter.
1758 */
1759struct vfio_group *vfio_group_get_external_user(struct file *filep)
1760{
1761        struct vfio_group *group = filep->private_data;
1762        int ret;
1763
1764        if (filep->f_op != &vfio_group_fops)
1765                return ERR_PTR(-EINVAL);
1766
1767        ret = vfio_group_add_container_user(group);
1768        if (ret)
1769                return ERR_PTR(ret);
1770
1771        vfio_group_get(group);
1772
1773        return group;
1774}
1775EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1776
/*
 * Counterpart of vfio_group_get_external_user(): give up the container
 * user taken there, then drop the group reference.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	/* Undo vfio_group_add_container_user() */
	vfio_group_try_dissolve_container(group);

	/* Undo vfio_group_get() */
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1783
1784bool vfio_external_group_match_file(struct vfio_group *test_group,
1785                                    struct file *filep)
1786{
1787        struct vfio_group *group = filep->private_data;
1788
1789        return (filep->f_op == &vfio_group_fops) && (group == test_group);
1790}
1791EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1792
1793int vfio_external_user_iommu_id(struct vfio_group *group)
1794{
1795        return iommu_group_id(group->iommu_group);
1796}
1797EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1798
1799long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1800{
1801        return vfio_ioctl_check_extension(group->container, arg);
1802}
1803EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1804
1805/**
1806 * Sub-module support
1807 */
1808/*
1809 * Helper for managing a buffer of info chain capabilities, allocate or
1810 * reallocate a buffer with additional @size, filling in @id and @version
1811 * of the capability.  A pointer to the new capability is returned.
1812 *
1813 * NB. The chain is based at the head of the buffer, so new entries are
1814 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1815 * next offsets prior to copying to the user buffer.
1816 */
1817struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1818                                               size_t size, u16 id, u16 version)
1819{
1820        void *buf;
1821        struct vfio_info_cap_header *header, *tmp;
1822
1823        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1824        if (!buf) {
1825                kfree(caps->buf);
1826                caps->size = 0;
1827                return ERR_PTR(-ENOMEM);
1828        }
1829
1830        caps->buf = buf;
1831        header = buf + caps->size;
1832
1833        /* Eventually copied to user buffer, zero */
1834        memset(header, 0, size);
1835
1836        header->id = id;
1837        header->version = version;
1838
1839        /* Add to the end of the capability chain */
1840        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1841                ; /* nothing */
1842
1843        tmp->next = caps->size;
1844        caps->size += size;
1845
1846        return header;
1847}
1848EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1849
1850void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1851{
1852        struct vfio_info_cap_header *tmp;
1853        void *buf = (void *)caps->buf;
1854
1855        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1856                tmp->next += offset;
1857}
1858EXPORT_SYMBOL(vfio_info_cap_shift);
1859
1860static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1861{
1862        struct vfio_info_cap_header *header;
1863        struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1864        size_t size;
1865
1866        size = sizeof(*sparse) + sparse->nr_areas *  sizeof(*sparse->areas);
1867        header = vfio_info_cap_add(caps, size,
1868                                   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1869        if (IS_ERR(header))
1870                return PTR_ERR(header);
1871
1872        sparse_cap = container_of(header,
1873                        struct vfio_region_info_cap_sparse_mmap, header);
1874        sparse_cap->nr_areas = sparse->nr_areas;
1875        memcpy(sparse_cap->areas, sparse->areas,
1876               sparse->nr_areas * sizeof(*sparse->areas));
1877        return 0;
1878}
1879
1880static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1881{
1882        struct vfio_info_cap_header *header;
1883        struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1884
1885        header = vfio_info_cap_add(caps, sizeof(*cap),
1886                                   VFIO_REGION_INFO_CAP_TYPE, 1);
1887        if (IS_ERR(header))
1888                return PTR_ERR(header);
1889
1890        type_cap = container_of(header, struct vfio_region_info_cap_type,
1891                                header);
1892        type_cap->type = cap->type;
1893        type_cap->subtype = cap->subtype;
1894        return 0;
1895}
1896
1897int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1898                             void *cap_type)
1899{
1900        int ret = -EINVAL;
1901
1902        if (!cap_type)
1903                return 0;
1904
1905        switch (cap_type_id) {
1906        case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1907                ret = sparse_mmap_cap(caps, cap_type);
1908                break;
1909
1910        case VFIO_REGION_INFO_CAP_TYPE:
1911                ret = region_type_cap(caps, cap_type);
1912                break;
1913        }
1914
1915        return ret;
1916}
1917EXPORT_SYMBOL(vfio_info_add_capability);
1918
1919int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1920                                       int max_irq_type, size_t *data_size)
1921{
1922        unsigned long minsz;
1923        size_t size;
1924
1925        minsz = offsetofend(struct vfio_irq_set, count);
1926
1927        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1928            (hdr->count >= (U32_MAX - hdr->start)) ||
1929            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1930                                VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1931                return -EINVAL;
1932
1933        if (data_size)
1934                *data_size = 0;
1935
1936        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1937                return -EINVAL;
1938
1939        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1940        case VFIO_IRQ_SET_DATA_NONE:
1941                size = 0;
1942                break;
1943        case VFIO_IRQ_SET_DATA_BOOL:
1944                size = sizeof(uint8_t);
1945                break;
1946        case VFIO_IRQ_SET_DATA_EVENTFD:
1947                size = sizeof(int32_t);
1948                break;
1949        default:
1950                return -EINVAL;
1951        }
1952
1953        if (size) {
1954                if (hdr->argsz - minsz < hdr->count * size)
1955                        return -EINVAL;
1956
1957                if (!data_size)
1958                        return -EINVAL;
1959
1960                *data_size = hdr->count * size;
1961        }
1962
1963        return 0;
1964}
1965EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1966
1967/*
1968 * Pin a set of guest PFNs and return their associated host PFNs for local
1969 * domain only.
1970 * @dev [in]     : device
1971 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1972 * @npage [in]   : count of elements in user_pfn array.  This count should not
1973 *                 be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1974 * @prot [in]    : protection flags
1975 * @phys_pfn[out]: array of host PFNs
1976 * Return error or number of pages pinned.
1977 */
1978int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1979                   int prot, unsigned long *phys_pfn)
1980{
1981        struct vfio_container *container;
1982        struct vfio_group *group;
1983        struct vfio_iommu_driver *driver;
1984        int ret;
1985
1986        if (!dev || !user_pfn || !phys_pfn || !npage)
1987                return -EINVAL;
1988
1989        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1990                return -E2BIG;
1991
1992        group = vfio_group_get_from_dev(dev);
1993        if (!group)
1994                return -ENODEV;
1995
1996        ret = vfio_group_add_container_user(group);
1997        if (ret)
1998                goto err_pin_pages;
1999
2000        container = group->container;

2001        driver = container->iommu_driver;
2002        if (likely(driver && driver->ops->pin_pages))
2003                ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
2004                                             npage, prot, phys_pfn);
2005        else
2006                ret = -ENOTTY;
2007
2008        vfio_group_try_dissolve_container(group);
2009
2010err_pin_pages:
2011        vfio_group_put(group);
2012        return ret;
2013}
2014EXPORT_SYMBOL(vfio_pin_pages);
2015
2016/*
2017 * Unpin set of host PFNs for local domain only.
2018 * @dev [in]     : device
2019 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2020 *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2021 * @npage [in]   : count of elements in user_pfn array.  This count should not
2022 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2023 * Return error or number of pages unpinned.
2024 */
2025int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2026{
2027        struct vfio_container *container;
2028        struct vfio_group *group;
2029        struct vfio_iommu_driver *driver;
2030        int ret;
2031
2032        if (!dev || !user_pfn || !npage)
2033                return -EINVAL;
2034
2035        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2036                return -E2BIG;
2037
2038        group = vfio_group_get_from_dev(dev);
2039        if (!group)
2040                return -ENODEV;
2041
2042        ret = vfio_group_add_container_user(group);
2043        if (ret)
2044                goto err_unpin_pages;
2045
2046        container = group->container;
2047        driver = container->iommu_driver;
2048        if (likely(driver && driver->ops->unpin_pages))
2049                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2050                                               npage);
2051        else
2052                ret = -ENOTTY;
2053
2054        vfio_group_try_dissolve_container(group);
2055
2056err_unpin_pages:
2057        vfio_group_put(group);
2058        return ret;
2059}
2060EXPORT_SYMBOL(vfio_unpin_pages);
2061
2062static int vfio_register_iommu_notifier(struct vfio_group *group,
2063                                        unsigned long *events,
2064                                        struct notifier_block *nb)
2065{
2066        struct vfio_container *container;
2067        struct vfio_iommu_driver *driver;
2068        int ret;
2069
2070        ret = vfio_group_add_container_user(group);
2071        if (ret)
2072                return -EINVAL;
2073
2074        container = group->container;
2075        driver = container->iommu_driver;
2076        if (likely(driver && driver->ops->register_notifier))
2077                ret = driver->ops->register_notifier(container->iommu_data,
2078                                                     events, nb);
2079        else
2080                ret = -ENOTTY;
2081
2082        vfio_group_try_dissolve_container(group);
2083
2084        return ret;
2085}
2086
2087static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2088                                          struct notifier_block *nb)
2089{
2090        struct vfio_container *container;
2091        struct vfio_iommu_driver *driver;
2092        int ret;
2093
2094        ret = vfio_group_add_container_user(group);
2095        if (ret)
2096                return -EINVAL;
2097
2098        container = group->container;
2099        driver = container->iommu_driver;
2100        if (likely(driver && driver->ops->unregister_notifier))
2101                ret = driver->ops->unregister_notifier(container->iommu_data,
2102                                                       nb);
2103        else
2104                ret = -ENOTTY;
2105
2106        vfio_group_try_dissolve_container(group);
2107
2108        return ret;
2109}
2110
2111void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2112{
2113        group->kvm = kvm;
2114        blocking_notifier_call_chain(&group->notifier,
2115                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2116}
2117EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2118
2119static int vfio_register_group_notifier(struct vfio_group *group,
2120                                        unsigned long *events,
2121                                        struct notifier_block *nb)
2122{
2123        int ret;
2124        bool set_kvm = false;
2125
2126        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2127                set_kvm = true;
2128
2129        /* clear known events */
2130        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2131
2132        /* refuse to continue if still events remaining */
2133        if (*events)
2134                return -EINVAL;
2135
2136        ret = vfio_group_add_container_user(group);
2137        if (ret)
2138                return -EINVAL;
2139
2140        ret = blocking_notifier_chain_register(&group->notifier, nb);
2141
2142        /*
2143         * The attaching of kvm and vfio_group might already happen, so
2144         * here we replay once upon registration.
2145         */
2146        if (!ret && set_kvm && group->kvm)
2147                blocking_notifier_call_chain(&group->notifier,
2148                                        VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2149
2150        vfio_group_try_dissolve_container(group);
2151
2152        return ret;
2153}
2154
2155static int vfio_unregister_group_notifier(struct vfio_group *group,
2156                                         struct notifier_block *nb)
2157{
2158        int ret;
2159
2160        ret = vfio_group_add_container_user(group);
2161        if (ret)
2162                return -EINVAL;
2163
2164        ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2165
2166        vfio_group_try_dissolve_container(group);
2167
2168        return ret;
2169}
2170
2171int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2172                           unsigned long *events, struct notifier_block *nb)
2173{
2174        struct vfio_group *group;
2175        int ret;
2176
2177        if (!dev || !nb || !events || (*events == 0))
2178                return -EINVAL;
2179
2180        group = vfio_group_get_from_dev(dev);
2181        if (!group)
2182                return -ENODEV;
2183
2184        switch (type) {
2185        case VFIO_IOMMU_NOTIFY:
2186                ret = vfio_register_iommu_notifier(group, events, nb);
2187                break;
2188        case VFIO_GROUP_NOTIFY:
2189                ret = vfio_register_group_notifier(group, events, nb);
2190                break;
2191        default:
2192                ret = -EINVAL;
2193        }
2194
2195        vfio_group_put(group);
2196        return ret;
2197}
2198EXPORT_SYMBOL(vfio_register_notifier);
2199
2200int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2201                             struct notifier_block *nb)
2202{
2203        struct vfio_group *group;
2204        int ret;
2205
2206        if (!dev || !nb)
2207                return -EINVAL;
2208
2209        group = vfio_group_get_from_dev(dev);
2210        if (!group)
2211                return -ENODEV;
2212
2213        switch (type) {
2214        case VFIO_IOMMU_NOTIFY:
2215                ret = vfio_unregister_iommu_notifier(group, nb);
2216                break;
2217        case VFIO_GROUP_NOTIFY:
2218                ret = vfio_unregister_group_notifier(group, nb);
2219                break;
2220        default:
2221                ret = -EINVAL;
2222        }
2223
2224        vfio_group_put(group);
2225        return ret;
2226}
2227EXPORT_SYMBOL(vfio_unregister_notifier);
2228
2229/**
2230 * Module/class support
2231 */
2232static char *vfio_devnode(struct device *dev, umode_t *mode)
2233{
2234        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2235}
2236
2237static struct miscdevice vfio_dev = {
2238        .minor = VFIO_MINOR,
2239        .name = "vfio",
2240        .fops = &vfio_fops,
2241        .nodename = "vfio/vfio",
2242        .mode = S_IRUGO | S_IWUGO,
2243};
2244
2245static int __init vfio_init(void)
2246{
2247        int ret;
2248
2249        idr_init(&vfio.group_idr);
2250        mutex_init(&vfio.group_lock);
2251        mutex_init(&vfio.iommu_drivers_lock);
2252        INIT_LIST_HEAD(&vfio.group_list);
2253        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2254        init_waitqueue_head(&vfio.release_q);
2255
2256        ret = misc_register(&vfio_dev);
2257        if (ret) {
2258                pr_err("vfio: misc device register failed\n");
2259                return ret;
2260        }
2261
2262        /* /dev/vfio/$GROUP */
2263        vfio.class = class_create(THIS_MODULE, "vfio");
2264        if (IS_ERR(vfio.class)) {
2265                ret = PTR_ERR(vfio.class);
2266                goto err_class;
2267        }
2268
2269        vfio.class->devnode = vfio_devnode;
2270
2271        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2272        if (ret)
2273                goto err_alloc_chrdev;
2274
2275        cdev_init(&vfio.group_cdev, &vfio_group_fops);
2276        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2277        if (ret)
2278                goto err_cdev_add;
2279
2280        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2281
2282#ifdef CONFIG_VFIO_NOIOMMU
2283        vfio_register_iommu_driver(&vfio_noiommu_ops);
2284#endif
2285        return 0;
2286
2287err_cdev_add:
2288        unregister_chrdev_region(vfio.group_devt, MINORMASK);
2289err_alloc_chrdev:
2290        class_destroy(vfio.class);
2291        vfio.class = NULL;
2292err_class:
2293        misc_deregister(&vfio_dev);
2294        return ret;
2295}
2296
2297static void __exit vfio_cleanup(void)
2298{
2299        WARN_ON(!list_empty(&vfio.group_list));
2300
2301#ifdef CONFIG_VFIO_NOIOMMU
2302        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2303#endif
2304        idr_destroy(&vfio.group_idr);
2305        cdev_del(&vfio.group_cdev);
2306        unregister_chrdev_region(vfio.group_devt, MINORMASK);
2307        class_destroy(vfio.class);
2308        vfio.class = NULL;
2309        misc_deregister(&vfio_dev);
2310}
2311
2312module_init(vfio_init);
2313module_exit(vfio_cleanup);
2314
2315MODULE_VERSION(DRIVER_VERSION);
2316MODULE_LICENSE("GPL v2");
2317MODULE_AUTHOR(DRIVER_AUTHOR);
2318MODULE_DESCRIPTION(DRIVER_DESC);
2319MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2320MODULE_ALIAS("devname:vfio/vfio");
2321MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2322