linux/drivers/vfio/vfio.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/file.h>
  17#include <linux/anon_inodes.h>
  18#include <linux/fs.h>
  19#include <linux/idr.h>
  20#include <linux/iommu.h>
  21#include <linux/list.h>
  22#include <linux/miscdevice.h>
  23#include <linux/module.h>
  24#include <linux/mutex.h>
  25#include <linux/pci.h>
  26#include <linux/rwsem.h>
  27#include <linux/sched.h>
  28#include <linux/slab.h>
  29#include <linux/stat.h>
  30#include <linux/string.h>
  31#include <linux/uaccess.h>
  32#include <linux/vfio.h>
  33#include <linux/wait.h>
  34#include <linux/sched/signal.h>
  35
  36#define DRIVER_VERSION  "0.3"
  37#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  38#define DRIVER_DESC     "VFIO - User Level meta-driver"
  39
  40static struct vfio {
  41        struct class                    *class;
  42        struct list_head                iommu_drivers_list;
  43        struct mutex                    iommu_drivers_lock;
  44        struct list_head                group_list;
  45        struct idr                      group_idr;
  46        struct mutex                    group_lock;
  47        struct cdev                     group_cdev;
  48        dev_t                           group_devt;
  49        wait_queue_head_t               release_q;
  50} vfio;
  51
  52struct vfio_iommu_driver {
  53        const struct vfio_iommu_driver_ops      *ops;
  54        struct list_head                        vfio_next;
  55};
  56
  57struct vfio_container {
  58        struct kref                     kref;
  59        struct list_head                group_list;
  60        struct rw_semaphore             group_lock;
  61        struct vfio_iommu_driver        *iommu_driver;
  62        void                            *iommu_data;
  63        bool                            noiommu;
  64};
  65
  66struct vfio_unbound_dev {
  67        struct device                   *dev;
  68        struct list_head                unbound_next;
  69};
  70
  71struct vfio_group {
  72        struct kref                     kref;
  73        int                             minor;
  74        atomic_t                        container_users;
  75        struct iommu_group              *iommu_group;
  76        struct vfio_container           *container;
  77        struct list_head                device_list;
  78        struct mutex                    device_lock;
  79        struct device                   *dev;
  80        struct notifier_block           nb;
  81        struct list_head                vfio_next;
  82        struct list_head                container_next;
  83        struct list_head                unbound_list;
  84        struct mutex                    unbound_lock;
  85        atomic_t                        opened;
  86        wait_queue_head_t               container_q;
  87        bool                            noiommu;
  88        unsigned int                    dev_counter;
  89        struct kvm                      *kvm;
  90        struct blocking_notifier_head   notifier;
  91};
  92
  93struct vfio_device {
  94        struct kref                     kref;
  95        struct device                   *dev;
  96        const struct vfio_device_ops    *ops;
  97        struct vfio_group               *group;
  98        struct list_head                group_next;
  99        void                            *device_data;
 100};
 101
 102#ifdef CONFIG_VFIO_NOIOMMU
 103static bool noiommu __read_mostly;
 104module_param_named(enable_unsafe_noiommu_mode,
 105                   noiommu, bool, S_IRUGO | S_IWUSR);
 106MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 107#endif
 108
 109/*
 110 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 111 * and remove functions; any use case other than acquiring the first
 112 * reference for the purpose of calling vfio_add_group_dev() or removing
 113 * that symmetric reference after vfio_del_group_dev() should use the raw
 114 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 115 * removes the device from the dummy group and cannot be nested.
 116 */
 117struct iommu_group *vfio_iommu_group_get(struct device *dev)
 118{
 119        struct iommu_group *group;
 120        int __maybe_unused ret;
 121
 122        group = iommu_group_get(dev);
 123
 124#ifdef CONFIG_VFIO_NOIOMMU
 125        /*
 126         * With noiommu enabled, an IOMMU group will be created for a device
 127         * that doesn't already have one and doesn't have iommu_ops on its
 128         * bus.  We set iommudata simply to be able to identify these groups
 129         * as special use and for reclamation later.
 130         */
 131        if (group || !noiommu || iommu_present(dev->bus))
 132                return group;
 133
 134        group = iommu_group_alloc();
 135        if (IS_ERR(group))
 136                return NULL;
 137
 138        iommu_group_set_name(group, "vfio-noiommu");
 139        iommu_group_set_iommudata(group, &noiommu, NULL);
 140        ret = iommu_group_add_device(group, dev);
 141        if (ret) {
 142                iommu_group_put(group);
 143                return NULL;
 144        }
 145
 146        /*
 147         * Where to taint?  At this point we've added an IOMMU group for a
 148         * device that is not backed by iommu_ops, therefore any iommu_
 149         * callback using iommu_ops can legitimately Oops.  So, while we may
 150         * be about to give a DMA capable device to a user without IOMMU
 151         * protection, which is clearly taint-worthy, let's go ahead and do
 152         * it here.
 153         */
 154        add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 155        dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 156#endif
 157
 158        return group;
 159}
 160EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 161
 162void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 163{
 164#ifdef CONFIG_VFIO_NOIOMMU
 165        if (iommu_group_get_iommudata(group) == &noiommu)
 166                iommu_group_remove_device(dev);
 167#endif
 168
 169        iommu_group_put(group);
 170}
 171EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 172
 173#ifdef CONFIG_VFIO_NOIOMMU
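    /*
     * Minimal "iommu" backend used only by noiommu groups: it provides no
     * DMA mapping at all, requires CAP_SYS_RAWIO to open, and will only
     * attach groups that were created by vfio_iommu_group_get() above.
     */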
 174static void *vfio_noiommu_open(unsigned long arg)
 175{
 176        if (arg != VFIO_NOIOMMU_IOMMU)
 177                return ERR_PTR(-EINVAL);
 178        if (!capable(CAP_SYS_RAWIO))
 179                return ERR_PTR(-EPERM);
 180
 181        return NULL;
 182}
 183
 184static void vfio_noiommu_release(void *iommu_data)
 185{
 186}
 187
 188static long vfio_noiommu_ioctl(void *iommu_data,
 189                               unsigned int cmd, unsigned long arg)
 190{
 191        if (cmd == VFIO_CHECK_EXTENSION)
 192                return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 193
 194        return -ENOTTY;
 195}
 196
 197static int vfio_noiommu_attach_group(void *iommu_data,
 198                                     struct iommu_group *iommu_group)
 199{
 200        return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 201}
 202
 203static void vfio_noiommu_detach_group(void *iommu_data,
 204                                      struct iommu_group *iommu_group)
 205{
 206}
 207
 208static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 209        .name = "vfio-noiommu",
 210        .owner = THIS_MODULE,
 211        .open = vfio_noiommu_open,
 212        .release = vfio_noiommu_release,
 213        .ioctl = vfio_noiommu_ioctl,
 214        .attach_group = vfio_noiommu_attach_group,
 215        .detach_group = vfio_noiommu_detach_group,
 216};
 217#endif
 218
 219
 220/*
 221 * IOMMU driver registration
 222 */
 223int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 224{
 225        struct vfio_iommu_driver *driver, *tmp;
 226
 227        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 228        if (!driver)
 229                return -ENOMEM;
 230
 231        driver->ops = ops;
 232
 233        mutex_lock(&vfio.iommu_drivers_lock);
 234
 235        /* Check for duplicates */
 236        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 237                if (tmp->ops == ops) {
 238                        mutex_unlock(&vfio.iommu_drivers_lock);
 239                        kfree(driver);
 240                        return -EINVAL;
 241                }
 242        }
 243
 244        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 245
 246        mutex_unlock(&vfio.iommu_drivers_lock);
 247
 248        return 0;
 249}
 250EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 251
 252void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 253{
 254        struct vfio_iommu_driver *driver;
 255
 256        mutex_lock(&vfio.iommu_drivers_lock);
 257        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 258                if (driver->ops == ops) {
 259                        list_del(&driver->vfio_next);
 260                        mutex_unlock(&vfio.iommu_drivers_lock);
 261                        kfree(driver);
 262                        return;
 263                }
 264        }
 265        mutex_unlock(&vfio.iommu_drivers_lock);
 266}
 267EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 268
 269/*
 270 * Group minor allocation/free - both called with vfio.group_lock held
 271 */
 272static int vfio_alloc_group_minor(struct vfio_group *group)
 273{
 274        return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 275}
 276
 277static void vfio_free_group_minor(int minor)
 278{
 279        idr_remove(&vfio.group_idr, minor);
 280}
 281
 282static int vfio_iommu_group_notifier(struct notifier_block *nb,
 283                                     unsigned long action, void *data);
 284static void vfio_group_get(struct vfio_group *group);
 285
 286/*
 287 * Container objects - containers are created when /dev/vfio/vfio is
 288 * opened, but their lifecycle extends until the last user is done, so
 289 * they're freed via kref.  Must support container/group/device being
 290 * closed in any order.
 291 */
 292static void vfio_container_get(struct vfio_container *container)
 293{
 294        kref_get(&container->kref);
 295}
 296
 297static void vfio_container_release(struct kref *kref)
 298{
 299        struct vfio_container *container;
 300        container = container_of(kref, struct vfio_container, kref);
 301
 302        kfree(container);
 303}
 304
 305static void vfio_container_put(struct vfio_container *container)
 306{
 307        kref_put(&container->kref, vfio_container_release);
 308}
 309
 310static void vfio_group_unlock_and_free(struct vfio_group *group)
 311{
 312        mutex_unlock(&vfio.group_lock);
 313        /*
 314         * Unregister outside of lock.  A spurious callback is harmless now
 315         * that the group is no longer in vfio.group_list.
 316         */
 317        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 318        kfree(group);
 319}
 320
 321/*
 322 * Group objects - create, release, get, put, search
 323 */
 324static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 325{
 326        struct vfio_group *group, *tmp;
 327        struct device *dev;
 328        int ret, minor;
 329
 330        group = kzalloc(sizeof(*group), GFP_KERNEL);
 331        if (!group)
 332                return ERR_PTR(-ENOMEM);
 333
 334        kref_init(&group->kref);
 335        INIT_LIST_HEAD(&group->device_list);
 336        mutex_init(&group->device_lock);
 337        INIT_LIST_HEAD(&group->unbound_list);
 338        mutex_init(&group->unbound_lock);
 339        atomic_set(&group->container_users, 0);
 340        atomic_set(&group->opened, 0);
 341        init_waitqueue_head(&group->container_q);
 342        group->iommu_group = iommu_group;
 343#ifdef CONFIG_VFIO_NOIOMMU
 344        group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 345#endif
 346        BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 347
 348        group->nb.notifier_call = vfio_iommu_group_notifier;
 349
 350        /*
 351         * blocking notifiers acquire a rwsem around registering and hold
 352 * it around the callback.  Therefore, we need to register outside of
 353         * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 354         * do anything unless it can find the group in vfio.group_list, so
 355         * no harm in registering early.
 356         */
 357        ret = iommu_group_register_notifier(iommu_group, &group->nb);
 358        if (ret) {
 359                kfree(group);
 360                return ERR_PTR(ret);
 361        }
 362
 363        mutex_lock(&vfio.group_lock);
 364
 365        /* Did we race creating this group? */
 366        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 367                if (tmp->iommu_group == iommu_group) {
 368                        vfio_group_get(tmp);
 369                        vfio_group_unlock_and_free(group);
 370                        return tmp;
 371                }
 372        }
 373
 374        minor = vfio_alloc_group_minor(group);
 375        if (minor < 0) {
 376                vfio_group_unlock_and_free(group);
 377                return ERR_PTR(minor);
 378        }
 379
 380        dev = device_create(vfio.class, NULL,
 381                            MKDEV(MAJOR(vfio.group_devt), minor),
 382                            group, "%s%d", group->noiommu ? "noiommu-" : "",
 383                            iommu_group_id(iommu_group));
 384        if (IS_ERR(dev)) {
 385                vfio_free_group_minor(minor);
 386                vfio_group_unlock_and_free(group);
 387                return ERR_CAST(dev);
 388        }
 389
 390        group->minor = minor;
 391        group->dev = dev;
 392
 393        list_add(&group->vfio_next, &vfio.group_list);
 394
 395        mutex_unlock(&vfio.group_lock);
 396
 397        return group;
 398}
 399
 400/* called with vfio.group_lock held */
 401static void vfio_group_release(struct kref *kref)
 402{
 403        struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 404        struct vfio_unbound_dev *unbound, *tmp;
 405        struct iommu_group *iommu_group = group->iommu_group;
 406
 407        WARN_ON(!list_empty(&group->device_list));
 408        WARN_ON(group->notifier.head);
 409
 410        list_for_each_entry_safe(unbound, tmp,
 411                                 &group->unbound_list, unbound_next) {
 412                list_del(&unbound->unbound_next);
 413                kfree(unbound);
 414        }
 415
 416        device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 417        list_del(&group->vfio_next);
 418        vfio_free_group_minor(group->minor);
 419        vfio_group_unlock_and_free(group);
 420        iommu_group_put(iommu_group);
 421}
 422
 423static void vfio_group_put(struct vfio_group *group)
 424{
 425        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 426}
 427
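    /*
     * Deferred group put: drop a group reference from a workqueue.  Used
     * where the caller cannot release the reference directly because doing
     * so could unregister the iommu group notifier while the notifier's
     * rwsem is held (see vfio_iommu_group_notifier() below).
     */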
 428struct vfio_group_put_work {
 429        struct work_struct work;
 430        struct vfio_group *group;
 431};
 432
 433static void vfio_group_put_bg(struct work_struct *work)
 434{
 435        struct vfio_group_put_work *do_work;
 436
 437        do_work = container_of(work, struct vfio_group_put_work, work);
 438
 439        vfio_group_put(do_work->group);
 440        kfree(do_work);
 441}
 442
 443static void vfio_group_schedule_put(struct vfio_group *group)
 444{
 445        struct vfio_group_put_work *do_work;
 446
 447        do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 448        if (WARN_ON(!do_work))
 449                return;
 450
 451        INIT_WORK(&do_work->work, vfio_group_put_bg);
 452        do_work->group = group;
 453        schedule_work(&do_work->work);
 454}
 455
 456/* Assume group_lock or group reference is held */
 457static void vfio_group_get(struct vfio_group *group)
 458{
 459        kref_get(&group->kref);
 460}
 461
 462/*
 463 * Not really a try as we will sleep on the mutex, but we need to make
 464 * sure the group pointer is valid under lock and get a reference.
 465 */
 466static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 467{
 468        struct vfio_group *target = group;
 469
 470        mutex_lock(&vfio.group_lock);
 471        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 472                if (group == target) {
 473                        vfio_group_get(group);
 474                        mutex_unlock(&vfio.group_lock);
 475                        return group;
 476                }
 477        }
 478        mutex_unlock(&vfio.group_lock);
 479
 480        return NULL;
 481}
 482
 483static
 484struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 485{
 486        struct vfio_group *group;
 487
 488        mutex_lock(&vfio.group_lock);
 489        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 490                if (group->iommu_group == iommu_group) {
 491                        vfio_group_get(group);
 492                        mutex_unlock(&vfio.group_lock);
 493                        return group;
 494                }
 495        }
 496        mutex_unlock(&vfio.group_lock);
 497
 498        return NULL;
 499}
 500
 501static struct vfio_group *vfio_group_get_from_minor(int minor)
 502{
 503        struct vfio_group *group;
 504
 505        mutex_lock(&vfio.group_lock);
 506        group = idr_find(&vfio.group_idr, minor);
 507        if (!group) {
 508                mutex_unlock(&vfio.group_lock);
 509                return NULL;
 510        }
 511        vfio_group_get(group);
 512        mutex_unlock(&vfio.group_lock);
 513
 514        return group;
 515}
 516
 517static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 518{
 519        struct iommu_group *iommu_group;
 520        struct vfio_group *group;
 521
 522        iommu_group = iommu_group_get(dev);
 523        if (!iommu_group)
 524                return NULL;
 525
 526        group = vfio_group_get_from_iommu(iommu_group);
 527        iommu_group_put(iommu_group);
 528
 529        return group;
 530}
 531
 532/*
 533 * Device objects - create, release, get, put, search
 534 */
 535static
 536struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 537                                             struct device *dev,
 538                                             const struct vfio_device_ops *ops,
 539                                             void *device_data)
 540{
 541        struct vfio_device *device;
 542
 543        device = kzalloc(sizeof(*device), GFP_KERNEL);
 544        if (!device)
 545                return ERR_PTR(-ENOMEM);
 546
 547        kref_init(&device->kref);
 548        device->dev = dev;
 549        device->group = group;
 550        device->ops = ops;
 551        device->device_data = device_data;
 552        dev_set_drvdata(dev, device);
 553
 554        /* No need to get group_lock, caller has group reference */
 555        vfio_group_get(group);
 556
 557        mutex_lock(&group->device_lock);
 558        list_add(&device->group_next, &group->device_list);
 559        group->dev_counter++;
 560        mutex_unlock(&group->device_lock);
 561
 562        return device;
 563}
 564
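    /*
     * Called via kref_put_mutex() with group->device_lock held; removes
     * the device from the group list and drops the lock before freeing.
     */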
 565static void vfio_device_release(struct kref *kref)
 566{
 567        struct vfio_device *device = container_of(kref,
 568                                                  struct vfio_device, kref);
 569        struct vfio_group *group = device->group;
 570
 571        list_del(&device->group_next);
 572        group->dev_counter--;
 573        mutex_unlock(&group->device_lock);
 574
 575        dev_set_drvdata(device->dev, NULL);
 576
 577        kfree(device);
 578
 579        /* vfio_del_group_dev may be waiting for this device */
 580        wake_up(&vfio.release_q);
 581}
 582
 583/* Device reference always implies a group reference */
 584void vfio_device_put(struct vfio_device *device)
 585{
 586        struct vfio_group *group = device->group;
 587        kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 588        vfio_group_put(group);
 589}
 590EXPORT_SYMBOL_GPL(vfio_device_put);
 591
 592static void vfio_device_get(struct vfio_device *device)
 593{
 594        vfio_group_get(device->group);
 595        kref_get(&device->kref);
 596}
 597
 598static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 599                                                 struct device *dev)
 600{
 601        struct vfio_device *device;
 602
 603        mutex_lock(&group->device_lock);
 604        list_for_each_entry(device, &group->device_list, group_next) {
 605                if (device->dev == dev) {
 606                        vfio_device_get(device);
 607                        mutex_unlock(&group->device_lock);
 608                        return device;
 609                }
 610        }
 611        mutex_unlock(&group->device_lock);
 612        return NULL;
 613}
 614
 615/*
 616 * Some drivers, like pci-stub, are only used to prevent other drivers from
 617 * claiming a device and are therefore perfectly legitimate for a user owned
 618 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 619 * of the device, but it does prevent the user from having direct access to
 620 * the device, which is useful in some circumstances.
 621 *
 622 * We also assume that we can include PCI interconnect devices, i.e. bridges.
 623 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 624 * then all of the downstream devices will be part of the same IOMMU group as
 625 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 626 * breaks anything, it only does so for user owned devices downstream.  Note
 627 * that error notification via MSI can be affected for platforms that handle
 628 * MSI within the same IOVA space as DMA.
 629 */
 630static const char * const vfio_driver_allowed[] = { "pci-stub" };
 631
 632static bool vfio_dev_driver_allowed(struct device *dev,
 633                                    struct device_driver *drv)
 634{
 635        if (dev_is_pci(dev)) {
 636                struct pci_dev *pdev = to_pci_dev(dev);
 637
 638                if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 639                        return true;
 640        }
 641
 642        return match_string(vfio_driver_allowed,
 643                            ARRAY_SIZE(vfio_driver_allowed),
 644                            drv->name) >= 0;
 645}
 646
 647/*
 648 * A vfio group is viable for use by userspace if all devices are in
 649 * one of the following states:
 650 *  - driver-less
 651 *  - bound to a vfio driver
 652 *  - bound to an otherwise allowed driver
 653 *  - a PCI interconnect device
 654 *
 655 * We use two methods to determine whether a device is bound to a vfio
 656 * driver.  The first is to test whether the device exists in the vfio
 657 * group.  The second is to test if the device exists on the group
 658 * unbound_list, indicating it's in the middle of transitioning from
 659 * a vfio driver to driver-less.
 660 */
 661static int vfio_dev_viable(struct device *dev, void *data)
 662{
 663        struct vfio_group *group = data;
 664        struct vfio_device *device;
 665        struct device_driver *drv = READ_ONCE(dev->driver);
 666        struct vfio_unbound_dev *unbound;
 667        int ret = -EINVAL;
 668
 669        mutex_lock(&group->unbound_lock);
 670        list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 671                if (dev == unbound->dev) {
 672                        ret = 0;
 673                        break;
 674                }
 675        }
 676        mutex_unlock(&group->unbound_lock);
 677
 678        if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
 679                return 0;
 680
 681        device = vfio_group_get_device(group, dev);
 682        if (device) {
 683                vfio_device_put(device);
 684                return 0;
 685        }
 686
 687        return ret;
 688}
 689
 690/*
 691 * Async device support
 692 */
 693static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 694{
 695        struct vfio_device *device;
 696
 697        /* Do we already know about it?  We shouldn't */
 698        device = vfio_group_get_device(group, dev);
 699        if (WARN_ON_ONCE(device)) {
 700                vfio_device_put(device);
 701                return 0;
 702        }
 703
 704        /* Nothing to do for idle groups */
 705        if (!atomic_read(&group->container_users))
 706                return 0;
 707
 708        /* TODO Prevent device auto probing */
 709        dev_WARN(dev, "Device added to live group %d!\n",
 710                 iommu_group_id(group->iommu_group));
 711
 712        return 0;
 713}
 714
 715static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 716{
 717        /* We don't care what happens when the group isn't in use */
 718        if (!atomic_read(&group->container_users))
 719                return 0;
 720
 721        return vfio_dev_viable(dev, group);
 722}
 723
 724static int vfio_iommu_group_notifier(struct notifier_block *nb,
 725                                     unsigned long action, void *data)
 726{
 727        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 728        struct device *dev = data;
 729        struct vfio_unbound_dev *unbound;
 730
 731        /*
 732         * Need to go through a group_lock lookup to get a reference or we
 733         * risk racing a group being removed.  Ignore spurious notifies.
 734         */
 735        group = vfio_group_try_get(group);
 736        if (!group)
 737                return NOTIFY_OK;
 738
 739        switch (action) {
 740        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 741                vfio_group_nb_add_dev(group, dev);
 742                break;
 743        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 744                /*
 745                 * Nothing to do here.  If the device is in use, then the
 746                 * vfio sub-driver should block the remove callback until
 747                 * it is unused.  If the device is unused or attached to a
 748                 * stub driver, then it should be released and we don't
 749                 * care that it will be going away.
 750                 */
 751                break;
 752        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 753                dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 754                        iommu_group_id(group->iommu_group));
 755                break;
 756        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 757                dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 758                        iommu_group_id(group->iommu_group), dev->driver->name);
 759                BUG_ON(vfio_group_nb_verify(group, dev));
 760                break;
 761        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 762                dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 763                        __func__, iommu_group_id(group->iommu_group),
 764                        dev->driver->name);
 765                break;
 766        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 767                dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 768                        iommu_group_id(group->iommu_group));
 769                /*
 770                 * XXX An unbound device in a live group is ok, but we'd
 771                 * really like to avoid the above BUG_ON by preventing other
 772                 * drivers from binding to it.  Once that occurs, we have to
 773                 * stop the system to maintain isolation.  At a minimum, we'd
 774                 * want a toggle to disable driver auto probe for this device.
 775                 */
 776
 777                mutex_lock(&group->unbound_lock);
 778                list_for_each_entry(unbound,
 779                                    &group->unbound_list, unbound_next) {
 780                        if (dev == unbound->dev) {
 781                                list_del(&unbound->unbound_next);
 782                                kfree(unbound);
 783                                break;
 784                        }
 785                }
 786                mutex_unlock(&group->unbound_lock);
 787                break;
 788        }
 789
 790        /*
 791         * If we're the last reference to the group, the group will be
 792         * released, which includes unregistering the iommu group notifier.
 793         * We hold a read-lock on that notifier list, unregistering needs
 794         * a write-lock... deadlock.  Release our reference asynchronously
 795         * to avoid that situation.
 796         */
 797        vfio_group_schedule_put(group);
 798        return NOTIFY_OK;
 799}
 800
 801/*
 802 * VFIO driver API
 803 */
 804int vfio_add_group_dev(struct device *dev,
 805                       const struct vfio_device_ops *ops, void *device_data)
 806{
 807        struct iommu_group *iommu_group;
 808        struct vfio_group *group;
 809        struct vfio_device *device;
 810
 811        iommu_group = iommu_group_get(dev);
 812        if (!iommu_group)
 813                return -EINVAL;
 814
 815        group = vfio_group_get_from_iommu(iommu_group);
 816        if (!group) {
 817                group = vfio_create_group(iommu_group);
 818                if (IS_ERR(group)) {
 819                        iommu_group_put(iommu_group);
 820                        return PTR_ERR(group);
 821                }
 822        } else {
 823                /*
 824                 * A found vfio_group already holds a reference to the
 825                 * iommu_group.  A created vfio_group keeps the reference.
 826                 */
 827                iommu_group_put(iommu_group);
 828        }
 829
 830        device = vfio_group_get_device(group, dev);
 831        if (device) {
 832                dev_WARN(dev, "Device already exists on group %d\n",
 833                         iommu_group_id(iommu_group));
 834                vfio_device_put(device);
 835                vfio_group_put(group);
 836                return -EBUSY;
 837        }
 838
 839        device = vfio_group_create_device(group, dev, ops, device_data);
 840        if (IS_ERR(device)) {
 841                vfio_group_put(group);
 842                return PTR_ERR(device);
 843        }
 844
 845        /*
 846         * Drop all but the vfio_device reference.  The vfio_device holds
 847         * a reference to the vfio_group, which holds a reference to the
 848         * iommu_group.
 849         */
 850        vfio_group_put(group);
 851
 852        return 0;
 853}
 854EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 855
 856/*
 857 * Get a reference to the vfio_device for a device.  Even if the
 858 * caller thinks they own the device, they could be racing with a
 859 * release call path, so we can't trust drvdata for the shortcut.
 860 * Go the long way around, from the iommu_group to the vfio_group
 861 * to the vfio_device.
 862 */
 863struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 864{
 865        struct vfio_group *group;
 866        struct vfio_device *device;
 867
 868        group = vfio_group_get_from_dev(dev);
 869        if (!group)
 870                return NULL;
 871
 872        device = vfio_group_get_device(group, dev);
 873        vfio_group_put(group);
 874
 875        return device;
 876}
 877EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 878
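    /*
     * Look up a device in the group by name, preferring the vfio bus
     * driver's match() callback and falling back to a dev_name()
     * comparison.  Returns a referenced device on success, the match()
     * error as an ERR_PTR, or ERR_PTR(-ENODEV) if nothing matches.
     */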
 879static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 880                                                     char *buf)
 881{
 882        struct vfio_device *it, *device = ERR_PTR(-ENODEV);
 883
 884        mutex_lock(&group->device_lock);
 885        list_for_each_entry(it, &group->device_list, group_next) {
 886                int ret;
 887
 888                if (it->ops->match) {
 889                        ret = it->ops->match(it->device_data, buf);
 890                        if (ret < 0) {
 891                                device = ERR_PTR(ret);
 892                                break;
 893                        }
 894                } else {
 895                        ret = !strcmp(dev_name(it->dev), buf);
 896                }
 897
 898                if (ret) {
 899                        device = it;
 900                        vfio_device_get(device);
 901                        break;
 902                }
 903        }
 904        mutex_unlock(&group->device_lock);
 905
 906        return device;
 907}
 908
 909/*
 910 * Caller must hold a reference to the vfio_device
 911 */
 912void *vfio_device_data(struct vfio_device *device)
 913{
 914        return device->device_data;
 915}
 916EXPORT_SYMBOL_GPL(vfio_device_data);
 917
 918/*
 919 * Decrement the device reference count and wait for the device to be
 920 * removed.  Open file descriptors for the device... */
 921void *vfio_del_group_dev(struct device *dev)
 922{
 923        DEFINE_WAIT_FUNC(wait, woken_wake_function);
 924        struct vfio_device *device = dev_get_drvdata(dev);
 925        struct vfio_group *group = device->group;
 926        void *device_data = device->device_data;
 927        struct vfio_unbound_dev *unbound;
 928        unsigned int i = 0;
 929        bool interrupted = false;
 930
 931        /*
 932         * The group exists so long as we have a device reference.  Get
 933         * a group reference and use it to scan for the device going away.
 934         */
 935        vfio_group_get(group);
 936
 937        /*
 938         * When the device is removed from the group, the group suddenly
 939         * becomes non-viable; the device has a driver (until the unbind
 940         * completes), but it's not present in the group.  This is bad news
 941         * for any external users that need to re-acquire a group reference
 942         * in order to match and release their existing reference.  To
 943         * solve this, we track such devices on the unbound_list to bridge
 944         * the gap until they're fully unbound.
 945         */
 946        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 947        if (unbound) {
 948                unbound->dev = dev;
 949                mutex_lock(&group->unbound_lock);
 950                list_add(&unbound->unbound_next, &group->unbound_list);
 951                mutex_unlock(&group->unbound_lock);
 952        }
 953        WARN_ON(!unbound);
 954
 955        vfio_device_put(device);
 956
 957        /*
 958         * If the device is still present in the group after the above
 959         * 'put', then it is in use and we need to request it from the
 960         * bus driver.  The driver may in turn need to request the
 961         * device from the user.  We send the request on an arbitrary
 962 * interval with a counter to allow the driver to take escalating
 963         * measures to release the device if it has the ability to do so.
 964         */
 965        add_wait_queue(&vfio.release_q, &wait);
 966
 967        do {
 968                device = vfio_group_get_device(group, dev);
 969                if (!device)
 970                        break;
 971
 972                if (device->ops->request)
 973                        device->ops->request(device_data, i++);
 974
 975                vfio_device_put(device);
 976
 977                if (interrupted) {
 978                        wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
 979                } else {
 980                        wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
 981                        if (signal_pending(current)) {
 982                                interrupted = true;
 983                                dev_warn(dev,
 984                                         "Device is currently in use, task \"%s\" (%d) blocked until device is released\n",
 985                                         current->comm, task_pid_nr(current));
 988                        }
 989                }
 990
 991        } while (1);
 992
 993        remove_wait_queue(&vfio.release_q, &wait);
 994        /*
 995         * In order to support multiple devices per group, devices can be
 996         * plucked from the group while other devices in the group are still
 997         * in use.  The container persists with this group and those remaining
 998         * devices still attached.  If the user creates an isolation violation
 999         * by binding this device to another driver while the group is still in
1000         * use, that's their fault.  However, in the case of removing the last,
1001         * or potentially the only, device in the group there can be no other
1002         * in-use devices in the group.  The user has done their due diligence
1003         * and we should lay no claims to those devices.  In order to do that,
1004         * we need to make sure the group is detached from the container.
1005         * Without this stall, we're potentially racing with a user process
1006         * that may attempt to immediately bind this device to another driver.
1007         */
1008        if (list_empty(&group->device_list))
1009                wait_event(group->container_q, !group->container);
1010
1011        vfio_group_put(group);
1012
1013        return device_data;
1014}
1015EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1016
1017/*
1018 * VFIO base fd, /dev/vfio/vfio
1019 */
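    /*
     * Typical userspace flow against this interface, as a sketch only;
     * the group number and device name below are illustrative:
     *
     *   container = open("/dev/vfio/vfio", O_RDWR);
     *   ioctl(container, VFIO_GET_API_VERSION);        // VFIO_API_VERSION
     *   ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
     *   group = open("/dev/vfio/26", O_RDWR);
     *   ioctl(group, VFIO_GROUP_GET_STATUS, &status);  // expect VIABLE
     *   ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
     *   ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
     *   fd = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
     */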
1020static long vfio_ioctl_check_extension(struct vfio_container *container,
1021                                       unsigned long arg)
1022{
1023        struct vfio_iommu_driver *driver;
1024        long ret = 0;
1025
1026        down_read(&container->group_lock);
1027
1028        driver = container->iommu_driver;
1029
1030        switch (arg) {
1031                /* No base extensions yet */
1032        default:
1033                /*
1034                 * If no driver is set, poll all registered drivers for
1035                 * extensions and return the first positive result.  If
1036                 * a driver is already set, further queries will be passed
1037                 * only to that driver.
1038                 */
1039                if (!driver) {
1040                        mutex_lock(&vfio.iommu_drivers_lock);
1041                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
1042                                            vfio_next) {
1043
1044#ifdef CONFIG_VFIO_NOIOMMU
1045                                if (!list_empty(&container->group_list) &&
1046                                    (container->noiommu !=
1047                                     (driver->ops == &vfio_noiommu_ops)))
1048                                        continue;
1049#endif
1050
1051                                if (!try_module_get(driver->ops->owner))
1052                                        continue;
1053
1054                                ret = driver->ops->ioctl(NULL,
1055                                                         VFIO_CHECK_EXTENSION,
1056                                                         arg);
1057                                module_put(driver->ops->owner);
1058                                if (ret > 0)
1059                                        break;
1060                        }
1061                        mutex_unlock(&vfio.iommu_drivers_lock);
1062                } else
1063                        ret = driver->ops->ioctl(container->iommu_data,
1064                                                 VFIO_CHECK_EXTENSION, arg);
1065        }
1066
1067        up_read(&container->group_lock);
1068
1069        return ret;
1070}
1071
1072/* hold write lock on container->group_lock */
1073static int __vfio_container_attach_groups(struct vfio_container *container,
1074                                          struct vfio_iommu_driver *driver,
1075                                          void *data)
1076{
1077        struct vfio_group *group;
1078        int ret = -ENODEV;
1079
1080        list_for_each_entry(group, &container->group_list, container_next) {
1081                ret = driver->ops->attach_group(data, group->iommu_group);
1082                if (ret)
1083                        goto unwind;
1084        }
1085
1086        return ret;
1087
1088unwind:
1089        list_for_each_entry_continue_reverse(group, &container->group_list,
1090                                             container_next) {
1091                driver->ops->detach_group(data, group->iommu_group);
1092        }
1093
1094        return ret;
1095}
1096
1097static long vfio_ioctl_set_iommu(struct vfio_container *container,
1098                                 unsigned long arg)
1099{
1100        struct vfio_iommu_driver *driver;
1101        long ret = -ENODEV;
1102
1103        down_write(&container->group_lock);
1104
1105        /*
1106         * The container is designed to be an unprivileged interface while
1107         * the group can be assigned to specific users.  Therefore, only by
1108         * adding a group to a container does the user get the privilege of
1109         * enabling the iommu, which may allocate finite resources.  There
1110         * is no unset_iommu, but by removing all the groups from a container,
1111         * the container is deprivileged and returns to an unset state.
1112         */
1113        if (list_empty(&container->group_list) || container->iommu_driver) {
1114                up_write(&container->group_lock);
1115                return -EINVAL;
1116        }
1117
1118        mutex_lock(&vfio.iommu_drivers_lock);
1119        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1120                void *data;
1121
1122#ifdef CONFIG_VFIO_NOIOMMU
1123                /*
1124                 * Only noiommu containers can use vfio-noiommu and noiommu
1125                 * containers can only use vfio-noiommu.
1126                 */
1127                if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1128                        continue;
1129#endif
1130
1131                if (!try_module_get(driver->ops->owner))
1132                        continue;
1133
1134                /*
1135                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1136                 * so test which iommu driver reported support for this
1137                 * extension and call open on them.  We also pass them the
1138                 * magic, allowing a single driver to support multiple
1139                 * interfaces if they'd like.
1140                 */
1141                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1142                        module_put(driver->ops->owner);
1143                        continue;
1144                }
1145
1146                data = driver->ops->open(arg);
1147                if (IS_ERR(data)) {
1148                        ret = PTR_ERR(data);
1149                        module_put(driver->ops->owner);
1150                        continue;
1151                }
1152
1153                ret = __vfio_container_attach_groups(container, driver, data);
1154                if (ret) {
1155                        driver->ops->release(data);
1156                        module_put(driver->ops->owner);
1157                        continue;
1158                }
1159
1160                container->iommu_driver = driver;
1161                container->iommu_data = data;
1162                break;
1163        }
1164
1165        mutex_unlock(&vfio.iommu_drivers_lock);
1166        up_write(&container->group_lock);
1167
1168        return ret;
1169}
1170
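    /*
     * ioctls on the container fd: API version and extension checks work
     * before an iommu backend is set, VFIO_SET_IOMMU selects the backend,
     * and anything else is passed through to that backend once set.
     */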
1171static long vfio_fops_unl_ioctl(struct file *filep,
1172                                unsigned int cmd, unsigned long arg)
1173{
1174        struct vfio_container *container = filep->private_data;
1175        struct vfio_iommu_driver *driver;
1176        void *data;
1177        long ret = -EINVAL;
1178
1179        if (!container)
1180                return ret;
1181
1182        switch (cmd) {
1183        case VFIO_GET_API_VERSION:
1184                ret = VFIO_API_VERSION;
1185                break;
1186        case VFIO_CHECK_EXTENSION:
1187                ret = vfio_ioctl_check_extension(container, arg);
1188                break;
1189        case VFIO_SET_IOMMU:
1190                ret = vfio_ioctl_set_iommu(container, arg);
1191                break;
1192        default:
1193                driver = container->iommu_driver;
1194                data = container->iommu_data;
1195
1196                if (driver) /* passthrough all unrecognized ioctls */
1197                        ret = driver->ops->ioctl(data, cmd, arg);
1198        }
1199
1200        return ret;
1201}
1202
1203static int vfio_fops_open(struct inode *inode, struct file *filep)
1204{
1205        struct vfio_container *container;
1206
1207        container = kzalloc(sizeof(*container), GFP_KERNEL);
1208        if (!container)
1209                return -ENOMEM;
1210
1211        INIT_LIST_HEAD(&container->group_list);
1212        init_rwsem(&container->group_lock);
1213        kref_init(&container->kref);
1214
1215        filep->private_data = container;
1216
1217        return 0;
1218}
1219
1220static int vfio_fops_release(struct inode *inode, struct file *filep)
1221{
1222        struct vfio_container *container = filep->private_data;
1223
1224        filep->private_data = NULL;
1225
1226        vfio_container_put(container);
1227
1228        return 0;
1229}
1230
1231/*
1232 * Once an iommu driver is set, we optionally pass read/write/mmap
1233 * on to the driver, allowing management interfaces beyond ioctl.
1234 */
1235static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1236                              size_t count, loff_t *ppos)
1237{
1238        struct vfio_container *container = filep->private_data;
1239        struct vfio_iommu_driver *driver;
1240        ssize_t ret = -EINVAL;
1241
1242        driver = container->iommu_driver;
1243        if (likely(driver && driver->ops->read))
1244                ret = driver->ops->read(container->iommu_data,
1245                                        buf, count, ppos);
1246
1247        return ret;
1248}
1249
1250static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1251                               size_t count, loff_t *ppos)
1252{
1253        struct vfio_container *container = filep->private_data;
1254        struct vfio_iommu_driver *driver;
1255        ssize_t ret = -EINVAL;
1256
1257        driver = container->iommu_driver;
1258        if (likely(driver && driver->ops->write))
1259                ret = driver->ops->write(container->iommu_data,
1260                                         buf, count, ppos);
1261
1262        return ret;
1263}
1264
1265static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1266{
1267        struct vfio_container *container = filep->private_data;
1268        struct vfio_iommu_driver *driver;
1269        int ret = -EINVAL;
1270
1271        driver = container->iommu_driver;
1272        if (likely(driver && driver->ops->mmap))
1273                ret = driver->ops->mmap(container->iommu_data, vma);
1274
1275        return ret;
1276}
1277
1278static const struct file_operations vfio_fops = {
1279        .owner          = THIS_MODULE,
1280        .open           = vfio_fops_open,
1281        .release        = vfio_fops_release,
1282        .read           = vfio_fops_read,
1283        .write          = vfio_fops_write,
1284        .unlocked_ioctl = vfio_fops_unl_ioctl,
1285        .compat_ioctl   = compat_ptr_ioctl,
1286        .mmap           = vfio_fops_mmap,
1287};
1288
1289/*
1290 * VFIO Group fd, /dev/vfio/$GROUP
1291 */
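    /*
     * Detach the group from its container: the iommu driver is asked to
     * detach the iommu_group, and if this was the last group the driver
     * itself is released, deprivileging the container.  The group's
     * reference on the container is dropped last.
     */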
1292static void __vfio_group_unset_container(struct vfio_group *group)
1293{
1294        struct vfio_container *container = group->container;
1295        struct vfio_iommu_driver *driver;
1296
1297        down_write(&container->group_lock);
1298
1299        driver = container->iommu_driver;
1300        if (driver)
1301                driver->ops->detach_group(container->iommu_data,
1302                                          group->iommu_group);
1303
1304        group->container = NULL;
1305        wake_up(&group->container_q);
1306        list_del(&group->container_next);
1307
1308        /* Detaching the last group deprivileges a container, remove iommu */
1309        if (driver && list_empty(&container->group_list)) {
1310                driver->ops->release(container->iommu_data);
1311                module_put(driver->ops->owner);
1312                container->iommu_driver = NULL;
1313                container->iommu_data = NULL;
1314        }
1315
1316        up_write(&container->group_lock);
1317
1318        vfio_container_put(container);
1319}
1320
1321/*
1322 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1323 * if there was no container to unset.  Since the ioctl is called on
1324 * the group, we know the group still exists, therefore the only valid
1325 * transition here is 1->0.
1326 */
1327static int vfio_group_unset_container(struct vfio_group *group)
1328{
1329        int users = atomic_cmpxchg(&group->container_users, 1, 0);
1330
1331        if (!users)
1332                return -EINVAL;
1333        if (users != 1)
1334                return -EBUSY;
1335
1336        __vfio_group_unset_container(group);
1337
1338        return 0;
1339}
1340
1341/*
1342 * When removing container users, anything that removes the last user
1343 * implicitly removes the group from the container.  That is, once the
1344 * group file descriptor and all device file descriptors are closed,
1345 * the group is released from the container.
1346 */
1347static void vfio_group_try_dissolve_container(struct vfio_group *group)
1348{
1349        if (0 == atomic_dec_if_positive(&group->container_users))
1350                __vfio_group_unset_container(group);
1351}
1352
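    /*
     * VFIO_GROUP_SET_CONTAINER: verify that @container_fd is really a vfio
     * container, refuse to mix noiommu and regular groups, attach the
     * group to any iommu driver already set on the container, and take
     * the group's first container_users reference plus a container ref.
     */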
1353static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1354{
1355        struct fd f;
1356        struct vfio_container *container;
1357        struct vfio_iommu_driver *driver;
1358        int ret = 0;
1359
1360        if (atomic_read(&group->container_users))
1361                return -EINVAL;
1362
1363        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1364                return -EPERM;
1365
1366        f = fdget(container_fd);
1367        if (!f.file)
1368                return -EBADF;
1369
1370        /* Sanity check, is this really our fd? */
1371        if (f.file->f_op != &vfio_fops) {
1372                fdput(f);
1373                return -EINVAL;
1374        }
1375
1376        container = f.file->private_data;
1377        WARN_ON(!container); /* fget ensures we don't race vfio_release */
1378
1379        down_write(&container->group_lock);
1380
1381        /* Real groups and fake groups cannot mix */
1382        if (!list_empty(&container->group_list) &&
1383            container->noiommu != group->noiommu) {
1384                ret = -EPERM;
1385                goto unlock_out;
1386        }
1387
1388        driver = container->iommu_driver;
1389        if (driver) {
1390                ret = driver->ops->attach_group(container->iommu_data,
1391                                                group->iommu_group);
1392                if (ret)
1393                        goto unlock_out;
1394        }
1395
1396        group->container = container;
1397        container->noiommu = group->noiommu;
1398        list_add(&group->container_next, &container->group_list);
1399
1400        /* Get a reference on the container and mark a user within the group */
1401        vfio_container_get(container);
1402        atomic_inc(&group->container_users);
1403
1404unlock_out:
1405        up_write(&container->group_lock);
1406        fdput(f);
1407        return ret;
1408}
1409
1410static bool vfio_group_viable(struct vfio_group *group)
1411{
1412        return (iommu_group_for_each_dev(group->iommu_group,
1413                                         group, vfio_dev_viable) == 0);
1414}
1415
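    /*
     * Take an additional container_users reference, but only if the group
     * already has users, its container has an iommu driver set, and the
     * group is still viable.  Noiommu groups are always refused here.
     */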
1416static int vfio_group_add_container_user(struct vfio_group *group)
1417{
1418        if (!atomic_inc_not_zero(&group->container_users))
1419                return -EINVAL;
1420
1421        if (group->noiommu) {
1422                atomic_dec(&group->container_users);
1423                return -EPERM;
1424        }
1425        if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1426                atomic_dec(&group->container_users);
1427                return -EINVAL;
1428        }
1429
1430        return 0;
1431}
1432
1433static const struct file_operations vfio_device_fops;
1434
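    /*
     * VFIO_GROUP_GET_DEVICE_FD: look up the named device, open it through
     * its vfio bus driver, and return a new file descriptor wrapping it.
     * The new fd holds an extra container_users reference so the container
     * stays attached while device fds remain open.
     */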
1435static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1436{
1437        struct vfio_device *device;
1438        struct file *filep;
1439        int ret;
1440
1441        if (0 == atomic_read(&group->container_users) ||
1442            !group->container->iommu_driver || !vfio_group_viable(group))
1443                return -EINVAL;
1444
1445        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1446                return -EPERM;
1447
1448        device = vfio_device_get_from_name(group, buf);
1449        if (IS_ERR(device))
1450                return PTR_ERR(device);
1451
1452        ret = device->ops->open(device->device_data);
1453        if (ret) {
1454                vfio_device_put(device);
1455                return ret;
1456        }
1457
1458        /*
1459         * We can't use anon_inode_getfd() because we need to modify
1460         * the f_mode flags directly to allow more than just ioctls
1461         */
1462        ret = get_unused_fd_flags(O_CLOEXEC);
1463        if (ret < 0) {
1464                device->ops->release(device->device_data);
1465                vfio_device_put(device);
1466                return ret;
1467        }
1468
1469        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1470                                   device, O_RDWR);
1471        if (IS_ERR(filep)) {
1472                put_unused_fd(ret);
1473                ret = PTR_ERR(filep);
1474                device->ops->release(device->device_data);
1475                vfio_device_put(device);
1476                return ret;
1477        }
1478
1479        /*
1480         * TODO: add an anon_inode interface to do this.
1481         * Appears to be missing by lack of need rather than
1482         * explicitly prevented.  Now there's need.
1483         */
1484        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1485
1486        atomic_inc(&group->container_users);
1487
1488        fd_install(ret, filep);
1489
1490        if (group->noiommu)
1491                dev_warn(device->dev, "vfio-noiommu device opened by user (%s:%d)\n",
1492                         current->comm, task_pid_nr(current));
1493
1494        return ret;
1495}
1496
1497static long vfio_group_fops_unl_ioctl(struct file *filep,
1498                                      unsigned int cmd, unsigned long arg)
1499{
1500        struct vfio_group *group = filep->private_data;
1501        long ret = -ENOTTY;
1502
1503        switch (cmd) {
1504        case VFIO_GROUP_GET_STATUS:
1505        {
1506                struct vfio_group_status status;
1507                unsigned long minsz;
1508
1509                minsz = offsetofend(struct vfio_group_status, flags);
1510
1511                if (copy_from_user(&status, (void __user *)arg, minsz))
1512                        return -EFAULT;
1513
1514                if (status.argsz < minsz)
1515                        return -EINVAL;
1516
1517                status.flags = 0;
1518
1519                if (vfio_group_viable(group))
1520                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1521
1522                if (group->container)
1523                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1524
1525                if (copy_to_user((void __user *)arg, &status, minsz))
1526                        return -EFAULT;
1527
1528                ret = 0;
1529                break;
1530        }
1531        case VFIO_GROUP_SET_CONTAINER:
1532        {
1533                int fd;
1534
1535                if (get_user(fd, (int __user *)arg))
1536                        return -EFAULT;
1537
1538                if (fd < 0)
1539                        return -EINVAL;
1540
1541                ret = vfio_group_set_container(group, fd);
1542                break;
1543        }
1544        case VFIO_GROUP_UNSET_CONTAINER:
1545                ret = vfio_group_unset_container(group);
1546                break;
1547        case VFIO_GROUP_GET_DEVICE_FD:
1548        {
1549                char *buf;
1550
1551                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1552                if (IS_ERR(buf))
1553                        return PTR_ERR(buf);
1554
1555                ret = vfio_group_get_device_fd(group, buf);
1556                kfree(buf);
1557                break;
1558        }
1559        }
1560
1561        return ret;
1562}
1563
1564static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1565{
1566        struct vfio_group *group;
1567        int opened;
1568
1569        group = vfio_group_get_from_minor(iminor(inode));
1570        if (!group)
1571                return -ENODEV;
1572
1573        if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1574                vfio_group_put(group);
1575                return -EPERM;
1576        }
1577
1578        /* Do we need multiple instances of the group open?  Seems not. */
1579        opened = atomic_cmpxchg(&group->opened, 0, 1);
1580        if (opened) {
1581                vfio_group_put(group);
1582                return -EBUSY;
1583        }
1584
1585        /* Is something still in use from a previous open? */
1586        if (group->container) {
1587                atomic_dec(&group->opened);
1588                vfio_group_put(group);
1589                return -EBUSY;
1590        }
1591
1592        /* Warn if a previous user didn't clean up and re-init to drop them */
1593        if (WARN_ON(group->notifier.head))
1594                BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1595
1596        filep->private_data = group;
1597
1598        return 0;
1599}
1600
1601static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1602{
1603        struct vfio_group *group = filep->private_data;
1604
1605        filep->private_data = NULL;
1606
1607        vfio_group_try_dissolve_container(group);
1608
1609        atomic_dec(&group->opened);
1610
1611        vfio_group_put(group);
1612
1613        return 0;
1614}
1615
1616static const struct file_operations vfio_group_fops = {
1617        .owner          = THIS_MODULE,
1618        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1619        .compat_ioctl   = compat_ptr_ioctl,
1620        .open           = vfio_group_fops_open,
1621        .release        = vfio_group_fops_release,
1622};
1623
1624/**
1625 * VFIO Device fd
1626 */
1627static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1628{
1629        struct vfio_device *device = filep->private_data;
1630
1631        device->ops->release(device->device_data);
1632
1633        vfio_group_try_dissolve_container(device->group);
1634
1635        vfio_device_put(device);
1636
1637        return 0;
1638}
1639
1640static long vfio_device_fops_unl_ioctl(struct file *filep,
1641                                       unsigned int cmd, unsigned long arg)
1642{
1643        struct vfio_device *device = filep->private_data;
1644
1645        if (unlikely(!device->ops->ioctl))
1646                return -EINVAL;
1647
1648        return device->ops->ioctl(device->device_data, cmd, arg);
1649}
1650
1651static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1652                                     size_t count, loff_t *ppos)
1653{
1654        struct vfio_device *device = filep->private_data;
1655
1656        if (unlikely(!device->ops->read))
1657                return -EINVAL;
1658
1659        return device->ops->read(device->device_data, buf, count, ppos);
1660}
1661
1662static ssize_t vfio_device_fops_write(struct file *filep,
1663                                      const char __user *buf,
1664                                      size_t count, loff_t *ppos)
1665{
1666        struct vfio_device *device = filep->private_data;
1667
1668        if (unlikely(!device->ops->write))
1669                return -EINVAL;
1670
1671        return device->ops->write(device->device_data, buf, count, ppos);
1672}
1673
1674static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1675{
1676        struct vfio_device *device = filep->private_data;
1677
1678        if (unlikely(!device->ops->mmap))
1679                return -EINVAL;
1680
1681        return device->ops->mmap(device->device_data, vma);
1682}
1683
1684static const struct file_operations vfio_device_fops = {
1685        .owner          = THIS_MODULE,
1686        .release        = vfio_device_fops_release,
1687        .read           = vfio_device_fops_read,
1688        .write          = vfio_device_fops_write,
1689        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1690        .compat_ioctl   = compat_ptr_ioctl,
1691        .mmap           = vfio_device_fops_mmap,
1692};
1693
1694/**
1695 * External user API, exported as symbols to be linked dynamically.
1696 *
1697 * The protocol includes:
1698 *  1. Do the normal VFIO init operations:
1699 *      - opening a new container;
1700 *      - attaching group(s) to it;
1701 *      - setting an IOMMU driver for a container.
1702 * When IOMMU is set for a container, all groups in it are
1703 * considered ready to use by an external user.
1704 *
1705 * 2. User space passes a group fd to an external user.
1706 * The external user calls vfio_group_get_external_user()
1707 * to verify that:
1708 *      - the group is initialized;
1709 *      - IOMMU is set for it.
1710 * If both checks pass, vfio_group_get_external_user()
1711 * increments the container user counter to prevent
1712 * the VFIO group from being disposed of before KVM exits.
1713 *
1714 * 3. The external user calls vfio_external_user_iommu_id()
1715 * to obtain the IOMMU group ID.
1716 *
1717 * 4. When the external KVM finishes, it calls
1718 * vfio_group_put_external_user() to release the VFIO group.
1719 * This call decrements the container user counter.
1720 */
1721struct vfio_group *vfio_group_get_external_user(struct file *filep)
1722{
1723        struct vfio_group *group = filep->private_data;
1724        int ret;
1725
1726        if (filep->f_op != &vfio_group_fops)
1727                return ERR_PTR(-EINVAL);
1728
1729        ret = vfio_group_add_container_user(group);
1730        if (ret)
1731                return ERR_PTR(ret);
1732
1733        vfio_group_get(group);
1734
1735        return group;
1736}
1737EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1738
1739/**
1740 * External user API, exported as symbols to be linked dynamically.
1741 * The external user passes in a device pointer
1742 * to verify that:
1743 *      - A VFIO group is associated with the device;
1744 *      - IOMMU is set for the group.
1745 * If both checks pass, vfio_group_get_external_user_from_dev()
1746 * increments the container user counter to prevent the VFIO group
1747 * from being disposed of before the external user exits, and returns
1748 * the pointer to the VFIO group.
1749 *
1750 * When the external user finishes using the VFIO group, it calls
1751 * vfio_group_put_external_user() to release the VFIO group and
1752 * decrement the container user counter.
1753 *
1754 * @dev [in]    : device
1755 * Return error PTR or pointer to VFIO group.
1756 */
1757
1758struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1759{
1760        struct vfio_group *group;
1761        int ret;
1762
1763        group = vfio_group_get_from_dev(dev);
1764        if (!group)
1765                return ERR_PTR(-ENODEV);
1766
1767        ret = vfio_group_add_container_user(group);
1768        if (ret) {
1769                vfio_group_put(group);
1770                return ERR_PTR(ret);
1771        }
1772
1773        return group;
1774}
1775EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1776
1777void vfio_group_put_external_user(struct vfio_group *group)
1778{
1779        vfio_group_try_dissolve_container(group);
1780        vfio_group_put(group);
1781}
1782EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1783
1784bool vfio_external_group_match_file(struct vfio_group *test_group,
1785                                    struct file *filep)
1786{
1787        struct vfio_group *group = filep->private_data;
1788
1789        return (filep->f_op == &vfio_group_fops) && (group == test_group);
1790}
1791EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1792
1793int vfio_external_user_iommu_id(struct vfio_group *group)
1794{
1795        return iommu_group_id(group->iommu_group);
1796}
1797EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1798
1799long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1800{
1801        return vfio_ioctl_check_extension(group->container, arg);
1802}
1803EXPORT_SYMBOL_GPL(vfio_external_check_extension);
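
/*
 * Illustrative sketch only, not part of the original file: roughly how an
 * external user such as KVM might follow the protocol documented above.
 * The function name and surrounding bookkeeping are hypothetical; only the
 * vfio_group_{get,put}_external_user() and vfio_external_user_iommu_id()
 * calls are the exported interface shown in this file.
 */
#if 0	/* example only, never compiled */
static int example_attach_vfio_group(int group_fd)
{
        struct vfio_group *group;
        struct file *filp;
        int iommu_id;

        filp = fget(group_fd);
        if (!filp)
                return -EBADF;

        /* Fails unless the group is viable and an IOMMU driver is set */
        group = vfio_group_get_external_user(filp);
        fput(filp);
        if (IS_ERR(group))
                return PTR_ERR(group);

        iommu_id = vfio_external_user_iommu_id(group);
        /* ... use iommu_id; hold the group reference while it is in use ... */

        vfio_group_put_external_user(group);
        return 0;
}
#endif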
1804
1805/**
1806 * Sub-module support
1807 */
1808/*
1809 * Helper for managing a buffer of info chain capabilities, allocate or
1810 * reallocate a buffer with additional @size, filling in @id and @version
1811 * of the capability.  A pointer to the new capability is returned.
1812 *
1813 * NB. The chain is based at the head of the buffer, so new entries are
1814 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1815 * next offsets prior to copying to the user buffer.
1816 */
1817struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1818                                               size_t size, u16 id, u16 version)
1819{
1820        void *buf;
1821        struct vfio_info_cap_header *header, *tmp;
1822
1823        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1824        if (!buf) {
1825                kfree(caps->buf);
1826                caps->size = 0;
1827                return ERR_PTR(-ENOMEM);
1828        }
1829
1830        caps->buf = buf;
1831        header = buf + caps->size;
1832
1833        /* Eventually copied to user buffer, zero */
1834        memset(header, 0, size);
1835
1836        header->id = id;
1837        header->version = version;
1838
1839        /* Add to the end of the capability chain */
1840        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1841                ; /* nothing */
1842
1843        tmp->next = caps->size;
1844        caps->size += size;
1845
1846        return header;
1847}
1848EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1849
1850void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1851{
1852        struct vfio_info_cap_header *tmp;
1853        void *buf = (void *)caps->buf;
1854
1855        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1856                tmp->next += offset;
1857}
1858EXPORT_SYMBOL(vfio_info_cap_shift);
1859
1860int vfio_info_add_capability(struct vfio_info_cap *caps,
1861                             struct vfio_info_cap_header *cap, size_t size)
1862{
1863        struct vfio_info_cap_header *header;
1864
1865        header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1866        if (IS_ERR(header))
1867                return PTR_ERR(header);
1868
1869        memcpy(header + 1, cap + 1, size - sizeof(*header));
1870
1871        return 0;
1872}
1873EXPORT_SYMBOL(vfio_info_add_capability);
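
/*
 * Illustrative sketch only, not part of the original file: how a bus or
 * IOMMU driver typically uses the capability chain helpers above when
 * answering a *_GET_INFO ioctl.  The capability structure and ID below are
 * made up for the example; real callers use the capability types from
 * include/uapi/linux/vfio.h.
 */
#if 0	/* example only, never compiled */
struct example_cap {
        struct vfio_info_cap_header header;
        __u64 value;
};

static int example_fill_caps(struct vfio_info_cap *caps, __u64 value)
{
        struct example_cap cap = {
                .header.id = 0xffff,	/* hypothetical capability ID */
                .header.version = 1,
                .value = value,
        };
        int ret;

        /* Append to the chain; the backing buffer is (re)allocated as needed */
        ret = vfio_info_add_capability(caps, &cap.header, sizeof(cap));
        if (ret)
                return ret;

        /*
         * Before copy_to_user() the caller shifts the chain so that the
         * ->next offsets become relative to the start of the user buffer,
         * e.g. vfio_info_cap_shift(caps, sizeof(struct vfio_region_info)).
         */
        return 0;
}
#endif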
1874
1875int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1876                                       int max_irq_type, size_t *data_size)
1877{
1878        unsigned long minsz;
1879        size_t size;
1880
1881        minsz = offsetofend(struct vfio_irq_set, count);
1882
1883        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1884            (hdr->count >= (U32_MAX - hdr->start)) ||
1885            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1886                                VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1887                return -EINVAL;
1888
1889        if (data_size)
1890                *data_size = 0;
1891
1892        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1893                return -EINVAL;
1894
1895        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1896        case VFIO_IRQ_SET_DATA_NONE:
1897                size = 0;
1898                break;
1899        case VFIO_IRQ_SET_DATA_BOOL:
1900                size = sizeof(uint8_t);
1901                break;
1902        case VFIO_IRQ_SET_DATA_EVENTFD:
1903                size = sizeof(int32_t);
1904                break;
1905        default:
1906                return -EINVAL;
1907        }
1908
1909        if (size) {
1910                if (hdr->argsz - minsz < hdr->count * size)
1911                        return -EINVAL;
1912
1913                if (!data_size)
1914                        return -EINVAL;
1915
1916                *data_size = hdr->count * size;
1917        }
1918
1919        return 0;
1920}
1921EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1922
1923/*
1924 * Pin a set of guest PFNs and return their associated host PFNs for local
1925 * domain only.
1926 * @dev [in]     : device
1927 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1928 * @npage [in]   : count of elements in user_pfn array.  This count should not
1929 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1930 * @prot [in]    : protection flags
1931 * @phys_pfn[out]: array of host PFNs
1932 * Return error or number of pages pinned.
1933 */
1934int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1935                   int prot, unsigned long *phys_pfn)
1936{
1937        struct vfio_container *container;
1938        struct vfio_group *group;
1939        struct vfio_iommu_driver *driver;
1940        int ret;
1941
1942        if (!dev || !user_pfn || !phys_pfn || !npage)
1943                return -EINVAL;
1944
1945        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1946                return -E2BIG;
1947
1948        group = vfio_group_get_from_dev(dev);
1949        if (!group)
1950                return -ENODEV;
1951
1952        if (group->dev_counter > 1) {
1953                ret = -EINVAL;
1954                goto err_pin_pages;
1955        }
1956
1957        ret = vfio_group_add_container_user(group);
1958        if (ret)
1959                goto err_pin_pages;
1960
1961        container = group->container;
1962        driver = container->iommu_driver;
1963        if (likely(driver && driver->ops->pin_pages))
1964                ret = driver->ops->pin_pages(container->iommu_data,
1965                                             group->iommu_group, user_pfn,
1966                                             npage, prot, phys_pfn);
1967        else
1968                ret = -ENOTTY;
1969
1970        vfio_group_try_dissolve_container(group);
1971
1972err_pin_pages:
1973        vfio_group_put(group);
1974        return ret;
1975}
1976EXPORT_SYMBOL(vfio_pin_pages);
1977
1978/*
1979 * Unpin a set of host PFNs for local domain only.
1980 * @dev [in]     : device
1981 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1982 *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1983 * @npage [in]   : count of elements in user_pfn array.  This count should not
1984 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1985 * Return error or number of pages unpinned.
1986 */
1987int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1988{
1989        struct vfio_container *container;
1990        struct vfio_group *group;
1991        struct vfio_iommu_driver *driver;
1992        int ret;
1993
1994        if (!dev || !user_pfn || !npage)
1995                return -EINVAL;
1996
1997        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1998                return -E2BIG;
1999
2000        group = vfio_group_get_from_dev(dev);
2001        if (!group)
2002                return -ENODEV;
2003
2004        ret = vfio_group_add_container_user(group);
2005        if (ret)
2006                goto err_unpin_pages;
2007
2008        container = group->container;
2009        driver = container->iommu_driver;
2010        if (likely(driver && driver->ops->unpin_pages))
2011                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2012                                               npage);
2013        else
2014                ret = -ENOTTY;
2015
2016        vfio_group_try_dissolve_container(group);
2017
2018err_unpin_pages:
2019        vfio_group_put(group);
2020        return ret;
2021}
2022EXPORT_SYMBOL(vfio_unpin_pages);
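
/*
 * Illustrative sketch only, not part of the original file: how a mediated
 * device (mdev) vendor driver might pin a single guest page for CPU access
 * and unpin it again.  The function name and the access step are
 * hypothetical; the prot flags assume IOMMU_READ/IOMMU_WRITE from
 * <linux/iommu.h>.
 */
#if 0	/* example only, never compiled */
static int example_access_guest_page(struct device *mdev_dev, unsigned long gfn)
{
        unsigned long pfn;
        int ret;

        ret = vfio_pin_pages(mdev_dev, &gfn, 1,
                             IOMMU_READ | IOMMU_WRITE, &pfn);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        /* ... map the host pfn (e.g. via memremap) and access it here ... */

        vfio_unpin_pages(mdev_dev, &gfn, 1);
        return 0;
}
#endif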
2023
2024/*
2025 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
2026 * VFIO group.
2027 *
2028 * The caller needs to call vfio_group_get_external_user() or
2029 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2030 * so as to prevent the VFIO group from being disposed of mid-call.
2031 * The caller may keep the reference to the VFIO group across several
2032 * calls into this interface.
2033 * After finishing using the VFIO group, the caller needs to release the
2034 * VFIO group by calling vfio_group_put_external_user().
2035 *
2036 * @group [in]          : VFIO group
2037 * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be pinned.
2038 * @npage [in]          : count of elements in user_iova_pfn array.
2039 *                        This count should not be greater than
2040 *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2041 * @prot [in]           : protection flags
2042 * @phys_pfn [out]      : array of host PFNs
2043 * Return error or number of pages pinned.
2044 */
2045int vfio_group_pin_pages(struct vfio_group *group,
2046                         unsigned long *user_iova_pfn, int npage,
2047                         int prot, unsigned long *phys_pfn)
2048{
2049        struct vfio_container *container;
2050        struct vfio_iommu_driver *driver;
2051        int ret;
2052
2053        if (!group || !user_iova_pfn || !phys_pfn || !npage)
2054                return -EINVAL;
2055
2056        if (group->dev_counter > 1)
2057                return -EINVAL;
2058
2059        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2060                return -E2BIG;
2061
2062        container = group->container;
2063        driver = container->iommu_driver;
2064        if (likely(driver && driver->ops->pin_pages))
2065                ret = driver->ops->pin_pages(container->iommu_data,
2066                                             group->iommu_group, user_iova_pfn,
2067                                             npage, prot, phys_pfn);
2068        else
2069                ret = -ENOTTY;
2070
2071        return ret;
2072}
2073EXPORT_SYMBOL(vfio_group_pin_pages);
2074
2075/*
2076 * Unpin a set of guest IOVA PFNs for a VFIO group.
2077 *
2078 * The caller needs to call vfio_group_get_external_user() or
2079 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2080 * so as to prevent the VFIO group from being disposed of mid-call.
2081 * The caller may keep the reference to the VFIO group across several
2082 * calls into this interface.
2083 * After finishing using the VFIO group, the caller needs to release the
2084 * VFIO group by calling vfio_group_put_external_user().
2085 *
2086 * @group [in]          : vfio group
2087 * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be unpinned.
2088 * @npage [in]          : count of elements in user_iova_pfn array.
2089 *                        This count should not be greater than
2090 *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2091 * Return error or number of pages unpinned.
2092 */
2093int vfio_group_unpin_pages(struct vfio_group *group,
2094                           unsigned long *user_iova_pfn, int npage)
2095{
2096        struct vfio_container *container;
2097        struct vfio_iommu_driver *driver;
2098        int ret;
2099
2100        if (!group || !user_iova_pfn || !npage)
2101                return -EINVAL;
2102
2103        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2104                return -E2BIG;
2105
2106        container = group->container;
2107        driver = container->iommu_driver;
2108        if (likely(driver && driver->ops->unpin_pages))
2109                ret = driver->ops->unpin_pages(container->iommu_data,
2110                                               user_iova_pfn, npage);
2111        else
2112                ret = -ENOTTY;
2113
2114        return ret;
2115}
2116EXPORT_SYMBOL(vfio_group_unpin_pages);
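
/*
 * Illustrative sketch only, not part of the original file: the group-based
 * pin/unpin variants above assume the caller already holds an external-user
 * reference, e.g. from vfio_group_get_external_user_from_dev().  All names
 * below are hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_group_pin_one(struct device *dev, unsigned long gfn)
{
        struct vfio_group *group;
        unsigned long pfn;
        int ret;

        group = vfio_group_get_external_user_from_dev(dev);
        if (IS_ERR(group))
                return PTR_ERR(group);

        ret = vfio_group_pin_pages(group, &gfn, 1, IOMMU_READ, &pfn);
        if (ret == 1) {
                /* ... access the host page ... */
                vfio_group_unpin_pages(group, &gfn, 1);
                ret = 0;
        } else if (ret >= 0) {
                ret = -EFAULT;
        }

        vfio_group_put_external_user(group);
        return ret;
}
#endif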
2117
2118
2119/*
2120 * This interface allows the CPU to perform a form of virtual DMA on
2121 * behalf of the device.
2122 *
2123 * The CPU reads from or writes to a range of IOVAs that point to user
2124 * space memory, copying into or out of a kernel buffer.
2125 *
2126 * As the read/write of user space memory is conducted via the CPUs and is
2127 * not a real device DMA, it is not necessary to pin the user space memory.
2128 *
2129 * The caller needs to call vfio_group_get_external_user() or
2130 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2131 * so as to prevent the VFIO group from being disposed of mid-call.
2132 * The caller may keep the reference to the VFIO group across several
2133 * calls into this interface.
2134 * After finishing using the VFIO group, the caller needs to release the
2135 * VFIO group by calling vfio_group_put_external_user().
2136 *
2137 * @group [in]          : VFIO group
2138 * @user_iova [in]      : base IOVA of a user space buffer
2139 * @data [in]           : pointer to kernel buffer
2140 * @len [in]            : kernel buffer length
2141 * @write [in]          : true to write to the IOVA range, false to read
2142 * Return error code on failure or 0 on success.
2143 */
2144int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2145                void *data, size_t len, bool write)
2146{
2147        struct vfio_container *container;
2148        struct vfio_iommu_driver *driver;
2149        int ret = 0;
2150
2151        if (!group || !data || len <= 0)
2152                return -EINVAL;
2153
2154        container = group->container;
2155        driver = container->iommu_driver;
2156
2157        if (likely(driver && driver->ops->dma_rw))
2158                ret = driver->ops->dma_rw(container->iommu_data,
2159                                          user_iova, data, len, write);
2160        else
2161                ret = -ENOTTY;
2162
2163        return ret;
2164}
2165EXPORT_SYMBOL(vfio_dma_rw);
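
/*
 * Illustrative sketch only, not part of the original file: copying guest
 * memory through the container's IOMMU mappings with vfio_dma_rw().  It is
 * assumed the caller already holds an external-user group reference as
 * described above; the wrapper name is hypothetical.
 */
#if 0	/* example only, never compiled */
static int example_read_guest(struct vfio_group *group, dma_addr_t iova,
                              void *buf, size_t len)
{
        /* write=false: copy from the IOVA range into the kernel buffer */
        return vfio_dma_rw(group, iova, buf, len, false);
}
#endif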
2166
2167static int vfio_register_iommu_notifier(struct vfio_group *group,
2168                                        unsigned long *events,
2169                                        struct notifier_block *nb)
2170{
2171        struct vfio_container *container;
2172        struct vfio_iommu_driver *driver;
2173        int ret;
2174
2175        ret = vfio_group_add_container_user(group);
2176        if (ret)
2177                return -EINVAL;
2178
2179        container = group->container;
2180        driver = container->iommu_driver;
2181        if (likely(driver && driver->ops->register_notifier))
2182                ret = driver->ops->register_notifier(container->iommu_data,
2183                                                     events, nb);
2184        else
2185                ret = -ENOTTY;
2186
2187        vfio_group_try_dissolve_container(group);
2188
2189        return ret;
2190}
2191
2192static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2193                                          struct notifier_block *nb)
2194{
2195        struct vfio_container *container;
2196        struct vfio_iommu_driver *driver;
2197        int ret;
2198
2199        ret = vfio_group_add_container_user(group);
2200        if (ret)
2201                return -EINVAL;
2202
2203        container = group->container;
2204        driver = container->iommu_driver;
2205        if (likely(driver && driver->ops->unregister_notifier))
2206                ret = driver->ops->unregister_notifier(container->iommu_data,
2207                                                       nb);
2208        else
2209                ret = -ENOTTY;
2210
2211        vfio_group_try_dissolve_container(group);
2212
2213        return ret;
2214}
2215
2216void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2217{
2218        group->kvm = kvm;
2219        blocking_notifier_call_chain(&group->notifier,
2220                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2221}
2222EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2223
2224static int vfio_register_group_notifier(struct vfio_group *group,
2225                                        unsigned long *events,
2226                                        struct notifier_block *nb)
2227{
2228        int ret;
2229        bool set_kvm = false;
2230
2231        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2232                set_kvm = true;
2233
2234        /* clear known events */
2235        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2236
2237        /* refuse to continue if any events remain */
2238        if (*events)
2239                return -EINVAL;
2240
2241        ret = vfio_group_add_container_user(group);
2242        if (ret)
2243                return -EINVAL;
2244
2245        ret = blocking_notifier_chain_register(&group->notifier, nb);
2246
2247        /*
2248         * The attaching of kvm and vfio_group might have already happened,
2249         * so we replay it once here upon registration.
2250         */
2251        if (!ret && set_kvm && group->kvm)
2252                blocking_notifier_call_chain(&group->notifier,
2253                                        VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2254
2255        vfio_group_try_dissolve_container(group);
2256
2257        return ret;
2258}
2259
2260static int vfio_unregister_group_notifier(struct vfio_group *group,
2261                                         struct notifier_block *nb)
2262{
2263        int ret;
2264
2265        ret = vfio_group_add_container_user(group);
2266        if (ret)
2267                return -EINVAL;
2268
2269        ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2270
2271        vfio_group_try_dissolve_container(group);
2272
2273        return ret;
2274}
2275
2276int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2277                           unsigned long *events, struct notifier_block *nb)
2278{
2279        struct vfio_group *group;
2280        int ret;
2281
2282        if (!dev || !nb || !events || (*events == 0))
2283                return -EINVAL;
2284
2285        group = vfio_group_get_from_dev(dev);
2286        if (!group)
2287                return -ENODEV;
2288
2289        switch (type) {
2290        case VFIO_IOMMU_NOTIFY:
2291                ret = vfio_register_iommu_notifier(group, events, nb);
2292                break;
2293        case VFIO_GROUP_NOTIFY:
2294                ret = vfio_register_group_notifier(group, events, nb);
2295                break;
2296        default:
2297                ret = -EINVAL;
2298        }
2299
2300        vfio_group_put(group);
2301        return ret;
2302}
2303EXPORT_SYMBOL(vfio_register_notifier);
2304
2305int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2306                             struct notifier_block *nb)
2307{
2308        struct vfio_group *group;
2309        int ret;
2310
2311        if (!dev || !nb)
2312                return -EINVAL;
2313
2314        group = vfio_group_get_from_dev(dev);
2315        if (!group)
2316                return -ENODEV;
2317
2318        switch (type) {
2319        case VFIO_IOMMU_NOTIFY:
2320                ret = vfio_unregister_iommu_notifier(group, nb);
2321                break;
2322        case VFIO_GROUP_NOTIFY:
2323                ret = vfio_unregister_group_notifier(group, nb);
2324                break;
2325        default:
2326                ret = -EINVAL;
2327        }
2328
2329        vfio_group_put(group);
2330        return ret;
2331}
2332EXPORT_SYMBOL(vfio_unregister_notifier);
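
/*
 * Illustrative sketch only, not part of the original file: a vendor driver
 * registering for the group notifier to learn when a KVM instance is
 * associated with the group.  The names are hypothetical; the flow mirrors
 * what mdev drivers commonly do with vfio_register_notifier().
 */
#if 0	/* example only, never compiled */
static int example_group_notify(struct notifier_block *nb,
                                unsigned long action, void *data)
{
        if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
                /* data is the struct kvm pointer, or NULL when it is cleared */
                /* ... stash or clear it in driver-private state ... */
        }

        return NOTIFY_OK;
}

static int example_register(struct device *dev, struct notifier_block *nb)
{
        unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;

        nb->notifier_call = example_group_notify;
        /* replays VFIO_GROUP_NOTIFY_SET_KVM if a KVM is already set */
        return vfio_register_notifier(dev, VFIO_GROUP_NOTIFY, &events, nb);
}
#endif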
2333
2334/**
2335 * Module/class support
2336 */
2337static char *vfio_devnode(struct device *dev, umode_t *mode)
2338{
2339        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2340}
2341
2342static struct miscdevice vfio_dev = {
2343        .minor = VFIO_MINOR,
2344        .name = "vfio",
2345        .fops = &vfio_fops,
2346        .nodename = "vfio/vfio",
2347        .mode = S_IRUGO | S_IWUGO,
2348};
2349
2350static int __init vfio_init(void)
2351{
2352        int ret;
2353
2354        idr_init(&vfio.group_idr);
2355        mutex_init(&vfio.group_lock);
2356        mutex_init(&vfio.iommu_drivers_lock);
2357        INIT_LIST_HEAD(&vfio.group_list);
2358        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2359        init_waitqueue_head(&vfio.release_q);
2360
2361        ret = misc_register(&vfio_dev);
2362        if (ret) {
2363                pr_err("vfio: misc device register failed\n");
2364                return ret;
2365        }
2366
2367        /* /dev/vfio/$GROUP */
2368        vfio.class = class_create(THIS_MODULE, "vfio");
2369        if (IS_ERR(vfio.class)) {
2370                ret = PTR_ERR(vfio.class);
2371                goto err_class;
2372        }
2373
2374        vfio.class->devnode = vfio_devnode;
2375
2376        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2377        if (ret)
2378                goto err_alloc_chrdev;
2379
2380        cdev_init(&vfio.group_cdev, &vfio_group_fops);
2381        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2382        if (ret)
2383                goto err_cdev_add;
2384
2385        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2386
2387#ifdef CONFIG_VFIO_NOIOMMU
2388        vfio_register_iommu_driver(&vfio_noiommu_ops);
2389#endif
2390        return 0;
2391
2392err_cdev_add:
2393        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2394err_alloc_chrdev:
2395        class_destroy(vfio.class);
2396        vfio.class = NULL;
2397err_class:
2398        misc_deregister(&vfio_dev);
2399        return ret;
2400}
2401
2402static void __exit vfio_cleanup(void)
2403{
2404        WARN_ON(!list_empty(&vfio.group_list));
2405
2406#ifdef CONFIG_VFIO_NOIOMMU
2407        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2408#endif
2409        idr_destroy(&vfio.group_idr);
2410        cdev_del(&vfio.group_cdev);
2411        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2412        class_destroy(vfio.class);
2413        vfio.class = NULL;
2414        misc_deregister(&vfio_dev);
2415}
2416
2417module_init(vfio_init);
2418module_exit(vfio_cleanup);
2419
2420MODULE_VERSION(DRIVER_VERSION);
2421MODULE_LICENSE("GPL v2");
2422MODULE_AUTHOR(DRIVER_AUTHOR);
2423MODULE_DESCRIPTION(DRIVER_DESC);
2424MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2425MODULE_ALIAS("devname:vfio/vfio");
2426MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2427