linux/drivers/vfio/vfio.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/file.h>
  17#include <linux/anon_inodes.h>
  18#include <linux/fs.h>
  19#include <linux/idr.h>
  20#include <linux/iommu.h>
  21#include <linux/list.h>
  22#include <linux/miscdevice.h>
  23#include <linux/module.h>
  24#include <linux/mutex.h>
  25#include <linux/pci.h>
  26#include <linux/rwsem.h>
  27#include <linux/sched.h>
  28#include <linux/slab.h>
  29#include <linux/stat.h>
  30#include <linux/string.h>
  31#include <linux/uaccess.h>
  32#include <linux/vfio.h>
  33#include <linux/wait.h>
  34#include <linux/sched/signal.h>
  35
  36#define DRIVER_VERSION  "0.3"
  37#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  38#define DRIVER_DESC     "VFIO - User Level meta-driver"
  39
  40static struct vfio {
  41        struct class                    *class;
  42        struct list_head                iommu_drivers_list;
  43        struct mutex                    iommu_drivers_lock;
  44        struct list_head                group_list;
  45        struct idr                      group_idr;
  46        struct mutex                    group_lock;
  47        struct cdev                     group_cdev;
  48        dev_t                           group_devt;
  49} vfio;
  50
  51struct vfio_iommu_driver {
  52        const struct vfio_iommu_driver_ops      *ops;
  53        struct list_head                        vfio_next;
  54};
  55
  56struct vfio_container {
  57        struct kref                     kref;
  58        struct list_head                group_list;
  59        struct rw_semaphore             group_lock;
  60        struct vfio_iommu_driver        *iommu_driver;
  61        void                            *iommu_data;
  62        bool                            noiommu;
  63};
  64
  65struct vfio_unbound_dev {
  66        struct device                   *dev;
  67        struct list_head                unbound_next;
  68};
  69
  70struct vfio_group {
  71        struct kref                     kref;
  72        int                             minor;
  73        atomic_t                        container_users;
  74        struct iommu_group              *iommu_group;
  75        struct vfio_container           *container;
  76        struct list_head                device_list;
  77        struct mutex                    device_lock;
  78        struct device                   *dev;
  79        struct notifier_block           nb;
  80        struct list_head                vfio_next;
  81        struct list_head                container_next;
  82        struct list_head                unbound_list;
  83        struct mutex                    unbound_lock;
  84        atomic_t                        opened;
  85        wait_queue_head_t               container_q;
  86        bool                            noiommu;
  87        unsigned int                    dev_counter;
  88        struct kvm                      *kvm;
  89        struct blocking_notifier_head   notifier;
  90};
  91
  92#ifdef CONFIG_VFIO_NOIOMMU
  93static bool noiommu __read_mostly;
  94module_param_named(enable_unsafe_noiommu_mode,
  95                   noiommu, bool, S_IRUGO | S_IWUSR);
  96MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
  97#endif
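
/*
 * Example (illustrative): since the parameter above is declared writable,
 * no-IOMMU mode is typically enabled at load time with
 * "modprobe vfio enable_unsafe_noiommu_mode=1", or toggled later through
 * /sys/module/vfio/parameters/enable_unsafe_noiommu_mode.
 */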
  98
  99static DEFINE_XARRAY(vfio_device_set_xa);
 100
 101int vfio_assign_device_set(struct vfio_device *device, void *set_id)
 102{
 103        unsigned long idx = (unsigned long)set_id;
 104        struct vfio_device_set *new_dev_set;
 105        struct vfio_device_set *dev_set;
 106
 107        if (WARN_ON(!set_id))
 108                return -EINVAL;
 109
 110        /*
 111         * Atomically acquire a singleton object in the xarray for this set_id
 112         */
 113        xa_lock(&vfio_device_set_xa);
 114        dev_set = xa_load(&vfio_device_set_xa, idx);
 115        if (dev_set)
 116                goto found_get_ref;
 117        xa_unlock(&vfio_device_set_xa);
 118
 119        new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
 120        if (!new_dev_set)
 121                return -ENOMEM;
 122        mutex_init(&new_dev_set->lock);
 123        INIT_LIST_HEAD(&new_dev_set->device_list);
 124        new_dev_set->set_id = set_id;
 125
 126        xa_lock(&vfio_device_set_xa);
 127        dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
 128                               GFP_KERNEL);
 129        if (!dev_set) {
 130                dev_set = new_dev_set;
 131                goto found_get_ref;
 132        }
 133
 134        kfree(new_dev_set);
 135        if (xa_is_err(dev_set)) {
 136                xa_unlock(&vfio_device_set_xa);
 137                return xa_err(dev_set);
 138        }
 139
 140found_get_ref:
 141        dev_set->device_count++;
 142        xa_unlock(&vfio_device_set_xa);
 143        mutex_lock(&dev_set->lock);
 144        device->dev_set = dev_set;
 145        list_add_tail(&device->dev_set_list, &dev_set->device_list);
 146        mutex_unlock(&dev_set->lock);
 147        return 0;
 148}
 149EXPORT_SYMBOL_GPL(vfio_assign_device_set);
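
/*
 * Usage sketch (hypothetical driver; "mdev" and "shared_parent" are
 * illustrative names only): a driver whose devices must be handled as a
 * unit, e.g. because they share a reset, keys the set on any pointer that
 * is stable and common to all of them, such as a shared parent device:
 *
 *	ret = vfio_assign_device_set(&mdev->vdev, mdev->shared_parent);
 *	if (ret)
 *		return ret;
 *
 * Drivers that never call this get a singleton set from
 * vfio_register_group_dev(), keyed on the vfio_device itself.
 */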
 150
 151static void vfio_release_device_set(struct vfio_device *device)
 152{
 153        struct vfio_device_set *dev_set = device->dev_set;
 154
 155        if (!dev_set)
 156                return;
 157
 158        mutex_lock(&dev_set->lock);
 159        list_del(&device->dev_set_list);
 160        mutex_unlock(&dev_set->lock);
 161
 162        xa_lock(&vfio_device_set_xa);
 163        if (!--dev_set->device_count) {
 164                __xa_erase(&vfio_device_set_xa,
 165                           (unsigned long)dev_set->set_id);
 166                mutex_destroy(&dev_set->lock);
 167                kfree(dev_set);
 168        }
 169        xa_unlock(&vfio_device_set_xa);
 170}
 171
 172/*
  173 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
  174 * and remove functions; any use case other than acquiring the first
  175 * reference for the purpose of calling vfio_register_group_dev(), or removing
  176 * that symmetric reference after vfio_unregister_group_dev(), should use the
  177 * raw iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 178 * removes the device from the dummy group and cannot be nested.
 179 */
 180struct iommu_group *vfio_iommu_group_get(struct device *dev)
 181{
 182        struct iommu_group *group;
 183        int __maybe_unused ret;
 184
 185        group = iommu_group_get(dev);
 186
 187#ifdef CONFIG_VFIO_NOIOMMU
 188        /*
 189         * With noiommu enabled, an IOMMU group will be created for a device
  190         * that doesn't already have one and doesn't have iommu_ops on its
 191         * bus.  We set iommudata simply to be able to identify these groups
 192         * as special use and for reclamation later.
 193         */
 194        if (group || !noiommu || iommu_present(dev->bus))
 195                return group;
 196
 197        group = iommu_group_alloc();
 198        if (IS_ERR(group))
 199                return NULL;
 200
 201        iommu_group_set_name(group, "vfio-noiommu");
 202        iommu_group_set_iommudata(group, &noiommu, NULL);
 203        ret = iommu_group_add_device(group, dev);
 204        if (ret) {
 205                iommu_group_put(group);
 206                return NULL;
 207        }
 208
 209        /*
 210         * Where to taint?  At this point we've added an IOMMU group for a
 211         * device that is not backed by iommu_ops, therefore any iommu_
 212         * callback using iommu_ops can legitimately Oops.  So, while we may
 213         * be about to give a DMA capable device to a user without IOMMU
 214         * protection, which is clearly taint-worthy, let's go ahead and do
 215         * it here.
 216         */
 217        add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 218        dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 219#endif
 220
 221        return group;
 222}
 223EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 224
 225void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 226{
 227#ifdef CONFIG_VFIO_NOIOMMU
 228        if (iommu_group_get_iommudata(group) == &noiommu)
 229                iommu_group_remove_device(dev);
 230#endif
 231
 232        iommu_group_put(group);
 233}
 234EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
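
/*
 * Usage sketch (hypothetical bus driver; error handling elided, names are
 * illustrative only): the pair above brackets a device's registration
 * lifetime, with the symmetric put only after the device is unregistered:
 *
 *	probe:
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *		vfio_init_group_dev(&vdev, dev, &my_ops);
 *		vfio_register_group_dev(&vdev);
 *
 *	remove:
 *		vfio_unregister_group_dev(&vdev);
 *		vfio_iommu_group_put(group, dev);
 *
 * Any other iommu_group reference should use the raw iommu_group_get()/
 * iommu_group_put() calls instead.
 */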
 235
 236#ifdef CONFIG_VFIO_NOIOMMU
 237static void *vfio_noiommu_open(unsigned long arg)
 238{
 239        if (arg != VFIO_NOIOMMU_IOMMU)
 240                return ERR_PTR(-EINVAL);
 241        if (!capable(CAP_SYS_RAWIO))
 242                return ERR_PTR(-EPERM);
 243
 244        return NULL;
 245}
 246
 247static void vfio_noiommu_release(void *iommu_data)
 248{
 249}
 250
 251static long vfio_noiommu_ioctl(void *iommu_data,
 252                               unsigned int cmd, unsigned long arg)
 253{
 254        if (cmd == VFIO_CHECK_EXTENSION)
 255                return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 256
 257        return -ENOTTY;
 258}
 259
 260static int vfio_noiommu_attach_group(void *iommu_data,
 261                                     struct iommu_group *iommu_group)
 262{
 263        return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 264}
 265
 266static void vfio_noiommu_detach_group(void *iommu_data,
 267                                      struct iommu_group *iommu_group)
 268{
 269}
 270
 271static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 272        .name = "vfio-noiommu",
 273        .owner = THIS_MODULE,
 274        .open = vfio_noiommu_open,
 275        .release = vfio_noiommu_release,
 276        .ioctl = vfio_noiommu_ioctl,
 277        .attach_group = vfio_noiommu_attach_group,
 278        .detach_group = vfio_noiommu_detach_group,
 279};
 280#endif
 281
 282
 283/**
 284 * IOMMU driver registration
 285 */
 286int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 287{
 288        struct vfio_iommu_driver *driver, *tmp;
 289
 290        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 291        if (!driver)
 292                return -ENOMEM;
 293
 294        driver->ops = ops;
 295
 296        mutex_lock(&vfio.iommu_drivers_lock);
 297
 298        /* Check for duplicates */
 299        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 300                if (tmp->ops == ops) {
 301                        mutex_unlock(&vfio.iommu_drivers_lock);
 302                        kfree(driver);
 303                        return -EINVAL;
 304                }
 305        }
 306
 307        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 308
 309        mutex_unlock(&vfio.iommu_drivers_lock);
 310
 311        return 0;
 312}
 313EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 314
 315void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 316{
 317        struct vfio_iommu_driver *driver;
 318
 319        mutex_lock(&vfio.iommu_drivers_lock);
 320        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 321                if (driver->ops == ops) {
 322                        list_del(&driver->vfio_next);
 323                        mutex_unlock(&vfio.iommu_drivers_lock);
 324                        kfree(driver);
 325                        return;
 326                }
 327        }
 328        mutex_unlock(&vfio.iommu_drivers_lock);
 329}
 330EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
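
/*
 * Usage sketch (illustrative; "my_" names are placeholders): an IOMMU backend
 * registers an ops table at module init and removes it on exit, mirroring the
 * vfio_noiommu_ops table above.  The in-tree type1 and spapr backends follow
 * the same pattern:
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_open,
 *		.release	= my_release,
 *		.ioctl		= my_ioctl,
 *		.attach_group	= my_attach_group,
 *		.detach_group	= my_detach_group,
 *	};
 *
 *	static int __init my_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */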
 331
 332/**
 333 * Group minor allocation/free - both called with vfio.group_lock held
 334 */
 335static int vfio_alloc_group_minor(struct vfio_group *group)
 336{
 337        return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 338}
 339
 340static void vfio_free_group_minor(int minor)
 341{
 342        idr_remove(&vfio.group_idr, minor);
 343}
 344
 345static int vfio_iommu_group_notifier(struct notifier_block *nb,
 346                                     unsigned long action, void *data);
 347static void vfio_group_get(struct vfio_group *group);
 348
 349/**
 350 * Container objects - containers are created when /dev/vfio/vfio is
 351 * opened, but their lifecycle extends until the last user is done, so
  352 * they're freed via kref.  Must support container/group/device being
 353 * closed in any order.
 354 */
 355static void vfio_container_get(struct vfio_container *container)
 356{
 357        kref_get(&container->kref);
 358}
 359
 360static void vfio_container_release(struct kref *kref)
 361{
 362        struct vfio_container *container;
 363        container = container_of(kref, struct vfio_container, kref);
 364
 365        kfree(container);
 366}
 367
 368static void vfio_container_put(struct vfio_container *container)
 369{
 370        kref_put(&container->kref, vfio_container_release);
 371}
 372
 373static void vfio_group_unlock_and_free(struct vfio_group *group)
 374{
 375        mutex_unlock(&vfio.group_lock);
 376        /*
 377         * Unregister outside of lock.  A spurious callback is harmless now
 378         * that the group is no longer in vfio.group_list.
 379         */
 380        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 381        kfree(group);
 382}
 383
 384/**
 385 * Group objects - create, release, get, put, search
 386 */
 387static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 388{
 389        struct vfio_group *group, *tmp;
 390        struct device *dev;
 391        int ret, minor;
 392
 393        group = kzalloc(sizeof(*group), GFP_KERNEL);
 394        if (!group)
 395                return ERR_PTR(-ENOMEM);
 396
 397        kref_init(&group->kref);
 398        INIT_LIST_HEAD(&group->device_list);
 399        mutex_init(&group->device_lock);
 400        INIT_LIST_HEAD(&group->unbound_list);
 401        mutex_init(&group->unbound_lock);
 402        atomic_set(&group->container_users, 0);
 403        atomic_set(&group->opened, 0);
 404        init_waitqueue_head(&group->container_q);
 405        group->iommu_group = iommu_group;
 406#ifdef CONFIG_VFIO_NOIOMMU
 407        group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 408#endif
 409        BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 410
 411        group->nb.notifier_call = vfio_iommu_group_notifier;
 412
 413        /*
  414         * blocking notifiers acquire an rwsem around registering and hold
  415         * it around the callback.  Therefore, we need to register outside of
 416         * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 417         * do anything unless it can find the group in vfio.group_list, so
 418         * no harm in registering early.
 419         */
 420        ret = iommu_group_register_notifier(iommu_group, &group->nb);
 421        if (ret) {
 422                kfree(group);
 423                return ERR_PTR(ret);
 424        }
 425
 426        mutex_lock(&vfio.group_lock);
 427
 428        /* Did we race creating this group? */
 429        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 430                if (tmp->iommu_group == iommu_group) {
 431                        vfio_group_get(tmp);
 432                        vfio_group_unlock_and_free(group);
 433                        return tmp;
 434                }
 435        }
 436
 437        minor = vfio_alloc_group_minor(group);
 438        if (minor < 0) {
 439                vfio_group_unlock_and_free(group);
 440                return ERR_PTR(minor);
 441        }
 442
 443        dev = device_create(vfio.class, NULL,
 444                            MKDEV(MAJOR(vfio.group_devt), minor),
 445                            group, "%s%d", group->noiommu ? "noiommu-" : "",
 446                            iommu_group_id(iommu_group));
 447        if (IS_ERR(dev)) {
 448                vfio_free_group_minor(minor);
 449                vfio_group_unlock_and_free(group);
 450                return ERR_CAST(dev);
 451        }
 452
 453        group->minor = minor;
 454        group->dev = dev;
 455
 456        list_add(&group->vfio_next, &vfio.group_list);
 457
 458        mutex_unlock(&vfio.group_lock);
 459
 460        return group;
 461}
 462
 463/* called with vfio.group_lock held */
 464static void vfio_group_release(struct kref *kref)
 465{
 466        struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 467        struct vfio_unbound_dev *unbound, *tmp;
 468        struct iommu_group *iommu_group = group->iommu_group;
 469
 470        WARN_ON(!list_empty(&group->device_list));
 471        WARN_ON(group->notifier.head);
 472
 473        list_for_each_entry_safe(unbound, tmp,
 474                                 &group->unbound_list, unbound_next) {
 475                list_del(&unbound->unbound_next);
 476                kfree(unbound);
 477        }
 478
 479        device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 480        list_del(&group->vfio_next);
 481        vfio_free_group_minor(group->minor);
 482        vfio_group_unlock_and_free(group);
 483        iommu_group_put(iommu_group);
 484}
 485
 486static void vfio_group_put(struct vfio_group *group)
 487{
 488        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 489}
 490
 491struct vfio_group_put_work {
 492        struct work_struct work;
 493        struct vfio_group *group;
 494};
 495
 496static void vfio_group_put_bg(struct work_struct *work)
 497{
 498        struct vfio_group_put_work *do_work;
 499
 500        do_work = container_of(work, struct vfio_group_put_work, work);
 501
 502        vfio_group_put(do_work->group);
 503        kfree(do_work);
 504}
 505
 506static void vfio_group_schedule_put(struct vfio_group *group)
 507{
 508        struct vfio_group_put_work *do_work;
 509
 510        do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 511        if (WARN_ON(!do_work))
 512                return;
 513
 514        INIT_WORK(&do_work->work, vfio_group_put_bg);
 515        do_work->group = group;
 516        schedule_work(&do_work->work);
 517}
 518
 519/* Assume group_lock or group reference is held */
 520static void vfio_group_get(struct vfio_group *group)
 521{
 522        kref_get(&group->kref);
 523}
 524
 525/*
  526 * Not really a try as we will sleep on the mutex, but we need to make
 527 * sure the group pointer is valid under lock and get a reference.
 528 */
 529static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 530{
 531        struct vfio_group *target = group;
 532
 533        mutex_lock(&vfio.group_lock);
 534        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 535                if (group == target) {
 536                        vfio_group_get(group);
 537                        mutex_unlock(&vfio.group_lock);
 538                        return group;
 539                }
 540        }
 541        mutex_unlock(&vfio.group_lock);
 542
 543        return NULL;
 544}
 545
 546static
 547struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 548{
 549        struct vfio_group *group;
 550
 551        mutex_lock(&vfio.group_lock);
 552        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 553                if (group->iommu_group == iommu_group) {
 554                        vfio_group_get(group);
 555                        mutex_unlock(&vfio.group_lock);
 556                        return group;
 557                }
 558        }
 559        mutex_unlock(&vfio.group_lock);
 560
 561        return NULL;
 562}
 563
 564static struct vfio_group *vfio_group_get_from_minor(int minor)
 565{
 566        struct vfio_group *group;
 567
 568        mutex_lock(&vfio.group_lock);
 569        group = idr_find(&vfio.group_idr, minor);
 570        if (!group) {
 571                mutex_unlock(&vfio.group_lock);
 572                return NULL;
 573        }
 574        vfio_group_get(group);
 575        mutex_unlock(&vfio.group_lock);
 576
 577        return group;
 578}
 579
 580static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 581{
 582        struct iommu_group *iommu_group;
 583        struct vfio_group *group;
 584
 585        iommu_group = iommu_group_get(dev);
 586        if (!iommu_group)
 587                return NULL;
 588
 589        group = vfio_group_get_from_iommu(iommu_group);
 590        iommu_group_put(iommu_group);
 591
 592        return group;
 593}
 594
 595/**
 596 * Device objects - create, release, get, put, search
 597 */
 598/* Device reference always implies a group reference */
 599void vfio_device_put(struct vfio_device *device)
 600{
 601        if (refcount_dec_and_test(&device->refcount))
 602                complete(&device->comp);
 603}
 604EXPORT_SYMBOL_GPL(vfio_device_put);
 605
 606static bool vfio_device_try_get(struct vfio_device *device)
 607{
 608        return refcount_inc_not_zero(&device->refcount);
 609}
 610
 611static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 612                                                 struct device *dev)
 613{
 614        struct vfio_device *device;
 615
 616        mutex_lock(&group->device_lock);
 617        list_for_each_entry(device, &group->device_list, group_next) {
 618                if (device->dev == dev && vfio_device_try_get(device)) {
 619                        mutex_unlock(&group->device_lock);
 620                        return device;
 621                }
 622        }
 623        mutex_unlock(&group->device_lock);
 624        return NULL;
 625}
 626
 627/*
 628 * Some drivers, like pci-stub, are only used to prevent other drivers from
 629 * claiming a device and are therefore perfectly legitimate for a user owned
 630 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 631 * of the device, but it does prevent the user from having direct access to
 632 * the device, which is useful in some circumstances.
 633 *
  634 * We also assume that we can include PCI interconnect devices, i.e. bridges.
 635 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 636 * then all of the downstream devices will be part of the same IOMMU group as
 637 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 638 * breaks anything, it only does so for user owned devices downstream.  Note
 639 * that error notification via MSI can be affected for platforms that handle
 640 * MSI within the same IOVA space as DMA.
 641 */
 642static const char * const vfio_driver_allowed[] = { "pci-stub" };
 643
 644static bool vfio_dev_driver_allowed(struct device *dev,
 645                                    struct device_driver *drv)
 646{
 647        if (dev_is_pci(dev)) {
 648                struct pci_dev *pdev = to_pci_dev(dev);
 649
 650                if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 651                        return true;
 652        }
 653
 654        return match_string(vfio_driver_allowed,
 655                            ARRAY_SIZE(vfio_driver_allowed),
 656                            drv->name) >= 0;
 657}
 658
 659/*
 660 * A vfio group is viable for use by userspace if all devices are in
 661 * one of the following states:
 662 *  - driver-less
 663 *  - bound to a vfio driver
 664 *  - bound to an otherwise allowed driver
 665 *  - a PCI interconnect device
 666 *
 667 * We use two methods to determine whether a device is bound to a vfio
 668 * driver.  The first is to test whether the device exists in the vfio
 669 * group.  The second is to test if the device exists on the group
 670 * unbound_list, indicating it's in the middle of transitioning from
 671 * a vfio driver to driver-less.
 672 */
 673static int vfio_dev_viable(struct device *dev, void *data)
 674{
 675        struct vfio_group *group = data;
 676        struct vfio_device *device;
 677        struct device_driver *drv = READ_ONCE(dev->driver);
 678        struct vfio_unbound_dev *unbound;
 679        int ret = -EINVAL;
 680
 681        mutex_lock(&group->unbound_lock);
 682        list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 683                if (dev == unbound->dev) {
 684                        ret = 0;
 685                        break;
 686                }
 687        }
 688        mutex_unlock(&group->unbound_lock);
 689
 690        if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
 691                return 0;
 692
 693        device = vfio_group_get_device(group, dev);
 694        if (device) {
 695                vfio_device_put(device);
 696                return 0;
 697        }
 698
 699        return ret;
 700}
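
/*
 * Note (illustrative): in practice a group is made viable by binding every
 * device in it to a vfio driver, or to an allowed stub such as pci-stub,
 * typically via the driver_override / new_id sysfs interfaces under
 * /sys/bus/pci.  A single device left bound to any other driver keeps
 * vfio_dev_viable() failing for the whole group.
 */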
 701
 702/**
 703 * Async device support
 704 */
 705static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 706{
 707        struct vfio_device *device;
 708
 709        /* Do we already know about it?  We shouldn't */
 710        device = vfio_group_get_device(group, dev);
 711        if (WARN_ON_ONCE(device)) {
 712                vfio_device_put(device);
 713                return 0;
 714        }
 715
 716        /* Nothing to do for idle groups */
 717        if (!atomic_read(&group->container_users))
 718                return 0;
 719
 720        /* TODO Prevent device auto probing */
 721        dev_WARN(dev, "Device added to live group %d!\n",
 722                 iommu_group_id(group->iommu_group));
 723
 724        return 0;
 725}
 726
 727static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 728{
 729        /* We don't care what happens when the group isn't in use */
 730        if (!atomic_read(&group->container_users))
 731                return 0;
 732
 733        return vfio_dev_viable(dev, group);
 734}
 735
 736static int vfio_iommu_group_notifier(struct notifier_block *nb,
 737                                     unsigned long action, void *data)
 738{
 739        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 740        struct device *dev = data;
 741        struct vfio_unbound_dev *unbound;
 742
 743        /*
 744         * Need to go through a group_lock lookup to get a reference or we
 745         * risk racing a group being removed.  Ignore spurious notifies.
 746         */
 747        group = vfio_group_try_get(group);
 748        if (!group)
 749                return NOTIFY_OK;
 750
 751        switch (action) {
 752        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 753                vfio_group_nb_add_dev(group, dev);
 754                break;
 755        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 756                /*
 757                 * Nothing to do here.  If the device is in use, then the
 758                 * vfio sub-driver should block the remove callback until
 759                 * it is unused.  If the device is unused or attached to a
 760                 * stub driver, then it should be released and we don't
 761                 * care that it will be going away.
 762                 */
 763                break;
 764        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 765                dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 766                        iommu_group_id(group->iommu_group));
 767                break;
 768        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 769                dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 770                        iommu_group_id(group->iommu_group), dev->driver->name);
 771                BUG_ON(vfio_group_nb_verify(group, dev));
 772                break;
 773        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 774                dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 775                        __func__, iommu_group_id(group->iommu_group),
 776                        dev->driver->name);
 777                break;
 778        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 779                dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 780                        iommu_group_id(group->iommu_group));
 781                /*
 782                 * XXX An unbound device in a live group is ok, but we'd
 783                 * really like to avoid the above BUG_ON by preventing other
 784                 * drivers from binding to it.  Once that occurs, we have to
 785                 * stop the system to maintain isolation.  At a minimum, we'd
 786                 * want a toggle to disable driver auto probe for this device.
 787                 */
 788
 789                mutex_lock(&group->unbound_lock);
 790                list_for_each_entry(unbound,
 791                                    &group->unbound_list, unbound_next) {
 792                        if (dev == unbound->dev) {
 793                                list_del(&unbound->unbound_next);
 794                                kfree(unbound);
 795                                break;
 796                        }
 797                }
 798                mutex_unlock(&group->unbound_lock);
 799                break;
 800        }
 801
 802        /*
 803         * If we're the last reference to the group, the group will be
 804         * released, which includes unregistering the iommu group notifier.
 805         * We hold a read-lock on that notifier list, unregistering needs
 806         * a write-lock... deadlock.  Release our reference asynchronously
 807         * to avoid that situation.
 808         */
 809        vfio_group_schedule_put(group);
 810        return NOTIFY_OK;
 811}
 812
 813/**
 814 * VFIO driver API
 815 */
 816void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
 817                         const struct vfio_device_ops *ops)
 818{
 819        init_completion(&device->comp);
 820        device->dev = dev;
 821        device->ops = ops;
 822}
 823EXPORT_SYMBOL_GPL(vfio_init_group_dev);
 824
 825void vfio_uninit_group_dev(struct vfio_device *device)
 826{
 827        vfio_release_device_set(device);
 828}
 829EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
 830
 831int vfio_register_group_dev(struct vfio_device *device)
 832{
 833        struct vfio_device *existing_device;
 834        struct iommu_group *iommu_group;
 835        struct vfio_group *group;
 836
 837        /*
 838         * If the driver doesn't specify a set then the device is added to a
 839         * singleton set just for itself.
 840         */
 841        if (!device->dev_set)
 842                vfio_assign_device_set(device, device);
 843
 844        iommu_group = iommu_group_get(device->dev);
 845        if (!iommu_group)
 846                return -EINVAL;
 847
 848        group = vfio_group_get_from_iommu(iommu_group);
 849        if (!group) {
 850                group = vfio_create_group(iommu_group);
 851                if (IS_ERR(group)) {
 852                        iommu_group_put(iommu_group);
 853                        return PTR_ERR(group);
 854                }
 855        } else {
 856                /*
 857                 * A found vfio_group already holds a reference to the
 858                 * iommu_group.  A created vfio_group keeps the reference.
 859                 */
 860                iommu_group_put(iommu_group);
 861        }
 862
 863        existing_device = vfio_group_get_device(group, device->dev);
 864        if (existing_device) {
 865                dev_WARN(device->dev, "Device already exists on group %d\n",
 866                         iommu_group_id(iommu_group));
 867                vfio_device_put(existing_device);
 868                vfio_group_put(group);
 869                return -EBUSY;
 870        }
 871
 872        /* Our reference on group is moved to the device */
 873        device->group = group;
 874
 875        /* Refcounting can't start until the driver calls register */
 876        refcount_set(&device->refcount, 1);
 877
 878        mutex_lock(&group->device_lock);
 879        list_add(&device->group_next, &group->device_list);
 880        group->dev_counter++;
 881        mutex_unlock(&group->device_lock);
 882
 883        return 0;
 884}
 885EXPORT_SYMBOL_GPL(vfio_register_group_dev);
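
/*
 * Usage sketch (hypothetical driver; "my_" names are illustrative only):
 *
 *	static int my_probe(struct device *dev)
 *	{
 *		struct my_vfio_dev *mdev;
 *		int ret;
 *
 *		mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
 *		if (!mdev)
 *			return -ENOMEM;
 *		vfio_init_group_dev(&mdev->vdev, dev, &my_vfio_dev_ops);
 *		ret = vfio_register_group_dev(&mdev->vdev);
 *		if (ret) {
 *			vfio_uninit_group_dev(&mdev->vdev);
 *			kfree(mdev);
 *		}
 *		return ret;
 *	}
 *
 *	static void my_remove(struct my_vfio_dev *mdev)
 *	{
 *		vfio_unregister_group_dev(&mdev->vdev);
 *		vfio_uninit_group_dev(&mdev->vdev);
 *		kfree(mdev);
 *	}
 *
 * my_vfio_dev_ops is the driver's struct vfio_device_ops (open_device,
 * close_device, ioctl, and friends).
 */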
 886
 887/**
 888 * Get a reference to the vfio_device for a device.  Even if the
 889 * caller thinks they own the device, they could be racing with a
 890 * release call path, so we can't trust drvdata for the shortcut.
 891 * Go the long way around, from the iommu_group to the vfio_group
 892 * to the vfio_device.
 893 */
 894struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 895{
 896        struct vfio_group *group;
 897        struct vfio_device *device;
 898
 899        group = vfio_group_get_from_dev(dev);
 900        if (!group)
 901                return NULL;
 902
 903        device = vfio_group_get_device(group, dev);
 904        vfio_group_put(group);
 905
 906        return device;
 907}
 908EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
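
/*
 * Usage sketch (illustrative): every successful lookup must be balanced by
 * vfio_device_put(), since an elevated refcount is exactly what
 * vfio_unregister_group_dev() waits on:
 *
 *	struct vfio_device *vdev = vfio_device_get_from_dev(dev);
 *
 *	if (!vdev)
 *		return -ENODEV;
 *	// ... use vdev, then:
 *	vfio_device_put(vdev);
 */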
 909
 910static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 911                                                     char *buf)
 912{
 913        struct vfio_device *it, *device = ERR_PTR(-ENODEV);
 914
 915        mutex_lock(&group->device_lock);
 916        list_for_each_entry(it, &group->device_list, group_next) {
 917                int ret;
 918
 919                if (it->ops->match) {
 920                        ret = it->ops->match(it, buf);
 921                        if (ret < 0) {
 922                                device = ERR_PTR(ret);
 923                                break;
 924                        }
 925                } else {
 926                        ret = !strcmp(dev_name(it->dev), buf);
 927                }
 928
 929                if (ret && vfio_device_try_get(it)) {
 930                        device = it;
 931                        break;
 932                }
 933        }
 934        mutex_unlock(&group->device_lock);
 935
 936        return device;
 937}
 938
 939/*
 940 * Decrement the device reference count and wait for the device to be
  941 * removed; holders of open file descriptors are prompted via ->request(). */
 942void vfio_unregister_group_dev(struct vfio_device *device)
 943{
 944        struct vfio_group *group = device->group;
 945        struct vfio_unbound_dev *unbound;
 946        unsigned int i = 0;
 947        bool interrupted = false;
 948        long rc;
 949
 950        /*
 951         * When the device is removed from the group, the group suddenly
 952         * becomes non-viable; the device has a driver (until the unbind
 953         * completes), but it's not present in the group.  This is bad news
 954         * for any external users that need to re-acquire a group reference
 955         * in order to match and release their existing reference.  To
 956         * solve this, we track such devices on the unbound_list to bridge
 957         * the gap until they're fully unbound.
 958         */
 959        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 960        if (unbound) {
 961                unbound->dev = device->dev;
 962                mutex_lock(&group->unbound_lock);
 963                list_add(&unbound->unbound_next, &group->unbound_list);
 964                mutex_unlock(&group->unbound_lock);
 965        }
 966        WARN_ON(!unbound);
 967
 968        vfio_device_put(device);
 969        rc = try_wait_for_completion(&device->comp);
 970        while (rc <= 0) {
 971                if (device->ops->request)
 972                        device->ops->request(device, i++);
 973
 974                if (interrupted) {
 975                        rc = wait_for_completion_timeout(&device->comp,
 976                                                         HZ * 10);
 977                } else {
 978                        rc = wait_for_completion_interruptible_timeout(
 979                                &device->comp, HZ * 10);
 980                        if (rc < 0) {
 981                                interrupted = true;
 982                                dev_warn(device->dev,
 983                                         "Device is currently in use, task"
 984                                         " \"%s\" (%d) "
 985                                         "blocked until device is released",
 986                                         current->comm, task_pid_nr(current));
 987                        }
 988                }
 989        }
 990
 991        mutex_lock(&group->device_lock);
 992        list_del(&device->group_next);
 993        group->dev_counter--;
 994        mutex_unlock(&group->device_lock);
 995
 996        /*
 997         * In order to support multiple devices per group, devices can be
 998         * plucked from the group while other devices in the group are still
 999         * in use.  The container persists with this group and those remaining
1000         * devices still attached.  If the user creates an isolation violation
1001         * by binding this device to another driver while the group is still in
1002         * use, that's their fault.  However, in the case of removing the last,
1003         * or potentially the only, device in the group there can be no other
1004         * in-use devices in the group.  The user has done their due diligence
1005         * and we should lay no claims to those devices.  In order to do that,
1006         * we need to make sure the group is detached from the container.
1007         * Without this stall, we're potentially racing with a user process
1008         * that may attempt to immediately bind this device to another driver.
1009         */
1010        if (list_empty(&group->device_list))
1011                wait_event(group->container_q, !group->container);
1012
1013        /* Matches the get in vfio_register_group_dev() */
1014        vfio_group_put(group);
1015}
1016EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
1017
1018/**
1019 * VFIO base fd, /dev/vfio/vfio
1020 */
1021static long vfio_ioctl_check_extension(struct vfio_container *container,
1022                                       unsigned long arg)
1023{
1024        struct vfio_iommu_driver *driver;
1025        long ret = 0;
1026
1027        down_read(&container->group_lock);
1028
1029        driver = container->iommu_driver;
1030
1031        switch (arg) {
1032                /* No base extensions yet */
1033        default:
1034                /*
1035                 * If no driver is set, poll all registered drivers for
1036                 * extensions and return the first positive result.  If
1037                 * a driver is already set, further queries will be passed
1038                 * only to that driver.
1039                 */
1040                if (!driver) {
1041                        mutex_lock(&vfio.iommu_drivers_lock);
1042                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
1043                                            vfio_next) {
1044
1045#ifdef CONFIG_VFIO_NOIOMMU
1046                                if (!list_empty(&container->group_list) &&
1047                                    (container->noiommu !=
1048                                     (driver->ops == &vfio_noiommu_ops)))
1049                                        continue;
1050#endif
1051
1052                                if (!try_module_get(driver->ops->owner))
1053                                        continue;
1054
1055                                ret = driver->ops->ioctl(NULL,
1056                                                         VFIO_CHECK_EXTENSION,
1057                                                         arg);
1058                                module_put(driver->ops->owner);
1059                                if (ret > 0)
1060                                        break;
1061                        }
1062                        mutex_unlock(&vfio.iommu_drivers_lock);
1063                } else
1064                        ret = driver->ops->ioctl(container->iommu_data,
1065                                                 VFIO_CHECK_EXTENSION, arg);
1066        }
1067
1068        up_read(&container->group_lock);
1069
1070        return ret;
1071}
1072
1073/* hold write lock on container->group_lock */
1074static int __vfio_container_attach_groups(struct vfio_container *container,
1075                                          struct vfio_iommu_driver *driver,
1076                                          void *data)
1077{
1078        struct vfio_group *group;
1079        int ret = -ENODEV;
1080
1081        list_for_each_entry(group, &container->group_list, container_next) {
1082                ret = driver->ops->attach_group(data, group->iommu_group);
1083                if (ret)
1084                        goto unwind;
1085        }
1086
1087        return ret;
1088
1089unwind:
1090        list_for_each_entry_continue_reverse(group, &container->group_list,
1091                                             container_next) {
1092                driver->ops->detach_group(data, group->iommu_group);
1093        }
1094
1095        return ret;
1096}
1097
1098static long vfio_ioctl_set_iommu(struct vfio_container *container,
1099                                 unsigned long arg)
1100{
1101        struct vfio_iommu_driver *driver;
1102        long ret = -ENODEV;
1103
1104        down_write(&container->group_lock);
1105
1106        /*
1107         * The container is designed to be an unprivileged interface while
1108         * the group can be assigned to specific users.  Therefore, only by
1109         * adding a group to a container does the user get the privilege of
1110         * enabling the iommu, which may allocate finite resources.  There
1111         * is no unset_iommu, but by removing all the groups from a container,
1112         * the container is deprivileged and returns to an unset state.
1113         */
1114        if (list_empty(&container->group_list) || container->iommu_driver) {
1115                up_write(&container->group_lock);
1116                return -EINVAL;
1117        }
1118
1119        mutex_lock(&vfio.iommu_drivers_lock);
1120        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1121                void *data;
1122
1123#ifdef CONFIG_VFIO_NOIOMMU
1124                /*
1125                 * Only noiommu containers can use vfio-noiommu and noiommu
1126                 * containers can only use vfio-noiommu.
1127                 */
1128                if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1129                        continue;
1130#endif
1131
1132                if (!try_module_get(driver->ops->owner))
1133                        continue;
1134
1135                /*
1136                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1137                 * so test which iommu driver reported support for this
1138                 * extension and call open on them.  We also pass them the
1139                 * magic, allowing a single driver to support multiple
1140                 * interfaces if they'd like.
1141                 */
1142                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1143                        module_put(driver->ops->owner);
1144                        continue;
1145                }
1146
1147                data = driver->ops->open(arg);
1148                if (IS_ERR(data)) {
1149                        ret = PTR_ERR(data);
1150                        module_put(driver->ops->owner);
1151                        continue;
1152                }
1153
1154                ret = __vfio_container_attach_groups(container, driver, data);
1155                if (ret) {
1156                        driver->ops->release(data);
1157                        module_put(driver->ops->owner);
1158                        continue;
1159                }
1160
1161                container->iommu_driver = driver;
1162                container->iommu_data = data;
1163                break;
1164        }
1165
1166        mutex_unlock(&vfio.iommu_drivers_lock);
1167        up_write(&container->group_lock);
1168
1169        return ret;
1170}
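
/*
 * Userspace view (illustrative sketch; the group number and device name are
 * examples only): the check above means a group must be added to the
 * container before VFIO_SET_IOMMU can succeed, so the canonical setup order
 * is:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *	int device;
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 *
 * Calling VFIO_SET_IOMMU on an empty container fails with -EINVAL per the
 * list_empty() check above.
 */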
1171
1172static long vfio_fops_unl_ioctl(struct file *filep,
1173                                unsigned int cmd, unsigned long arg)
1174{
1175        struct vfio_container *container = filep->private_data;
1176        struct vfio_iommu_driver *driver;
1177        void *data;
1178        long ret = -EINVAL;
1179
1180        if (!container)
1181                return ret;
1182
1183        switch (cmd) {
1184        case VFIO_GET_API_VERSION:
1185                ret = VFIO_API_VERSION;
1186                break;
1187        case VFIO_CHECK_EXTENSION:
1188                ret = vfio_ioctl_check_extension(container, arg);
1189                break;
1190        case VFIO_SET_IOMMU:
1191                ret = vfio_ioctl_set_iommu(container, arg);
1192                break;
1193        default:
1194                driver = container->iommu_driver;
1195                data = container->iommu_data;
1196
1197                if (driver) /* passthrough all unrecognized ioctls */
1198                        ret = driver->ops->ioctl(data, cmd, arg);
1199        }
1200
1201        return ret;
1202}
1203
1204static int vfio_fops_open(struct inode *inode, struct file *filep)
1205{
1206        struct vfio_container *container;
1207
1208        container = kzalloc(sizeof(*container), GFP_KERNEL);
1209        if (!container)
1210                return -ENOMEM;
1211
1212        INIT_LIST_HEAD(&container->group_list);
1213        init_rwsem(&container->group_lock);
1214        kref_init(&container->kref);
1215
1216        filep->private_data = container;
1217
1218        return 0;
1219}
1220
1221static int vfio_fops_release(struct inode *inode, struct file *filep)
1222{
1223        struct vfio_container *container = filep->private_data;
1224        struct vfio_iommu_driver *driver = container->iommu_driver;
1225
1226        if (driver && driver->ops->notify)
1227                driver->ops->notify(container->iommu_data,
1228                                    VFIO_IOMMU_CONTAINER_CLOSE);
1229
1230        filep->private_data = NULL;
1231
1232        vfio_container_put(container);
1233
1234        return 0;
1235}
1236
1237/*
1238 * Once an iommu driver is set, we optionally pass read/write/mmap
1239 * on to the driver, allowing management interfaces beyond ioctl.
1240 */
1241static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1242                              size_t count, loff_t *ppos)
1243{
1244        struct vfio_container *container = filep->private_data;
1245        struct vfio_iommu_driver *driver;
1246        ssize_t ret = -EINVAL;
1247
1248        driver = container->iommu_driver;
1249        if (likely(driver && driver->ops->read))
1250                ret = driver->ops->read(container->iommu_data,
1251                                        buf, count, ppos);
1252
1253        return ret;
1254}
1255
1256static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1257                               size_t count, loff_t *ppos)
1258{
1259        struct vfio_container *container = filep->private_data;
1260        struct vfio_iommu_driver *driver;
1261        ssize_t ret = -EINVAL;
1262
1263        driver = container->iommu_driver;
1264        if (likely(driver && driver->ops->write))
1265                ret = driver->ops->write(container->iommu_data,
1266                                         buf, count, ppos);
1267
1268        return ret;
1269}
1270
1271static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1272{
1273        struct vfio_container *container = filep->private_data;
1274        struct vfio_iommu_driver *driver;
1275        int ret = -EINVAL;
1276
1277        driver = container->iommu_driver;
1278        if (likely(driver && driver->ops->mmap))
1279                ret = driver->ops->mmap(container->iommu_data, vma);
1280
1281        return ret;
1282}
1283
1284static const struct file_operations vfio_fops = {
1285        .owner          = THIS_MODULE,
1286        .open           = vfio_fops_open,
1287        .release        = vfio_fops_release,
1288        .read           = vfio_fops_read,
1289        .write          = vfio_fops_write,
1290        .unlocked_ioctl = vfio_fops_unl_ioctl,
1291        .compat_ioctl   = compat_ptr_ioctl,
1292        .mmap           = vfio_fops_mmap,
1293};
1294
1295/**
1296 * VFIO Group fd, /dev/vfio/$GROUP
1297 */
1298static void __vfio_group_unset_container(struct vfio_group *group)
1299{
1300        struct vfio_container *container = group->container;
1301        struct vfio_iommu_driver *driver;
1302
1303        down_write(&container->group_lock);
1304
1305        driver = container->iommu_driver;
1306        if (driver)
1307                driver->ops->detach_group(container->iommu_data,
1308                                          group->iommu_group);
1309
1310        group->container = NULL;
1311        wake_up(&group->container_q);
1312        list_del(&group->container_next);
1313
1314        /* Detaching the last group deprivileges a container, remove iommu */
1315        if (driver && list_empty(&container->group_list)) {
1316                driver->ops->release(container->iommu_data);
1317                module_put(driver->ops->owner);
1318                container->iommu_driver = NULL;
1319                container->iommu_data = NULL;
1320        }
1321
1322        up_write(&container->group_lock);
1323
1324        vfio_container_put(container);
1325}
1326
1327/*
1328 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1329 * if there was no container to unset.  Since the ioctl is called on
 1330 * the group, we know the group still exists; therefore the only valid
1331 * transition here is 1->0.
1332 */
1333static int vfio_group_unset_container(struct vfio_group *group)
1334{
1335        int users = atomic_cmpxchg(&group->container_users, 1, 0);
1336
1337        if (!users)
1338                return -EINVAL;
1339        if (users != 1)
1340                return -EBUSY;
1341
1342        __vfio_group_unset_container(group);
1343
1344        return 0;
1345}
1346
1347/*
1348 * When removing container users, anything that removes the last user
1349 * implicitly removes the group from the container.  That is, if the
1350 * group file descriptor is closed, as well as any device file descriptors,
1351 * the group is free.
1352 */
1353static void vfio_group_try_dissolve_container(struct vfio_group *group)
1354{
1355        if (0 == atomic_dec_if_positive(&group->container_users))
1356                __vfio_group_unset_container(group);
1357}
1358
1359static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1360{
1361        struct fd f;
1362        struct vfio_container *container;
1363        struct vfio_iommu_driver *driver;
1364        int ret = 0;
1365
1366        if (atomic_read(&group->container_users))
1367                return -EINVAL;
1368
1369        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1370                return -EPERM;
1371
1372        f = fdget(container_fd);
1373        if (!f.file)
1374                return -EBADF;
1375
1376        /* Sanity check, is this really our fd? */
1377        if (f.file->f_op != &vfio_fops) {
1378                fdput(f);
1379                return -EINVAL;
1380        }
1381
1382        container = f.file->private_data;
1383        WARN_ON(!container); /* fget ensures we don't race vfio_release */
1384
1385        down_write(&container->group_lock);
1386
1387        /* Real groups and fake groups cannot mix */
1388        if (!list_empty(&container->group_list) &&
1389            container->noiommu != group->noiommu) {
1390                ret = -EPERM;
1391                goto unlock_out;
1392        }
1393
1394        driver = container->iommu_driver;
1395        if (driver) {
1396                ret = driver->ops->attach_group(container->iommu_data,
1397                                                group->iommu_group);
1398                if (ret)
1399                        goto unlock_out;
1400        }
1401
1402        group->container = container;
1403        container->noiommu = group->noiommu;
1404        list_add(&group->container_next, &container->group_list);
1405
1406        /* Get a reference on the container and mark a user within the group */
1407        vfio_container_get(container);
1408        atomic_inc(&group->container_users);
1409
1410unlock_out:
1411        up_write(&container->group_lock);
1412        fdput(f);
1413        return ret;
1414}
1415
1416static bool vfio_group_viable(struct vfio_group *group)
1417{
1418        return (iommu_group_for_each_dev(group->iommu_group,
1419                                         group, vfio_dev_viable) == 0);
1420}
1421
1422static int vfio_group_add_container_user(struct vfio_group *group)
1423{
1424        if (!atomic_inc_not_zero(&group->container_users))
1425                return -EINVAL;
1426
1427        if (group->noiommu) {
1428                atomic_dec(&group->container_users);
1429                return -EPERM;
1430        }
1431        if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1432                atomic_dec(&group->container_users);
1433                return -EINVAL;
1434        }
1435
1436        return 0;
1437}
1438
1439static const struct file_operations vfio_device_fops;
1440
1441static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1442{
1443        struct vfio_device *device;
1444        struct file *filep;
1445        int fdno;
1446        int ret = 0;
1447
1448        if (0 == atomic_read(&group->container_users) ||
1449            !group->container->iommu_driver || !vfio_group_viable(group))
1450                return -EINVAL;
1451
1452        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1453                return -EPERM;
1454
1455        device = vfio_device_get_from_name(group, buf);
1456        if (IS_ERR(device))
1457                return PTR_ERR(device);
1458
1459        if (!try_module_get(device->dev->driver->owner)) {
1460                ret = -ENODEV;
1461                goto err_device_put;
1462        }
1463
1464        mutex_lock(&device->dev_set->lock);
1465        device->open_count++;
1466        if (device->open_count == 1 && device->ops->open_device) {
1467                ret = device->ops->open_device(device);
1468                if (ret)
1469                        goto err_undo_count;
1470        }
1471        mutex_unlock(&device->dev_set->lock);
1472
1473        /*
1474         * We can't use anon_inode_getfd() because we need to modify
1475         * the f_mode flags directly to allow more than just ioctls
1476         */
1477        fdno = ret = get_unused_fd_flags(O_CLOEXEC);
1478        if (ret < 0)
1479                goto err_close_device;
1480
1481        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1482                                   device, O_RDWR);
1483        if (IS_ERR(filep)) {
1484                ret = PTR_ERR(filep);
1485                goto err_fd;
1486        }
1487
1488        /*
1489         * TODO: add an anon_inode interface to do this.
1490         * Appears to be missing by lack of need rather than
1491         * explicitly prevented.  Now there's need.
1492         */
1493        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1494
1495        atomic_inc(&group->container_users);
1496
1497        fd_install(fdno, filep);
1498
1499        if (group->noiommu)
1500                dev_warn(device->dev, "vfio-noiommu device opened by user "
1501                         "(%s:%d)\n", current->comm, task_pid_nr(current));
1502        return fdno;
1503
1504err_fd:
1505        put_unused_fd(fdno);
1506err_close_device:
1507        mutex_lock(&device->dev_set->lock);
1508        if (device->open_count == 1 && device->ops->close_device)
1509                device->ops->close_device(device);
1510err_undo_count:
1511        device->open_count--;
1512        mutex_unlock(&device->dev_set->lock);
1513        module_put(device->dev->driver->owner);
1514err_device_put:
1515        vfio_device_put(device);
1516        return ret;
1517}
1518
1519static long vfio_group_fops_unl_ioctl(struct file *filep,
1520                                      unsigned int cmd, unsigned long arg)
1521{
1522        struct vfio_group *group = filep->private_data;
1523        long ret = -ENOTTY;
1524
1525        switch (cmd) {
1526        case VFIO_GROUP_GET_STATUS:
1527        {
1528                struct vfio_group_status status;
1529                unsigned long minsz;
1530
1531                minsz = offsetofend(struct vfio_group_status, flags);
1532
1533                if (copy_from_user(&status, (void __user *)arg, minsz))
1534                        return -EFAULT;
1535
1536                if (status.argsz < minsz)
1537                        return -EINVAL;
1538
1539                status.flags = 0;
1540
1541                if (vfio_group_viable(group))
1542                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1543
1544                if (group->container)
1545                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1546
1547                if (copy_to_user((void __user *)arg, &status, minsz))
1548                        return -EFAULT;
1549
1550                ret = 0;
1551                break;
1552        }
1553        case VFIO_GROUP_SET_CONTAINER:
1554        {
1555                int fd;
1556
1557                if (get_user(fd, (int __user *)arg))
1558                        return -EFAULT;
1559
1560                if (fd < 0)
1561                        return -EINVAL;
1562
1563                ret = vfio_group_set_container(group, fd);
1564                break;
1565        }
1566        case VFIO_GROUP_UNSET_CONTAINER:
1567                ret = vfio_group_unset_container(group);
1568                break;
1569        case VFIO_GROUP_GET_DEVICE_FD:
1570        {
1571                char *buf;
1572
1573                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1574                if (IS_ERR(buf))
1575                        return PTR_ERR(buf);
1576
1577                ret = vfio_group_get_device_fd(group, buf);
1578                kfree(buf);
1579                break;
1580        }
1581        }
1582
1583        return ret;
1584}
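
/*
 * For reference, a typical user space sequence driving the ioctls handled
 * above looks roughly like the following sketch; the group number, device
 * name and IOMMU type are examples only:
 *
 *        struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *        container = open("/dev/vfio/vfio", O_RDWR);
 *        group = open("/dev/vfio/26", O_RDWR);
 *
 *        ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *        if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *                abort();        - not all group devices are bound to vfio
 *
 *        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *        ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *        device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */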
1585
1586static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1587{
1588        struct vfio_group *group;
1589        int opened;
1590
1591        group = vfio_group_get_from_minor(iminor(inode));
1592        if (!group)
1593                return -ENODEV;
1594
1595        if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1596                vfio_group_put(group);
1597                return -EPERM;
1598        }
1599
1600        /* Do we need multiple instances of the group open?  Seems not. */
1601        opened = atomic_cmpxchg(&group->opened, 0, 1);
1602        if (opened) {
1603                vfio_group_put(group);
1604                return -EBUSY;
1605        }
1606
1607        /* Is something still in use from a previous open? */
1608        if (group->container) {
1609                atomic_dec(&group->opened);
1610                vfio_group_put(group);
1611                return -EBUSY;
1612        }
1613
1614        /* Warn if the previous user didn't clean up; re-init to drop them */
1615        if (WARN_ON(group->notifier.head))
1616                BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1617
1618        filep->private_data = group;
1619
1620        return 0;
1621}
1622
1623static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1624{
1625        struct vfio_group *group = filep->private_data;
1626
1627        filep->private_data = NULL;
1628
1629        vfio_group_try_dissolve_container(group);
1630
1631        atomic_dec(&group->opened);
1632
1633        vfio_group_put(group);
1634
1635        return 0;
1636}
1637
1638static const struct file_operations vfio_group_fops = {
1639        .owner          = THIS_MODULE,
1640        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1641        .compat_ioctl   = compat_ptr_ioctl,
1642        .open           = vfio_group_fops_open,
1643        .release        = vfio_group_fops_release,
1644};
1645
1646/**
1647 * VFIO Device fd
1648 */
1649static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1650{
1651        struct vfio_device *device = filep->private_data;
1652
1653        mutex_lock(&device->dev_set->lock);
1654        if (!--device->open_count && device->ops->close_device)
1655                device->ops->close_device(device);
1656        mutex_unlock(&device->dev_set->lock);
1657
1658        module_put(device->dev->driver->owner);
1659
1660        vfio_group_try_dissolve_container(device->group);
1661
1662        vfio_device_put(device);
1663
1664        return 0;
1665}
1666
1667static long vfio_device_fops_unl_ioctl(struct file *filep,
1668                                       unsigned int cmd, unsigned long arg)
1669{
1670        struct vfio_device *device = filep->private_data;
1671
1672        if (unlikely(!device->ops->ioctl))
1673                return -EINVAL;
1674
1675        return device->ops->ioctl(device, cmd, arg);
1676}
1677
1678static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1679                                     size_t count, loff_t *ppos)
1680{
1681        struct vfio_device *device = filep->private_data;
1682
1683        if (unlikely(!device->ops->read))
1684                return -EINVAL;
1685
1686        return device->ops->read(device, buf, count, ppos);
1687}
1688
1689static ssize_t vfio_device_fops_write(struct file *filep,
1690                                      const char __user *buf,
1691                                      size_t count, loff_t *ppos)
1692{
1693        struct vfio_device *device = filep->private_data;
1694
1695        if (unlikely(!device->ops->write))
1696                return -EINVAL;
1697
1698        return device->ops->write(device, buf, count, ppos);
1699}
1700
1701static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1702{
1703        struct vfio_device *device = filep->private_data;
1704
1705        if (unlikely(!device->ops->mmap))
1706                return -EINVAL;
1707
1708        return device->ops->mmap(device, vma);
1709}
1710
1711static const struct file_operations vfio_device_fops = {
1712        .owner          = THIS_MODULE,
1713        .release        = vfio_device_fops_release,
1714        .read           = vfio_device_fops_read,
1715        .write          = vfio_device_fops_write,
1716        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1717        .compat_ioctl   = compat_ptr_ioctl,
1718        .mmap           = vfio_device_fops_mmap,
1719};
1720
1721/**
1722 * External user API, exported by symbols to be linked dynamically.
1723 *
1724 * The protocol includes:
1725 *  1. User space performs the normal VFIO init operations:
1726 *      - opening a new container;
1727 *      - attaching group(s) to it;
1728 *      - setting an IOMMU driver for a container.
1729 * When IOMMU is set for a container, all groups in it are
1730 * considered ready to use by an external user.
1731 *
1732 * 2. User space passes a group fd to an external user.
1733 * The external user calls vfio_group_get_external_user()
1734 * to verify that:
1735 *      - the group is initialized;
1736 *      - IOMMU is set for it.
1737 * If both checks pass, vfio_group_get_external_user()
1738 * increments the container user counter to prevent the
1739 * VFIO group from being disposed of before KVM exits.
1740 *
1741 * 3. The external user calls vfio_external_user_iommu_id()
1742 * to obtain the IOMMU group ID.
1743 *
1744 * 4. When the external user (e.g. KVM) finishes, it calls
1745 * vfio_group_put_external_user() to release the VFIO group.
1746 * This call decrements the container user counter.
1747 */
1748struct vfio_group *vfio_group_get_external_user(struct file *filep)
1749{
1750        struct vfio_group *group = filep->private_data;
1751        int ret;
1752
1753        if (filep->f_op != &vfio_group_fops)
1754                return ERR_PTR(-EINVAL);
1755
1756        ret = vfio_group_add_container_user(group);
1757        if (ret)
1758                return ERR_PTR(ret);
1759
1760        vfio_group_get(group);
1761
1762        return group;
1763}
1764EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1765
1766/**
1767 * External user API, exported by symbols to be linked dynamically.
1768 * The external user passes in a device pointer
1769 * to verify that:
1770 *      - a VFIO group is associated with the device;
1771 *      - an IOMMU is set for the group.
1772 * If both checks pass, vfio_group_get_external_user_from_dev()
1773 * increments the container user counter to prevent the VFIO group
1774 * from being disposed of before the external user exits, and returns
1775 * a pointer to the VFIO group.
1776 *
1777 * When the external user finishes using the VFIO group, it calls
1778 * vfio_group_put_external_user() to release the VFIO group and
1779 * decrement the container user counter.
1780 *
1781 * @dev [in]    : device
1782 * Return error PTR or pointer to VFIO group.
1783 */
1784
1785struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1786{
1787        struct vfio_group *group;
1788        int ret;
1789
1790        group = vfio_group_get_from_dev(dev);
1791        if (!group)
1792                return ERR_PTR(-ENODEV);
1793
1794        ret = vfio_group_add_container_user(group);
1795        if (ret) {
1796                vfio_group_put(group);
1797                return ERR_PTR(ret);
1798        }
1799
1800        return group;
1801}
1802EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1803
1804void vfio_group_put_external_user(struct vfio_group *group)
1805{
1806        vfio_group_try_dissolve_container(group);
1807        vfio_group_put(group);
1808}
1809EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1810
1811bool vfio_external_group_match_file(struct vfio_group *test_group,
1812                                    struct file *filep)
1813{
1814        struct vfio_group *group = filep->private_data;
1815
1816        return (filep->f_op == &vfio_group_fops) && (group == test_group);
1817}
1818EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1819
1820int vfio_external_user_iommu_id(struct vfio_group *group)
1821{
1822        return iommu_group_id(group->iommu_group);
1823}
1824EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1825
1826long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1827{
1828        return vfio_ioctl_check_extension(group->container, arg);
1829}
1830EXPORT_SYMBOL_GPL(vfio_external_check_extension);
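
/*
 * A minimal sketch of how an external user such as KVM might consume the
 * API above; the function below is illustrative only (its name and error
 * policy are not part of this driver).
 */
static int __maybe_unused example_external_user_attach(int group_fd)
{
        struct file *filep = fget(group_fd);
        struct vfio_group *group;
        int iommu_id;

        if (!filep)
                return -EBADF;

        group = vfio_group_get_external_user(filep);
        fput(filep);            /* the group holds its own references now */
        if (IS_ERR(group))
                return PTR_ERR(group);

        iommu_id = vfio_external_user_iommu_id(group);
        pr_debug("external user bound to iommu group %d\n", iommu_id);

        /* ... use the group, then drop the reference when done ... */
        vfio_group_put_external_user(group);
        return 0;
}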
1831
1832/**
1833 * Sub-module support
1834 */
1835/*
1836 * Helper for managing a buffer of info chain capabilities: allocate or
1837 * reallocate the buffer with additional @size, filling in @id and @version
1838 * of the capability.  A pointer to the new capability is returned.
1839 *
1840 * NB. The chain is based at the head of the buffer, so new entries are
1841 * added to the tail; vfio_info_cap_shift() should be called to fix up the
1842 * next offsets prior to copying to the user buffer.
1843 */
1844struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1845                                               size_t size, u16 id, u16 version)
1846{
1847        void *buf;
1848        struct vfio_info_cap_header *header, *tmp;
1849
1850        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1851        if (!buf) {
1852                kfree(caps->buf);
1853                caps->size = 0;
1854                return ERR_PTR(-ENOMEM);
1855        }
1856
1857        caps->buf = buf;
1858        header = buf + caps->size;
1859
1860        /* Eventually copied to user buffer, zero */
1861        memset(header, 0, size);
1862
1863        header->id = id;
1864        header->version = version;
1865
1866        /* Add to the end of the capability chain */
1867        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1868                ; /* nothing */
1869
1870        tmp->next = caps->size;
1871        caps->size += size;
1872
1873        return header;
1874}
1875EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1876
1877void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1878{
1879        struct vfio_info_cap_header *tmp;
1880        void *buf = (void *)caps->buf;
1881
1882        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1883                tmp->next += offset;
1884}
1885EXPORT_SYMBOL(vfio_info_cap_shift);
1886
1887int vfio_info_add_capability(struct vfio_info_cap *caps,
1888                             struct vfio_info_cap_header *cap, size_t size)
1889{
1890        struct vfio_info_cap_header *header;
1891
1892        header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1893        if (IS_ERR(header))
1894                return PTR_ERR(header);
1895
1896        memcpy(header + 1, cap + 1, size - sizeof(*header));
1897
1898        return 0;
1899}
1900EXPORT_SYMBOL(vfio_info_add_capability);
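
/*
 * A minimal sketch of how a bus driver might build a capability chain with
 * the helpers above.  "struct example_cap" and its ID are hypothetical; real
 * callers embed types such as struct vfio_region_info_cap_sparse_mmap and
 * call vfio_info_cap_shift() on the chain before copying it to user space.
 */
struct example_cap {
        struct vfio_info_cap_header header;
        __u32 flags;
};

static int __maybe_unused example_add_cap(struct vfio_info_cap *caps)
{
        struct example_cap cap = {
                .header.id = 0xffff,            /* hypothetical capability ID */
                .header.version = 1,
                .flags = 0,
        };

        /* appends the capability to the chain; offsets are fixed up later */
        return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}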
1901
1902int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1903                                       int max_irq_type, size_t *data_size)
1904{
1905        unsigned long minsz;
1906        size_t size;
1907
1908        minsz = offsetofend(struct vfio_irq_set, count);
1909
1910        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1911            (hdr->count >= (U32_MAX - hdr->start)) ||
1912            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1913                                VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1914                return -EINVAL;
1915
1916        if (data_size)
1917                *data_size = 0;
1918
1919        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1920                return -EINVAL;
1921
1922        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1923        case VFIO_IRQ_SET_DATA_NONE:
1924                size = 0;
1925                break;
1926        case VFIO_IRQ_SET_DATA_BOOL:
1927                size = sizeof(uint8_t);
1928                break;
1929        case VFIO_IRQ_SET_DATA_EVENTFD:
1930                size = sizeof(int32_t);
1931                break;
1932        default:
1933                return -EINVAL;
1934        }
1935
1936        if (size) {
1937                if (hdr->argsz - minsz < hdr->count * size)
1938                        return -EINVAL;
1939
1940                if (!data_size)
1941                        return -EINVAL;
1942
1943                *data_size = hdr->count * size;
1944        }
1945
1946        return 0;
1947}
1948EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
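
/*
 * A sketch of a bus driver's VFIO_DEVICE_SET_IRQS path using the helper
 * above; "num_irqs", the VFIO_PCI_NUM_IRQS bound and the error handling are
 * illustrative, not this driver's code.
 */
static int __maybe_unused example_set_irqs(struct vfio_irq_set *hdr,
                                           void __user *udata, int num_irqs)
{
        size_t data_size = 0;
        u8 *data = NULL;
        int ret;

        ret = vfio_set_irqs_validate_and_prepare(hdr, num_irqs,
                                                 VFIO_PCI_NUM_IRQS, &data_size);
        if (ret)
                return ret;

        if (data_size) {
                data = memdup_user(udata, data_size);
                if (IS_ERR(data))
                        return PTR_ERR(data);
        }

        /* ... hand hdr and data to the device-specific SET_IRQS handler ... */

        kfree(data);
        return ret;
}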
1949
1950/*
1951 * Pin a set of guest PFNs and return their associated host PFNs for local
1952 * domain only.
1953 * @dev [in]     : device
1954 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1955 * @npage [in]   : count of elements in user_pfn array.  This count should not
1956 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1957 * @prot [in]    : protection flags
1958 * @phys_pfn[out]: array of host PFNs
1959 * Return error or number of pages pinned.
1960 */
1961int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1962                   int prot, unsigned long *phys_pfn)
1963{
1964        struct vfio_container *container;
1965        struct vfio_group *group;
1966        struct vfio_iommu_driver *driver;
1967        int ret;
1968
1969        if (!dev || !user_pfn || !phys_pfn || !npage)
1970                return -EINVAL;
1971
1972        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1973                return -E2BIG;
1974
1975        group = vfio_group_get_from_dev(dev);
1976        if (!group)
1977                return -ENODEV;
1978
1979        if (group->dev_counter > 1) {
1980                ret = -EINVAL;
1981                goto err_pin_pages;
1982        }
1983
1984        ret = vfio_group_add_container_user(group);
1985        if (ret)
1986                goto err_pin_pages;
1987
1988        container = group->container;
1989        driver = container->iommu_driver;
1990        if (likely(driver && driver->ops->pin_pages))
1991                ret = driver->ops->pin_pages(container->iommu_data,
1992                                             group->iommu_group, user_pfn,
1993                                             npage, prot, phys_pfn);
1994        else
1995                ret = -ENOTTY;
1996
1997        vfio_group_try_dissolve_container(group);
1998
1999err_pin_pages:
2000        vfio_group_put(group);
2001        return ret;
2002}
2003EXPORT_SYMBOL(vfio_pin_pages);
2004
2005/*
2006 * Unpin a set of host PFNs for local domain only.
2007 * @dev [in]     : device
2008 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2009 *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2010 * @npage [in]   : count of elements in user_pfn array.  This count should not
2011 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2012 * Return error or number of pages unpinned.
2013 */
2014int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2015{
2016        struct vfio_container *container;
2017        struct vfio_group *group;
2018        struct vfio_iommu_driver *driver;
2019        int ret;
2020
2021        if (!dev || !user_pfn || !npage)
2022                return -EINVAL;
2023
2024        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2025                return -E2BIG;
2026
2027        group = vfio_group_get_from_dev(dev);
2028        if (!group)
2029                return -ENODEV;
2030
2031        ret = vfio_group_add_container_user(group);
2032        if (ret)
2033                goto err_unpin_pages;
2034
2035        container = group->container;
2036        driver = container->iommu_driver;
2037        if (likely(driver && driver->ops->unpin_pages))
2038                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2039                                               npage);
2040        else
2041                ret = -ENOTTY;
2042
2043        vfio_group_try_dissolve_container(group);
2044
2045err_unpin_pages:
2046        vfio_group_put(group);
2047        return ret;
2048}
2049EXPORT_SYMBOL(vfio_unpin_pages);
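
/*
 * A minimal sketch of how a mediated-device vendor driver might translate a
 * single guest PFN with the two helpers above; the function name and error
 * policy are illustrative only.
 */
static int __maybe_unused example_gfn_to_hpfn(struct device *mdev_dev,
                                              unsigned long gfn,
                                              unsigned long *hpfn)
{
        int ret;

        ret = vfio_pin_pages(mdev_dev, &gfn, 1, IOMMU_READ | IOMMU_WRITE, hpfn);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        /*
         * The page stays pinned until the device is done with it, at which
         * point the driver calls vfio_unpin_pages(mdev_dev, &gfn, 1).
         */
        return 0;
}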
2050
2051/*
2052 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
2053 * VFIO group.
2054 *
2055 * The caller needs to call vfio_group_get_external_user() or
2056 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2057 * so as to prevent the VFIO group from being disposed of in the middle
2058 * of the call.  The same reference may be kept across several calls into
2059 * this interface.
2060 * Once it has finished using the VFIO group, the caller releases it by
2061 * calling vfio_group_put_external_user().
2062 *
2063 * @group [in]          : VFIO group
2064 * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be pinned.
2065 * @npage [in]          : count of elements in user_iova_pfn array.
2066 *                        This count should not be greater than
2067 *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2068 * @prot [in]           : protection flags
2069 * @phys_pfn [out]      : array of host PFNs
2070 * Return error or number of pages pinned.
2071 */
2072int vfio_group_pin_pages(struct vfio_group *group,
2073                         unsigned long *user_iova_pfn, int npage,
2074                         int prot, unsigned long *phys_pfn)
2075{
2076        struct vfio_container *container;
2077        struct vfio_iommu_driver *driver;
2078        int ret;
2079
2080        if (!group || !user_iova_pfn || !phys_pfn || !npage)
2081                return -EINVAL;
2082
2083        if (group->dev_counter > 1)
2084                return -EINVAL;
2085
2086        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2087                return -E2BIG;
2088
2089        container = group->container;
2090        driver = container->iommu_driver;
2091        if (likely(driver && driver->ops->pin_pages))
2092                ret = driver->ops->pin_pages(container->iommu_data,
2093                                             group->iommu_group, user_iova_pfn,
2094                                             npage, prot, phys_pfn);
2095        else
2096                ret = -ENOTTY;
2097
2098        return ret;
2099}
2100EXPORT_SYMBOL(vfio_group_pin_pages);
2101
2102/*
2103 * Unpin a set of guest IOVA PFNs for a VFIO group.
2104 *
2105 * The caller needs to call vfio_group_get_external_user() or
2106 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2107 * so as to prevent the VFIO group from being disposed of in the middle
2108 * of the call.  The same reference may be kept across several calls into
2109 * this interface.
2110 * Once it has finished using the VFIO group, the caller releases it by
2111 * calling vfio_group_put_external_user().
2112 *
2113 * @group [in]          : vfio group
2114 * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be unpinned.
2115 * @npage [in]          : count of elements in user_iova_pfn array.
2116 *                        This count should not be greater than
2117 *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2118 * Return error or number of pages unpinned.
2119 */
2120int vfio_group_unpin_pages(struct vfio_group *group,
2121                           unsigned long *user_iova_pfn, int npage)
2122{
2123        struct vfio_container *container;
2124        struct vfio_iommu_driver *driver;
2125        int ret;
2126
2127        if (!group || !user_iova_pfn || !npage)
2128                return -EINVAL;
2129
2130        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2131                return -E2BIG;
2132
2133        container = group->container;
2134        driver = container->iommu_driver;
2135        if (likely(driver && driver->ops->unpin_pages))
2136                ret = driver->ops->unpin_pages(container->iommu_data,
2137                                               user_iova_pfn, npage);
2138        else
2139                ret = -ENOTTY;
2140
2141        return ret;
2142}
2143EXPORT_SYMBOL(vfio_group_unpin_pages);
2144
2145
2146/*
2147 * This interface allows the CPU to perform a form of virtual DMA on
2148 * behalf of the device.
2149 *
2150 * The CPU reads from or writes to a range of IOVAs pointing to user space
2151 * memory, copying into or from a kernel buffer.
2152 *
2153 * As the access to user space memory is performed by the CPU and is not
2154 * real device DMA, there is no need to pin the user space memory.
2155 *
2156 * The caller needs to call vfio_group_get_external_user() or
2157 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2158 * so as to prevent the VFIO group from being disposed of in the middle
2159 * of the call.  The same reference may be kept across several calls into
2160 * this interface.
2161 * Once it has finished using the VFIO group, the caller releases it by
2162 * calling vfio_group_put_external_user().
2163 *
2164 * @group [in]          : VFIO group
2165 * @user_iova [in]      : base IOVA of a user space buffer
2166 * @data [in]           : pointer to kernel buffer
2167 * @len [in]            : kernel buffer length
2168 * @write [in]          : indicate write (true) or read (false)
2169 * Return error code on failure or 0 on success.
2170 */
2171int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2172                void *data, size_t len, bool write)
2173{
2174        struct vfio_container *container;
2175        struct vfio_iommu_driver *driver;
2176        int ret = 0;
2177
2178        if (!group || !data || len <= 0)
2179                return -EINVAL;
2180
2181        container = group->container;
2182        driver = container->iommu_driver;
2183
2184        if (likely(driver && driver->ops->dma_rw))
2185                ret = driver->ops->dma_rw(container->iommu_data,
2186                                          user_iova, data, len, write);
2187        else
2188                ret = -ENOTTY;
2189
2190        return ret;
2191}
2192EXPORT_SYMBOL(vfio_dma_rw);
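
/*
 * A minimal sketch of an external user (already holding a reference from
 * vfio_group_get_external_user()) reading guest memory by IOVA through
 * vfio_dma_rw(); the wrapper below is illustrative only.
 */
static int __maybe_unused example_read_guest(struct vfio_group *group,
                                             dma_addr_t iova, void *buf,
                                             size_t len)
{
        /* write == false: copy from the IOVA range into the kernel buffer */
        return vfio_dma_rw(group, iova, buf, len, false);
}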
2193
2194static int vfio_register_iommu_notifier(struct vfio_group *group,
2195                                        unsigned long *events,
2196                                        struct notifier_block *nb)
2197{
2198        struct vfio_container *container;
2199        struct vfio_iommu_driver *driver;
2200        int ret;
2201
2202        ret = vfio_group_add_container_user(group);
2203        if (ret)
2204                return -EINVAL;
2205
2206        container = group->container;
2207        driver = container->iommu_driver;
2208        if (likely(driver && driver->ops->register_notifier))
2209                ret = driver->ops->register_notifier(container->iommu_data,
2210                                                     events, nb);
2211        else
2212                ret = -ENOTTY;
2213
2214        vfio_group_try_dissolve_container(group);
2215
2216        return ret;
2217}
2218
2219static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2220                                          struct notifier_block *nb)
2221{
2222        struct vfio_container *container;
2223        struct vfio_iommu_driver *driver;
2224        int ret;
2225
2226        ret = vfio_group_add_container_user(group);
2227        if (ret)
2228                return -EINVAL;
2229
2230        container = group->container;
2231        driver = container->iommu_driver;
2232        if (likely(driver && driver->ops->unregister_notifier))
2233                ret = driver->ops->unregister_notifier(container->iommu_data,
2234                                                       nb);
2235        else
2236                ret = -ENOTTY;
2237
2238        vfio_group_try_dissolve_container(group);
2239
2240        return ret;
2241}
2242
2243void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2244{
2245        group->kvm = kvm;
2246        blocking_notifier_call_chain(&group->notifier,
2247                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2248}
2249EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2250
2251static int vfio_register_group_notifier(struct vfio_group *group,
2252                                        unsigned long *events,
2253                                        struct notifier_block *nb)
2254{
2255        int ret;
2256        bool set_kvm = false;
2257
2258        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2259                set_kvm = true;
2260
2261        /* clear known events */
2262        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2263
2264        /* refuse to continue if any unknown events remain */
2265        if (*events)
2266                return -EINVAL;
2267
2268        ret = vfio_group_add_container_user(group);
2269        if (ret)
2270                return -EINVAL;
2271
2272        ret = blocking_notifier_chain_register(&group->notifier, nb);
2273
2274        /*
2275         * The kvm may already have been attached to this vfio_group, so
2276         * replay the event once upon registration.
2277         */
2278        if (!ret && set_kvm && group->kvm)
2279                blocking_notifier_call_chain(&group->notifier,
2280                                        VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2281
2282        vfio_group_try_dissolve_container(group);
2283
2284        return ret;
2285}
2286
2287static int vfio_unregister_group_notifier(struct vfio_group *group,
2288                                         struct notifier_block *nb)
2289{
2290        int ret;
2291
2292        ret = vfio_group_add_container_user(group);
2293        if (ret)
2294                return -EINVAL;
2295
2296        ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2297
2298        vfio_group_try_dissolve_container(group);
2299
2300        return ret;
2301}
2302
2303int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2304                           unsigned long *events, struct notifier_block *nb)
2305{
2306        struct vfio_group *group;
2307        int ret;
2308
2309        if (!dev || !nb || !events || (*events == 0))
2310                return -EINVAL;
2311
2312        group = vfio_group_get_from_dev(dev);
2313        if (!group)
2314                return -ENODEV;
2315
2316        switch (type) {
2317        case VFIO_IOMMU_NOTIFY:
2318                ret = vfio_register_iommu_notifier(group, events, nb);
2319                break;
2320        case VFIO_GROUP_NOTIFY:
2321                ret = vfio_register_group_notifier(group, events, nb);
2322                break;
2323        default:
2324                ret = -EINVAL;
2325        }
2326
2327        vfio_group_put(group);
2328        return ret;
2329}
2330EXPORT_SYMBOL(vfio_register_notifier);
2331
2332int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2333                             struct notifier_block *nb)
2334{
2335        struct vfio_group *group;
2336        int ret;
2337
2338        if (!dev || !nb)
2339                return -EINVAL;
2340
2341        group = vfio_group_get_from_dev(dev);
2342        if (!group)
2343                return -ENODEV;
2344
2345        switch (type) {
2346        case VFIO_IOMMU_NOTIFY:
2347                ret = vfio_unregister_iommu_notifier(group, nb);
2348                break;
2349        case VFIO_GROUP_NOTIFY:
2350                ret = vfio_unregister_group_notifier(group, nb);
2351                break;
2352        default:
2353                ret = -EINVAL;
2354        }
2355
2356        vfio_group_put(group);
2357        return ret;
2358}
2359EXPORT_SYMBOL(vfio_unregister_notifier);
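
/*
 * A minimal sketch of a consumer registering for the group KVM notification
 * via vfio_register_notifier(); the callback and its bookkeeping are
 * illustrative only.
 */
static int __maybe_unused example_group_notify(struct notifier_block *nb,
                                               unsigned long action, void *data)
{
        if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
                /* data is the struct kvm pointer, or NULL on detach */
                /* ... record or clear the pointer for later use ... */
        }
        return NOTIFY_OK;
}

static int __maybe_unused example_register_group_notifier(struct device *dev,
                                                           struct notifier_block *nb)
{
        unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;

        nb->notifier_call = example_group_notify;
        return vfio_register_notifier(dev, VFIO_GROUP_NOTIFY, &events, nb);
}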
2360
2361struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
2362{
2363        struct vfio_container *container;
2364        struct vfio_iommu_driver *driver;
2365
2366        if (!group)
2367                return ERR_PTR(-EINVAL);
2368
2369        container = group->container;
2370        driver = container->iommu_driver;
2371        if (likely(driver && driver->ops->group_iommu_domain))
2372                return driver->ops->group_iommu_domain(container->iommu_data,
2373                                                       group->iommu_group);
2374
2375        return ERR_PTR(-ENOTTY);
2376}
2377EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
2378
2379/**
2380 * Module/class support
2381 */
2382static char *vfio_devnode(struct device *dev, umode_t *mode)
2383{
2384        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2385}
2386
2387static struct miscdevice vfio_dev = {
2388        .minor = VFIO_MINOR,
2389        .name = "vfio",
2390        .fops = &vfio_fops,
2391        .nodename = "vfio/vfio",
2392        .mode = S_IRUGO | S_IWUGO,
2393};
2394
2395static int __init vfio_init(void)
2396{
2397        int ret;
2398
2399        idr_init(&vfio.group_idr);
2400        mutex_init(&vfio.group_lock);
2401        mutex_init(&vfio.iommu_drivers_lock);
2402        INIT_LIST_HEAD(&vfio.group_list);
2403        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2404
2405        ret = misc_register(&vfio_dev);
2406        if (ret) {
2407                pr_err("vfio: misc device register failed\n");
2408                return ret;
2409        }
2410
2411        /* /dev/vfio/$GROUP */
2412        vfio.class = class_create(THIS_MODULE, "vfio");
2413        if (IS_ERR(vfio.class)) {
2414                ret = PTR_ERR(vfio.class);
2415                goto err_class;
2416        }
2417
2418        vfio.class->devnode = vfio_devnode;
2419
2420        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2421        if (ret)
2422                goto err_alloc_chrdev;
2423
2424        cdev_init(&vfio.group_cdev, &vfio_group_fops);
2425        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2426        if (ret)
2427                goto err_cdev_add;
2428
2429        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2430
2431#ifdef CONFIG_VFIO_NOIOMMU
2432        vfio_register_iommu_driver(&vfio_noiommu_ops);
2433#endif
2434        return 0;
2435
2436err_cdev_add:
2437        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2438err_alloc_chrdev:
2439        class_destroy(vfio.class);
2440        vfio.class = NULL;
2441err_class:
2442        misc_deregister(&vfio_dev);
2443        return ret;
2444}
2445
2446static void __exit vfio_cleanup(void)
2447{
2448        WARN_ON(!list_empty(&vfio.group_list));
2449
2450#ifdef CONFIG_VFIO_NOIOMMU
2451        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2452#endif
2453        idr_destroy(&vfio.group_idr);
2454        cdev_del(&vfio.group_cdev);
2455        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2456        class_destroy(vfio.class);
2457        vfio.class = NULL;
2458        misc_deregister(&vfio_dev);
2459        xa_destroy(&vfio_device_set_xa);
2460}
2461
2462module_init(vfio_init);
2463module_exit(vfio_cleanup);
2464
2465MODULE_VERSION(DRIVER_VERSION);
2466MODULE_LICENSE("GPL v2");
2467MODULE_AUTHOR(DRIVER_AUTHOR);
2468MODULE_DESCRIPTION(DRIVER_DESC);
2469MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2470MODULE_ALIAS("devname:vfio/vfio");
2471MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2472