linux/drivers/vfio/vfio.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO core
   4 *
   5 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6 *     Author: Alex Williamson <alex.williamson@redhat.com>
   7 *
   8 * Derived from original vfio:
   9 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10 * Author: Tom Lyon, pugs@cisco.com
  11 */
  12
  13#include <linux/cdev.h>
  14#include <linux/compat.h>
  15#include <linux/device.h>
  16#include <linux/file.h>
  17#include <linux/anon_inodes.h>
  18#include <linux/fs.h>
  19#include <linux/idr.h>
  20#include <linux/iommu.h>
  21#include <linux/list.h>
  22#include <linux/miscdevice.h>
  23#include <linux/module.h>
  24#include <linux/mutex.h>
  25#include <linux/pci.h>
  26#include <linux/rwsem.h>
  27#include <linux/sched.h>
  28#include <linux/slab.h>
  29#include <linux/stat.h>
  30#include <linux/string.h>
  31#include <linux/uaccess.h>
  32#include <linux/vfio.h>
  33#include <linux/wait.h>
  34#include <linux/sched/signal.h>
  35
  36#define DRIVER_VERSION  "0.3"
  37#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  38#define DRIVER_DESC     "VFIO - User Level meta-driver"
  39
  40static struct vfio {
  41        struct class                    *class;
  42        struct list_head                iommu_drivers_list;
  43        struct mutex                    iommu_drivers_lock;
  44        struct list_head                group_list;
  45        struct idr                      group_idr;
  46        struct mutex                    group_lock;
  47        struct cdev                     group_cdev;
  48        dev_t                           group_devt;
  49        wait_queue_head_t               release_q;
  50} vfio;
  51
  52struct vfio_iommu_driver {
  53        const struct vfio_iommu_driver_ops      *ops;
  54        struct list_head                        vfio_next;
  55};
  56
  57struct vfio_container {
  58        struct kref                     kref;
  59        struct list_head                group_list;
  60        struct rw_semaphore             group_lock;
  61        struct vfio_iommu_driver        *iommu_driver;
  62        void                            *iommu_data;
  63        bool                            noiommu;
  64};
  65
  66struct vfio_unbound_dev {
  67        struct device                   *dev;
  68        struct list_head                unbound_next;
  69};
  70
  71struct vfio_group {
  72        struct kref                     kref;
  73        int                             minor;
  74        atomic_t                        container_users;
  75        struct iommu_group              *iommu_group;
  76        struct vfio_container           *container;
  77        struct list_head                device_list;
  78        struct mutex                    device_lock;
  79        struct device                   *dev;
  80        struct notifier_block           nb;
  81        struct list_head                vfio_next;
  82        struct list_head                container_next;
  83        struct list_head                unbound_list;
  84        struct mutex                    unbound_lock;
  85        atomic_t                        opened;
  86        wait_queue_head_t               container_q;
  87        bool                            noiommu;
  88        struct kvm                      *kvm;
  89        struct blocking_notifier_head   notifier;
  90};
  91
  92struct vfio_device {
  93        struct kref                     kref;
  94        struct device                   *dev;
  95        const struct vfio_device_ops    *ops;
  96        struct vfio_group               *group;
  97        struct list_head                group_next;
  98        void                            *device_data;
  99};
 100
 101#ifdef CONFIG_VFIO_NOIOMMU
 102static bool noiommu __read_mostly;
 103module_param_named(enable_unsafe_noiommu_mode,
 104                   noiommu, bool, S_IRUGO | S_IWUSR);
 105MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 106#endif
 107
 108/*
 109 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
  110 * and remove functions; any use case other than acquiring the first
 111 * reference for the purpose of calling vfio_add_group_dev() or removing
 112 * that symmetric reference after vfio_del_group_dev() should use the raw
 113 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 114 * removes the device from the dummy group and cannot be nested.
 115 */
 116struct iommu_group *vfio_iommu_group_get(struct device *dev)
 117{
 118        struct iommu_group *group;
 119        int __maybe_unused ret;
 120
 121        group = iommu_group_get(dev);
 122
 123#ifdef CONFIG_VFIO_NOIOMMU
 124        /*
 125         * With noiommu enabled, an IOMMU group will be created for a device
  126         * that doesn't already have one and doesn't have an iommu_ops on its
 127         * bus.  We set iommudata simply to be able to identify these groups
 128         * as special use and for reclamation later.
 129         */
 130        if (group || !noiommu || iommu_present(dev->bus))
 131                return group;
 132
 133        group = iommu_group_alloc();
 134        if (IS_ERR(group))
 135                return NULL;
 136
 137        iommu_group_set_name(group, "vfio-noiommu");
 138        iommu_group_set_iommudata(group, &noiommu, NULL);
 139        ret = iommu_group_add_device(group, dev);
 140        if (ret) {
 141                iommu_group_put(group);
 142                return NULL;
 143        }
 144
 145        /*
 146         * Where to taint?  At this point we've added an IOMMU group for a
 147         * device that is not backed by iommu_ops, therefore any iommu_
 148         * callback using iommu_ops can legitimately Oops.  So, while we may
 149         * be about to give a DMA capable device to a user without IOMMU
 150         * protection, which is clearly taint-worthy, let's go ahead and do
 151         * it here.
 152         */
 153        add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 154        dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 155#endif
 156
 157        return group;
 158}
 159EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 160
 161void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 162{
 163#ifdef CONFIG_VFIO_NOIOMMU
 164        if (iommu_group_get_iommudata(group) == &noiommu)
 165                iommu_group_remove_device(dev);
 166#endif
 167
 168        iommu_group_put(group);
 169}
 170EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
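/*
 * A minimal sketch of the probe/remove pairing described above, as a VFIO
 * bus driver would use it.  The my_* identifiers and my_data are
 * hypothetical placeholders for the bus driver's own vfio_device_ops and
 * private state; error handling beyond the group reference is omitted.
 *
 *	static int my_vfio_probe(struct device *dev)
 *	{
 *		struct iommu_group *group = vfio_iommu_group_get(dev);
 *		int ret;
 *
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &my_vfio_dev_ops, my_data);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *		return ret;
 *	}
 *
 *	static void my_vfio_remove(struct device *dev)
 *	{
 *		void *data = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(data);
 *	}
 */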
 171
 172#ifdef CONFIG_VFIO_NOIOMMU
 173static void *vfio_noiommu_open(unsigned long arg)
 174{
 175        if (arg != VFIO_NOIOMMU_IOMMU)
 176                return ERR_PTR(-EINVAL);
 177        if (!capable(CAP_SYS_RAWIO))
 178                return ERR_PTR(-EPERM);
 179
 180        return NULL;
 181}
 182
 183static void vfio_noiommu_release(void *iommu_data)
 184{
 185}
 186
 187static long vfio_noiommu_ioctl(void *iommu_data,
 188                               unsigned int cmd, unsigned long arg)
 189{
 190        if (cmd == VFIO_CHECK_EXTENSION)
 191                return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 192
 193        return -ENOTTY;
 194}
 195
 196static int vfio_noiommu_attach_group(void *iommu_data,
 197                                     struct iommu_group *iommu_group)
 198{
 199        return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 200}
 201
 202static void vfio_noiommu_detach_group(void *iommu_data,
 203                                      struct iommu_group *iommu_group)
 204{
 205}
 206
 207static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 208        .name = "vfio-noiommu",
 209        .owner = THIS_MODULE,
 210        .open = vfio_noiommu_open,
 211        .release = vfio_noiommu_release,
 212        .ioctl = vfio_noiommu_ioctl,
 213        .attach_group = vfio_noiommu_attach_group,
 214        .detach_group = vfio_noiommu_detach_group,
 215};
 216#endif
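/*
 * Usage note (a sketch, not an interface defined here): no-IOMMU mode is
 * opt-in via the enable_unsafe_noiommu_mode module parameter declared
 * above, e.g. "modprobe vfio enable_unsafe_noiommu_mode=1", or at runtime
 * through /sys/module/vfio/parameters/enable_unsafe_noiommu_mode (the
 * parameter is S_IWUSR).  The resulting groups appear as
 * /dev/vfio/noiommu-$GROUP, can only be opened by tasks with
 * CAP_SYS_RAWIO, and creating them taints the kernel.
 */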
 217
 218
 219/**
 220 * IOMMU driver registration
 221 */
 222int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 223{
 224        struct vfio_iommu_driver *driver, *tmp;
 225
 226        driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 227        if (!driver)
 228                return -ENOMEM;
 229
 230        driver->ops = ops;
 231
 232        mutex_lock(&vfio.iommu_drivers_lock);
 233
 234        /* Check for duplicates */
 235        list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 236                if (tmp->ops == ops) {
 237                        mutex_unlock(&vfio.iommu_drivers_lock);
 238                        kfree(driver);
 239                        return -EINVAL;
 240                }
 241        }
 242
 243        list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 244
 245        mutex_unlock(&vfio.iommu_drivers_lock);
 246
 247        return 0;
 248}
 249EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 250
 251void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 252{
 253        struct vfio_iommu_driver *driver;
 254
 255        mutex_lock(&vfio.iommu_drivers_lock);
 256        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 257                if (driver->ops == ops) {
 258                        list_del(&driver->vfio_next);
 259                        mutex_unlock(&vfio.iommu_drivers_lock);
 260                        kfree(driver);
 261                        return;
 262                }
 263        }
 264        mutex_unlock(&vfio.iommu_drivers_lock);
 265}
 266EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 267
 268/**
 269 * Group minor allocation/free - both called with vfio.group_lock held
 270 */
 271static int vfio_alloc_group_minor(struct vfio_group *group)
 272{
 273        return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 274}
 275
 276static void vfio_free_group_minor(int minor)
 277{
 278        idr_remove(&vfio.group_idr, minor);
 279}
 280
 281static int vfio_iommu_group_notifier(struct notifier_block *nb,
 282                                     unsigned long action, void *data);
 283static void vfio_group_get(struct vfio_group *group);
 284
 285/**
 286 * Container objects - containers are created when /dev/vfio/vfio is
 287 * opened, but their lifecycle extends until the last user is done, so
  288 * they're freed via kref.  Must support container/group/device being
 289 * closed in any order.
 290 */
 291static void vfio_container_get(struct vfio_container *container)
 292{
 293        kref_get(&container->kref);
 294}
 295
 296static void vfio_container_release(struct kref *kref)
 297{
 298        struct vfio_container *container;
 299        container = container_of(kref, struct vfio_container, kref);
 300
 301        kfree(container);
 302}
 303
 304static void vfio_container_put(struct vfio_container *container)
 305{
 306        kref_put(&container->kref, vfio_container_release);
 307}
 308
 309static void vfio_group_unlock_and_free(struct vfio_group *group)
 310{
 311        mutex_unlock(&vfio.group_lock);
 312        /*
 313         * Unregister outside of lock.  A spurious callback is harmless now
 314         * that the group is no longer in vfio.group_list.
 315         */
 316        iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 317        kfree(group);
 318}
 319
 320/**
 321 * Group objects - create, release, get, put, search
 322 */
 323static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 324{
 325        struct vfio_group *group, *tmp;
 326        struct device *dev;
 327        int ret, minor;
 328
 329        group = kzalloc(sizeof(*group), GFP_KERNEL);
 330        if (!group)
 331                return ERR_PTR(-ENOMEM);
 332
 333        kref_init(&group->kref);
 334        INIT_LIST_HEAD(&group->device_list);
 335        mutex_init(&group->device_lock);
 336        INIT_LIST_HEAD(&group->unbound_list);
 337        mutex_init(&group->unbound_lock);
 338        atomic_set(&group->container_users, 0);
 339        atomic_set(&group->opened, 0);
 340        init_waitqueue_head(&group->container_q);
 341        group->iommu_group = iommu_group;
 342#ifdef CONFIG_VFIO_NOIOMMU
 343        group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 344#endif
 345        BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 346
 347        group->nb.notifier_call = vfio_iommu_group_notifier;
 348
 349        /*
 350         * blocking notifiers acquire a rwsem around registering and hold
  351         * it around the callback.  Therefore, we need to register outside of
 352         * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 353         * do anything unless it can find the group in vfio.group_list, so
 354         * no harm in registering early.
 355         */
 356        ret = iommu_group_register_notifier(iommu_group, &group->nb);
 357        if (ret) {
 358                kfree(group);
 359                return ERR_PTR(ret);
 360        }
 361
 362        mutex_lock(&vfio.group_lock);
 363
 364        /* Did we race creating this group? */
 365        list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 366                if (tmp->iommu_group == iommu_group) {
 367                        vfio_group_get(tmp);
 368                        vfio_group_unlock_and_free(group);
 369                        return tmp;
 370                }
 371        }
 372
 373        minor = vfio_alloc_group_minor(group);
 374        if (minor < 0) {
 375                vfio_group_unlock_and_free(group);
 376                return ERR_PTR(minor);
 377        }
 378
 379        dev = device_create(vfio.class, NULL,
 380                            MKDEV(MAJOR(vfio.group_devt), minor),
 381                            group, "%s%d", group->noiommu ? "noiommu-" : "",
 382                            iommu_group_id(iommu_group));
 383        if (IS_ERR(dev)) {
 384                vfio_free_group_minor(minor);
 385                vfio_group_unlock_and_free(group);
 386                return ERR_CAST(dev);
 387        }
 388
 389        group->minor = minor;
 390        group->dev = dev;
 391
 392        list_add(&group->vfio_next, &vfio.group_list);
 393
 394        mutex_unlock(&vfio.group_lock);
 395
 396        return group;
 397}
 398
 399/* called with vfio.group_lock held */
 400static void vfio_group_release(struct kref *kref)
 401{
 402        struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 403        struct vfio_unbound_dev *unbound, *tmp;
 404        struct iommu_group *iommu_group = group->iommu_group;
 405
 406        WARN_ON(!list_empty(&group->device_list));
 407        WARN_ON(group->notifier.head);
 408
 409        list_for_each_entry_safe(unbound, tmp,
 410                                 &group->unbound_list, unbound_next) {
 411                list_del(&unbound->unbound_next);
 412                kfree(unbound);
 413        }
 414
 415        device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 416        list_del(&group->vfio_next);
 417        vfio_free_group_minor(group->minor);
 418        vfio_group_unlock_and_free(group);
 419        iommu_group_put(iommu_group);
 420}
 421
 422static void vfio_group_put(struct vfio_group *group)
 423{
 424        kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 425}
 426
 427struct vfio_group_put_work {
 428        struct work_struct work;
 429        struct vfio_group *group;
 430};
 431
 432static void vfio_group_put_bg(struct work_struct *work)
 433{
 434        struct vfio_group_put_work *do_work;
 435
 436        do_work = container_of(work, struct vfio_group_put_work, work);
 437
 438        vfio_group_put(do_work->group);
 439        kfree(do_work);
 440}
 441
 442static void vfio_group_schedule_put(struct vfio_group *group)
 443{
 444        struct vfio_group_put_work *do_work;
 445
 446        do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 447        if (WARN_ON(!do_work))
 448                return;
 449
 450        INIT_WORK(&do_work->work, vfio_group_put_bg);
 451        do_work->group = group;
 452        schedule_work(&do_work->work);
 453}
 454
 455/* Assume group_lock or group reference is held */
 456static void vfio_group_get(struct vfio_group *group)
 457{
 458        kref_get(&group->kref);
 459}
 460
 461/*
  462 * Not really a try as we will sleep for the mutex, but we need to make
 463 * sure the group pointer is valid under lock and get a reference.
 464 */
 465static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 466{
 467        struct vfio_group *target = group;
 468
 469        mutex_lock(&vfio.group_lock);
 470        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 471                if (group == target) {
 472                        vfio_group_get(group);
 473                        mutex_unlock(&vfio.group_lock);
 474                        return group;
 475                }
 476        }
 477        mutex_unlock(&vfio.group_lock);
 478
 479        return NULL;
 480}
 481
 482static
 483struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 484{
 485        struct vfio_group *group;
 486
 487        mutex_lock(&vfio.group_lock);
 488        list_for_each_entry(group, &vfio.group_list, vfio_next) {
 489                if (group->iommu_group == iommu_group) {
 490                        vfio_group_get(group);
 491                        mutex_unlock(&vfio.group_lock);
 492                        return group;
 493                }
 494        }
 495        mutex_unlock(&vfio.group_lock);
 496
 497        return NULL;
 498}
 499
 500static struct vfio_group *vfio_group_get_from_minor(int minor)
 501{
 502        struct vfio_group *group;
 503
 504        mutex_lock(&vfio.group_lock);
 505        group = idr_find(&vfio.group_idr, minor);
 506        if (!group) {
 507                mutex_unlock(&vfio.group_lock);
 508                return NULL;
 509        }
 510        vfio_group_get(group);
 511        mutex_unlock(&vfio.group_lock);
 512
 513        return group;
 514}
 515
 516static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 517{
 518        struct iommu_group *iommu_group;
 519        struct vfio_group *group;
 520
 521        iommu_group = iommu_group_get(dev);
 522        if (!iommu_group)
 523                return NULL;
 524
 525        group = vfio_group_get_from_iommu(iommu_group);
 526        iommu_group_put(iommu_group);
 527
 528        return group;
 529}
 530
 531/**
 532 * Device objects - create, release, get, put, search
 533 */
 534static
 535struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 536                                             struct device *dev,
 537                                             const struct vfio_device_ops *ops,
 538                                             void *device_data)
 539{
 540        struct vfio_device *device;
 541
 542        device = kzalloc(sizeof(*device), GFP_KERNEL);
 543        if (!device)
 544                return ERR_PTR(-ENOMEM);
 545
 546        kref_init(&device->kref);
 547        device->dev = dev;
 548        device->group = group;
 549        device->ops = ops;
 550        device->device_data = device_data;
 551        dev_set_drvdata(dev, device);
 552
 553        /* No need to get group_lock, caller has group reference */
 554        vfio_group_get(group);
 555
 556        mutex_lock(&group->device_lock);
 557        list_add(&device->group_next, &group->device_list);
 558        mutex_unlock(&group->device_lock);
 559
 560        return device;
 561}
 562
 563static void vfio_device_release(struct kref *kref)
 564{
 565        struct vfio_device *device = container_of(kref,
 566                                                  struct vfio_device, kref);
 567        struct vfio_group *group = device->group;
 568
 569        list_del(&device->group_next);
 570        mutex_unlock(&group->device_lock);
 571
 572        dev_set_drvdata(device->dev, NULL);
 573
 574        kfree(device);
 575
 576        /* vfio_del_group_dev may be waiting for this device */
 577        wake_up(&vfio.release_q);
 578}
 579
 580/* Device reference always implies a group reference */
 581void vfio_device_put(struct vfio_device *device)
 582{
 583        struct vfio_group *group = device->group;
 584        kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 585        vfio_group_put(group);
 586}
 587EXPORT_SYMBOL_GPL(vfio_device_put);
 588
 589static void vfio_device_get(struct vfio_device *device)
 590{
 591        vfio_group_get(device->group);
 592        kref_get(&device->kref);
 593}
 594
 595static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 596                                                 struct device *dev)
 597{
 598        struct vfio_device *device;
 599
 600        mutex_lock(&group->device_lock);
 601        list_for_each_entry(device, &group->device_list, group_next) {
 602                if (device->dev == dev) {
 603                        vfio_device_get(device);
 604                        mutex_unlock(&group->device_lock);
 605                        return device;
 606                }
 607        }
 608        mutex_unlock(&group->device_lock);
 609        return NULL;
 610}
 611
 612/*
 613 * Some drivers, like pci-stub, are only used to prevent other drivers from
 614 * claiming a device and are therefore perfectly legitimate for a user owned
 615 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 616 * of the device, but it does prevent the user from having direct access to
 617 * the device, which is useful in some circumstances.
 618 *
 619 * We also assume that we can include PCI interconnect devices, ie. bridges.
 620 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 621 * then all of the downstream devices will be part of the same IOMMU group as
 622 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 623 * breaks anything, it only does so for user owned devices downstream.  Note
 624 * that error notification via MSI can be affected for platforms that handle
 625 * MSI within the same IOVA space as DMA.
 626 */
 627static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 628
 629static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 630{
 631        if (dev_is_pci(dev)) {
 632                struct pci_dev *pdev = to_pci_dev(dev);
 633
 634                if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 635                        return true;
 636        }
 637
 638        return match_string(vfio_driver_whitelist,
 639                            ARRAY_SIZE(vfio_driver_whitelist),
 640                            drv->name) >= 0;
 641}
 642
 643/*
 644 * A vfio group is viable for use by userspace if all devices are in
 645 * one of the following states:
 646 *  - driver-less
 647 *  - bound to a vfio driver
 648 *  - bound to a whitelisted driver
 649 *  - a PCI interconnect device
 650 *
 651 * We use two methods to determine whether a device is bound to a vfio
 652 * driver.  The first is to test whether the device exists in the vfio
 653 * group.  The second is to test if the device exists on the group
 654 * unbound_list, indicating it's in the middle of transitioning from
 655 * a vfio driver to driver-less.
 656 */
 657static int vfio_dev_viable(struct device *dev, void *data)
 658{
 659        struct vfio_group *group = data;
 660        struct vfio_device *device;
 661        struct device_driver *drv = READ_ONCE(dev->driver);
 662        struct vfio_unbound_dev *unbound;
 663        int ret = -EINVAL;
 664
 665        mutex_lock(&group->unbound_lock);
 666        list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 667                if (dev == unbound->dev) {
 668                        ret = 0;
 669                        break;
 670                }
 671        }
 672        mutex_unlock(&group->unbound_lock);
 673
 674        if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 675                return 0;
 676
 677        device = vfio_group_get_device(group, dev);
 678        if (device) {
 679                vfio_device_put(device);
 680                return 0;
 681        }
 682
 683        return ret;
 684}
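/*
 * Userspace sketch: a group typically becomes viable by unbinding each
 * device from its host driver and binding it to vfio-pci (or pci-stub).
 * The PCI address and vendor/device IDs below are placeholders:
 *
 *	# echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
 *	# echo 8086 10ca > /sys/bus/pci/drivers/vfio-pci/new_id
 */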
 685
 686/**
 687 * Async device support
 688 */
 689static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 690{
 691        struct vfio_device *device;
 692
 693        /* Do we already know about it?  We shouldn't */
 694        device = vfio_group_get_device(group, dev);
 695        if (WARN_ON_ONCE(device)) {
 696                vfio_device_put(device);
 697                return 0;
 698        }
 699
 700        /* Nothing to do for idle groups */
 701        if (!atomic_read(&group->container_users))
 702                return 0;
 703
 704        /* TODO Prevent device auto probing */
 705        dev_WARN(dev, "Device added to live group %d!\n",
 706                 iommu_group_id(group->iommu_group));
 707
 708        return 0;
 709}
 710
 711static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 712{
 713        /* We don't care what happens when the group isn't in use */
 714        if (!atomic_read(&group->container_users))
 715                return 0;
 716
 717        return vfio_dev_viable(dev, group);
 718}
 719
 720static int vfio_iommu_group_notifier(struct notifier_block *nb,
 721                                     unsigned long action, void *data)
 722{
 723        struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 724        struct device *dev = data;
 725        struct vfio_unbound_dev *unbound;
 726
 727        /*
 728         * Need to go through a group_lock lookup to get a reference or we
 729         * risk racing a group being removed.  Ignore spurious notifies.
 730         */
 731        group = vfio_group_try_get(group);
 732        if (!group)
 733                return NOTIFY_OK;
 734
 735        switch (action) {
 736        case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 737                vfio_group_nb_add_dev(group, dev);
 738                break;
 739        case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 740                /*
 741                 * Nothing to do here.  If the device is in use, then the
 742                 * vfio sub-driver should block the remove callback until
 743                 * it is unused.  If the device is unused or attached to a
 744                 * stub driver, then it should be released and we don't
 745                 * care that it will be going away.
 746                 */
 747                break;
 748        case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 749                dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 750                        iommu_group_id(group->iommu_group));
 751                break;
 752        case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 753                dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 754                        iommu_group_id(group->iommu_group), dev->driver->name);
 755                BUG_ON(vfio_group_nb_verify(group, dev));
 756                break;
 757        case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 758                dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 759                        __func__, iommu_group_id(group->iommu_group),
 760                        dev->driver->name);
 761                break;
 762        case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 763                dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 764                        iommu_group_id(group->iommu_group));
 765                /*
 766                 * XXX An unbound device in a live group is ok, but we'd
 767                 * really like to avoid the above BUG_ON by preventing other
 768                 * drivers from binding to it.  Once that occurs, we have to
 769                 * stop the system to maintain isolation.  At a minimum, we'd
 770                 * want a toggle to disable driver auto probe for this device.
 771                 */
 772
 773                mutex_lock(&group->unbound_lock);
 774                list_for_each_entry(unbound,
 775                                    &group->unbound_list, unbound_next) {
 776                        if (dev == unbound->dev) {
 777                                list_del(&unbound->unbound_next);
 778                                kfree(unbound);
 779                                break;
 780                        }
 781                }
 782                mutex_unlock(&group->unbound_lock);
 783                break;
 784        }
 785
 786        /*
 787         * If we're the last reference to the group, the group will be
 788         * released, which includes unregistering the iommu group notifier.
 789         * We hold a read-lock on that notifier list, unregistering needs
 790         * a write-lock... deadlock.  Release our reference asynchronously
 791         * to avoid that situation.
 792         */
 793        vfio_group_schedule_put(group);
 794        return NOTIFY_OK;
 795}
 796
 797/**
 798 * VFIO driver API
 799 */
 800int vfio_add_group_dev(struct device *dev,
 801                       const struct vfio_device_ops *ops, void *device_data)
 802{
 803        struct iommu_group *iommu_group;
 804        struct vfio_group *group;
 805        struct vfio_device *device;
 806
 807        iommu_group = iommu_group_get(dev);
 808        if (!iommu_group)
 809                return -EINVAL;
 810
 811        group = vfio_group_get_from_iommu(iommu_group);
 812        if (!group) {
 813                group = vfio_create_group(iommu_group);
 814                if (IS_ERR(group)) {
 815                        iommu_group_put(iommu_group);
 816                        return PTR_ERR(group);
 817                }
 818        } else {
 819                /*
 820                 * A found vfio_group already holds a reference to the
 821                 * iommu_group.  A created vfio_group keeps the reference.
 822                 */
 823                iommu_group_put(iommu_group);
 824        }
 825
 826        device = vfio_group_get_device(group, dev);
 827        if (device) {
 828                dev_WARN(dev, "Device already exists on group %d\n",
 829                         iommu_group_id(iommu_group));
 830                vfio_device_put(device);
 831                vfio_group_put(group);
 832                return -EBUSY;
 833        }
 834
 835        device = vfio_group_create_device(group, dev, ops, device_data);
 836        if (IS_ERR(device)) {
 837                vfio_group_put(group);
 838                return PTR_ERR(device);
 839        }
 840
 841        /*
 842         * Drop all but the vfio_device reference.  The vfio_device holds
 843         * a reference to the vfio_group, which holds a reference to the
 844         * iommu_group.
 845         */
 846        vfio_group_put(group);
 847
 848        return 0;
 849}
 850EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 851
 852/**
 853 * Get a reference to the vfio_device for a device.  Even if the
 854 * caller thinks they own the device, they could be racing with a
 855 * release call path, so we can't trust drvdata for the shortcut.
 856 * Go the long way around, from the iommu_group to the vfio_group
 857 * to the vfio_device.
 858 */
 859struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 860{
 861        struct vfio_group *group;
 862        struct vfio_device *device;
 863
 864        group = vfio_group_get_from_dev(dev);
 865        if (!group)
 866                return NULL;
 867
 868        device = vfio_group_get_device(group, dev);
 869        vfio_group_put(group);
 870
 871        return device;
 872}
 873EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 874
 875static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 876                                                     char *buf)
 877{
 878        struct vfio_device *it, *device = ERR_PTR(-ENODEV);
 879
 880        mutex_lock(&group->device_lock);
 881        list_for_each_entry(it, &group->device_list, group_next) {
 882                int ret;
 883
 884                if (it->ops->match) {
 885                        ret = it->ops->match(it->device_data, buf);
 886                        if (ret < 0) {
 887                                device = ERR_PTR(ret);
 888                                break;
 889                        }
 890                } else {
 891                        ret = !strcmp(dev_name(it->dev), buf);
 892                }
 893
 894                if (ret) {
 895                        device = it;
 896                        vfio_device_get(device);
 897                        break;
 898                }
 899        }
 900        mutex_unlock(&group->device_lock);
 901
 902        return device;
 903}
 904
 905/*
 906 * Caller must hold a reference to the vfio_device
 907 */
 908void *vfio_device_data(struct vfio_device *device)
 909{
 910        return device->device_data;
 911}
 912EXPORT_SYMBOL_GPL(vfio_device_data);
 913
 914/*
 915 * Decrement the device reference count and wait for the device to be
 916 * removed.  Open file descriptors for the device... */
 917void *vfio_del_group_dev(struct device *dev)
 918{
 919        DEFINE_WAIT_FUNC(wait, woken_wake_function);
 920        struct vfio_device *device = dev_get_drvdata(dev);
 921        struct vfio_group *group = device->group;
 922        void *device_data = device->device_data;
 923        struct vfio_unbound_dev *unbound;
 924        unsigned int i = 0;
 925        bool interrupted = false;
 926
 927        /*
 928         * The group exists so long as we have a device reference.  Get
 929         * a group reference and use it to scan for the device going away.
 930         */
 931        vfio_group_get(group);
 932
 933        /*
 934         * When the device is removed from the group, the group suddenly
 935         * becomes non-viable; the device has a driver (until the unbind
 936         * completes), but it's not present in the group.  This is bad news
 937         * for any external users that need to re-acquire a group reference
 938         * in order to match and release their existing reference.  To
 939         * solve this, we track such devices on the unbound_list to bridge
 940         * the gap until they're fully unbound.
 941         */
 942        unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 943        if (unbound) {
 944                unbound->dev = dev;
 945                mutex_lock(&group->unbound_lock);
 946                list_add(&unbound->unbound_next, &group->unbound_list);
 947                mutex_unlock(&group->unbound_lock);
 948        }
 949        WARN_ON(!unbound);
 950
 951        vfio_device_put(device);
 952
 953        /*
 954         * If the device is still present in the group after the above
 955         * 'put', then it is in use and we need to request it from the
 956         * bus driver.  The driver may in turn need to request the
 957         * device from the user.  We send the request on an arbitrary
 958         * interval with counter to allow the driver to take escalating
 959         * measures to release the device if it has the ability to do so.
 960         */
 961        add_wait_queue(&vfio.release_q, &wait);
 962
 963        do {
 964                device = vfio_group_get_device(group, dev);
 965                if (!device)
 966                        break;
 967
 968                if (device->ops->request)
 969                        device->ops->request(device_data, i++);
 970
 971                vfio_device_put(device);
 972
 973                if (interrupted) {
 974                        wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
 975                } else {
 976                        wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
 977                        if (signal_pending(current)) {
 978                                interrupted = true;
 979                                dev_warn(dev,
 980                                         "Device is currently in use, task"
 981                                         " \"%s\" (%d) "
 982                                         "blocked until device is released",
 983                                         current->comm, task_pid_nr(current));
 984                        }
 985                }
 986
 987        } while (1);
 988
 989        remove_wait_queue(&vfio.release_q, &wait);
 990        /*
 991         * In order to support multiple devices per group, devices can be
 992         * plucked from the group while other devices in the group are still
 993         * in use.  The container persists with this group and those remaining
 994         * devices still attached.  If the user creates an isolation violation
 995         * by binding this device to another driver while the group is still in
 996         * use, that's their fault.  However, in the case of removing the last,
 997         * or potentially the only, device in the group there can be no other
 998         * in-use devices in the group.  The user has done their due diligence
 999         * and we should lay no claims to those devices.  In order to do that,
1000         * we need to make sure the group is detached from the container.
1001         * Without this stall, we're potentially racing with a user process
1002         * that may attempt to immediately bind this device to another driver.
1003         */
1004        if (list_empty(&group->device_list))
1005                wait_event(group->container_q, !group->container);
1006
1007        vfio_group_put(group);
1008
1009        return device_data;
1010}
1011EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1012
1013/**
1014 * VFIO base fd, /dev/vfio/vfio
1015 */
1016static long vfio_ioctl_check_extension(struct vfio_container *container,
1017                                       unsigned long arg)
1018{
1019        struct vfio_iommu_driver *driver;
1020        long ret = 0;
1021
1022        down_read(&container->group_lock);
1023
1024        driver = container->iommu_driver;
1025
1026        switch (arg) {
1027                /* No base extensions yet */
1028        default:
1029                /*
1030                 * If no driver is set, poll all registered drivers for
1031                 * extensions and return the first positive result.  If
1032                 * a driver is already set, further queries will be passed
1033                 * only to that driver.
1034                 */
1035                if (!driver) {
1036                        mutex_lock(&vfio.iommu_drivers_lock);
1037                        list_for_each_entry(driver, &vfio.iommu_drivers_list,
1038                                            vfio_next) {
1039
1040#ifdef CONFIG_VFIO_NOIOMMU
1041                                if (!list_empty(&container->group_list) &&
1042                                    (container->noiommu !=
1043                                     (driver->ops == &vfio_noiommu_ops)))
1044                                        continue;
1045#endif
1046
1047                                if (!try_module_get(driver->ops->owner))
1048                                        continue;
1049
1050                                ret = driver->ops->ioctl(NULL,
1051                                                         VFIO_CHECK_EXTENSION,
1052                                                         arg);
1053                                module_put(driver->ops->owner);
1054                                if (ret > 0)
1055                                        break;
1056                        }
1057                        mutex_unlock(&vfio.iommu_drivers_lock);
1058                } else
1059                        ret = driver->ops->ioctl(container->iommu_data,
1060                                                 VFIO_CHECK_EXTENSION, arg);
1061        }
1062
1063        up_read(&container->group_lock);
1064
1065        return ret;
1066}
1067
1068/* hold write lock on container->group_lock */
1069static int __vfio_container_attach_groups(struct vfio_container *container,
1070                                          struct vfio_iommu_driver *driver,
1071                                          void *data)
1072{
1073        struct vfio_group *group;
1074        int ret = -ENODEV;
1075
1076        list_for_each_entry(group, &container->group_list, container_next) {
1077                ret = driver->ops->attach_group(data, group->iommu_group);
1078                if (ret)
1079                        goto unwind;
1080        }
1081
1082        return ret;
1083
1084unwind:
1085        list_for_each_entry_continue_reverse(group, &container->group_list,
1086                                             container_next) {
1087                driver->ops->detach_group(data, group->iommu_group);
1088        }
1089
1090        return ret;
1091}
1092
1093static long vfio_ioctl_set_iommu(struct vfio_container *container,
1094                                 unsigned long arg)
1095{
1096        struct vfio_iommu_driver *driver;
1097        long ret = -ENODEV;
1098
1099        down_write(&container->group_lock);
1100
1101        /*
1102         * The container is designed to be an unprivileged interface while
1103         * the group can be assigned to specific users.  Therefore, only by
1104         * adding a group to a container does the user get the privilege of
1105         * enabling the iommu, which may allocate finite resources.  There
1106         * is no unset_iommu, but by removing all the groups from a container,
1107         * the container is deprivileged and returns to an unset state.
1108         */
1109        if (list_empty(&container->group_list) || container->iommu_driver) {
1110                up_write(&container->group_lock);
1111                return -EINVAL;
1112        }
1113
1114        mutex_lock(&vfio.iommu_drivers_lock);
1115        list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1116                void *data;
1117
1118#ifdef CONFIG_VFIO_NOIOMMU
1119                /*
1120                 * Only noiommu containers can use vfio-noiommu and noiommu
1121                 * containers can only use vfio-noiommu.
1122                 */
1123                if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1124                        continue;
1125#endif
1126
1127                if (!try_module_get(driver->ops->owner))
1128                        continue;
1129
1130                /*
1131                 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
 1132                 * so test which iommu drivers reported support for this
1133                 * extension and call open on them.  We also pass them the
1134                 * magic, allowing a single driver to support multiple
1135                 * interfaces if they'd like.
1136                 */
1137                if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1138                        module_put(driver->ops->owner);
1139                        continue;
1140                }
1141
1142                data = driver->ops->open(arg);
1143                if (IS_ERR(data)) {
1144                        ret = PTR_ERR(data);
1145                        module_put(driver->ops->owner);
1146                        continue;
1147                }
1148
1149                ret = __vfio_container_attach_groups(container, driver, data);
1150                if (ret) {
1151                        driver->ops->release(data);
1152                        module_put(driver->ops->owner);
1153                        continue;
1154                }
1155
1156                container->iommu_driver = driver;
1157                container->iommu_data = data;
1158                break;
1159        }
1160
1161        mutex_unlock(&vfio.iommu_drivers_lock);
1162        up_write(&container->group_lock);
1163
1164        return ret;
1165}
1166
1167static long vfio_fops_unl_ioctl(struct file *filep,
1168                                unsigned int cmd, unsigned long arg)
1169{
1170        struct vfio_container *container = filep->private_data;
1171        struct vfio_iommu_driver *driver;
1172        void *data;
1173        long ret = -EINVAL;
1174
1175        if (!container)
1176                return ret;
1177
1178        switch (cmd) {
1179        case VFIO_GET_API_VERSION:
1180                ret = VFIO_API_VERSION;
1181                break;
1182        case VFIO_CHECK_EXTENSION:
1183                ret = vfio_ioctl_check_extension(container, arg);
1184                break;
1185        case VFIO_SET_IOMMU:
1186                ret = vfio_ioctl_set_iommu(container, arg);
1187                break;
1188        default:
1189                driver = container->iommu_driver;
1190                data = container->iommu_data;
1191
1192                if (driver) /* passthrough all unrecognized ioctls */
1193                        ret = driver->ops->ioctl(data, cmd, arg);
1194        }
1195
1196        return ret;
1197}
1198
1199static int vfio_fops_open(struct inode *inode, struct file *filep)
1200{
1201        struct vfio_container *container;
1202
1203        container = kzalloc(sizeof(*container), GFP_KERNEL);
1204        if (!container)
1205                return -ENOMEM;
1206
1207        INIT_LIST_HEAD(&container->group_list);
1208        init_rwsem(&container->group_lock);
1209        kref_init(&container->kref);
1210
1211        filep->private_data = container;
1212
1213        return 0;
1214}
1215
1216static int vfio_fops_release(struct inode *inode, struct file *filep)
1217{
1218        struct vfio_container *container = filep->private_data;
1219
1220        filep->private_data = NULL;
1221
1222        vfio_container_put(container);
1223
1224        return 0;
1225}
1226
1227/*
1228 * Once an iommu driver is set, we optionally pass read/write/mmap
1229 * on to the driver, allowing management interfaces beyond ioctl.
1230 */
1231static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1232                              size_t count, loff_t *ppos)
1233{
1234        struct vfio_container *container = filep->private_data;
1235        struct vfio_iommu_driver *driver;
1236        ssize_t ret = -EINVAL;
1237
1238        driver = container->iommu_driver;
1239        if (likely(driver && driver->ops->read))
1240                ret = driver->ops->read(container->iommu_data,
1241                                        buf, count, ppos);
1242
1243        return ret;
1244}
1245
1246static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1247                               size_t count, loff_t *ppos)
1248{
1249        struct vfio_container *container = filep->private_data;
1250        struct vfio_iommu_driver *driver;
1251        ssize_t ret = -EINVAL;
1252
1253        driver = container->iommu_driver;
1254        if (likely(driver && driver->ops->write))
1255                ret = driver->ops->write(container->iommu_data,
1256                                         buf, count, ppos);
1257
1258        return ret;
1259}
1260
1261static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1262{
1263        struct vfio_container *container = filep->private_data;
1264        struct vfio_iommu_driver *driver;
1265        int ret = -EINVAL;
1266
1267        driver = container->iommu_driver;
1268        if (likely(driver && driver->ops->mmap))
1269                ret = driver->ops->mmap(container->iommu_data, vma);
1270
1271        return ret;
1272}
1273
1274static const struct file_operations vfio_fops = {
1275        .owner          = THIS_MODULE,
1276        .open           = vfio_fops_open,
1277        .release        = vfio_fops_release,
1278        .read           = vfio_fops_read,
1279        .write          = vfio_fops_write,
1280        .unlocked_ioctl = vfio_fops_unl_ioctl,
1281        .compat_ioctl   = compat_ptr_ioctl,
1282        .mmap           = vfio_fops_mmap,
1283};
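/*
 * Illustrative userspace sequence for the container fd above (a sketch
 * assuming the type1 backend is registered; VFIO_SET_IOMMU only succeeds
 * once at least one group has been attached via VFIO_GROUP_SET_CONTAINER,
 * handled further below):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;
 *	... attach a group with VFIO_GROUP_SET_CONTAINER ...
 *	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
 *		return -1;
 */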
1284
1285/**
1286 * VFIO Group fd, /dev/vfio/$GROUP
1287 */
1288static void __vfio_group_unset_container(struct vfio_group *group)
1289{
1290        struct vfio_container *container = group->container;
1291        struct vfio_iommu_driver *driver;
1292
1293        down_write(&container->group_lock);
1294
1295        driver = container->iommu_driver;
1296        if (driver)
1297                driver->ops->detach_group(container->iommu_data,
1298                                          group->iommu_group);
1299
1300        group->container = NULL;
1301        wake_up(&group->container_q);
1302        list_del(&group->container_next);
1303
1304        /* Detaching the last group deprivileges a container, remove iommu */
1305        if (driver && list_empty(&container->group_list)) {
1306                driver->ops->release(container->iommu_data);
1307                module_put(driver->ops->owner);
1308                container->iommu_driver = NULL;
1309                container->iommu_data = NULL;
1310        }
1311
1312        up_write(&container->group_lock);
1313
1314        vfio_container_put(container);
1315}
1316
1317/*
1318 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1319 * if there was no container to unset.  Since the ioctl is called on
 1320 * the group, we know that it still exists, therefore the only valid
1321 * transition here is 1->0.
1322 */
1323static int vfio_group_unset_container(struct vfio_group *group)
1324{
1325        int users = atomic_cmpxchg(&group->container_users, 1, 0);
1326
1327        if (!users)
1328                return -EINVAL;
1329        if (users != 1)
1330                return -EBUSY;
1331
1332        __vfio_group_unset_container(group);
1333
1334        return 0;
1335}
1336
1337/*
1338 * When removing container users, anything that removes the last user
1339 * implicitly removes the group from the container.  That is, if the
1340 * group file descriptor is closed, as well as any device file descriptors,
1341 * the group is free.
1342 */
1343static void vfio_group_try_dissolve_container(struct vfio_group *group)
1344{
1345        if (0 == atomic_dec_if_positive(&group->container_users))
1346                __vfio_group_unset_container(group);
1347}
1348
1349static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1350{
1351        struct fd f;
1352        struct vfio_container *container;
1353        struct vfio_iommu_driver *driver;
1354        int ret = 0;
1355
1356        if (atomic_read(&group->container_users))
1357                return -EINVAL;
1358
1359        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1360                return -EPERM;
1361
1362        f = fdget(container_fd);
1363        if (!f.file)
1364                return -EBADF;
1365
1366        /* Sanity check, is this really our fd? */
1367        if (f.file->f_op != &vfio_fops) {
1368                fdput(f);
1369                return -EINVAL;
1370        }
1371
1372        container = f.file->private_data;
1373        WARN_ON(!container); /* fget ensures we don't race vfio_release */
1374
1375        down_write(&container->group_lock);
1376
1377        /* Real groups and fake groups cannot mix */
1378        if (!list_empty(&container->group_list) &&
1379            container->noiommu != group->noiommu) {
1380                ret = -EPERM;
1381                goto unlock_out;
1382        }
1383
1384        driver = container->iommu_driver;
1385        if (driver) {
1386                ret = driver->ops->attach_group(container->iommu_data,
1387                                                group->iommu_group);
1388                if (ret)
1389                        goto unlock_out;
1390        }
1391
1392        group->container = container;
1393        container->noiommu = group->noiommu;
1394        list_add(&group->container_next, &container->group_list);
1395
1396        /* Get a reference on the container and mark a user within the group */
1397        vfio_container_get(container);
1398        atomic_inc(&group->container_users);
1399
1400unlock_out:
1401        up_write(&container->group_lock);
1402        fdput(f);
1403        return ret;
1404}
1405
1406static bool vfio_group_viable(struct vfio_group *group)
1407{
1408        return (iommu_group_for_each_dev(group->iommu_group,
1409                                         group, vfio_dev_viable) == 0);
1410}
1411
1412static int vfio_group_add_container_user(struct vfio_group *group)
1413{
1414        if (!atomic_inc_not_zero(&group->container_users))
1415                return -EINVAL;
1416
1417        if (group->noiommu) {
1418                atomic_dec(&group->container_users);
1419                return -EPERM;
1420        }
1421        if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1422                atomic_dec(&group->container_users);
1423                return -EINVAL;
1424        }
1425
1426        return 0;
1427}
1428
1429static const struct file_operations vfio_device_fops;
1430
1431static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1432{
1433        struct vfio_device *device;
1434        struct file *filep;
1435        int ret;
1436
1437        if (0 == atomic_read(&group->container_users) ||
1438            !group->container->iommu_driver || !vfio_group_viable(group))
1439                return -EINVAL;
1440
1441        if (group->noiommu && !capable(CAP_SYS_RAWIO))
1442                return -EPERM;
1443
1444        device = vfio_device_get_from_name(group, buf);
1445        if (IS_ERR(device))
1446                return PTR_ERR(device);
1447
1448        ret = device->ops->open(device->device_data);
1449        if (ret) {
1450                vfio_device_put(device);
1451                return ret;
1452        }
1453
1454        /*
1455         * We can't use anon_inode_getfd() because we need to modify
1456         * the f_mode flags directly to allow more than just ioctls
1457         */
1458        ret = get_unused_fd_flags(O_CLOEXEC);
1459        if (ret < 0) {
1460                device->ops->release(device->device_data);
1461                vfio_device_put(device);
1462                return ret;
1463        }
1464
1465        filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1466                                   device, O_RDWR);
1467        if (IS_ERR(filep)) {
1468                put_unused_fd(ret);
1469                ret = PTR_ERR(filep);
1470                device->ops->release(device->device_data);
1471                vfio_device_put(device);
1472                return ret;
1473        }
1474
1475        /*
1476         * TODO: add an anon_inode interface to do this.
1477         * Appears to be missing by lack of need rather than
1478         * explicitly prevented.  Now there's need.
1479         */
1480        filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1481
1482        atomic_inc(&group->container_users);
1483
1484        fd_install(ret, filep);
1485
1486        if (group->noiommu)
1487                dev_warn(device->dev, "vfio-noiommu device opened by user "
1488                         "(%s:%d)\n", current->comm, task_pid_nr(current));
1489
1490        return ret;
1491}
1492
1493static long vfio_group_fops_unl_ioctl(struct file *filep,
1494                                      unsigned int cmd, unsigned long arg)
1495{
1496        struct vfio_group *group = filep->private_data;
1497        long ret = -ENOTTY;
1498
1499        switch (cmd) {
1500        case VFIO_GROUP_GET_STATUS:
1501        {
1502                struct vfio_group_status status;
1503                unsigned long minsz;
1504
1505                minsz = offsetofend(struct vfio_group_status, flags);
1506
1507                if (copy_from_user(&status, (void __user *)arg, minsz))
1508                        return -EFAULT;
1509
1510                if (status.argsz < minsz)
1511                        return -EINVAL;
1512
1513                status.flags = 0;
1514
1515                if (vfio_group_viable(group))
1516                        status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1517
1518                if (group->container)
1519                        status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1520
1521                if (copy_to_user((void __user *)arg, &status, minsz))
1522                        return -EFAULT;
1523
1524                ret = 0;
1525                break;
1526        }
1527        case VFIO_GROUP_SET_CONTAINER:
1528        {
1529                int fd;
1530
1531                if (get_user(fd, (int __user *)arg))
1532                        return -EFAULT;
1533
1534                if (fd < 0)
1535                        return -EINVAL;
1536
1537                ret = vfio_group_set_container(group, fd);
1538                break;
1539        }
1540        case VFIO_GROUP_UNSET_CONTAINER:
1541                ret = vfio_group_unset_container(group);
1542                break;
1543        case VFIO_GROUP_GET_DEVICE_FD:
1544        {
1545                char *buf;
1546
1547                buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1548                if (IS_ERR(buf))
1549                        return PTR_ERR(buf);
1550
1551                ret = vfio_group_get_device_fd(group, buf);
1552                kfree(buf);
1553                break;
1554        }
1555        }
1556
1557        return ret;
1558}
1559
1560static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1561{
1562        struct vfio_group *group;
1563        int opened;
1564
1565        group = vfio_group_get_from_minor(iminor(inode));
1566        if (!group)
1567                return -ENODEV;
1568
1569        if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1570                vfio_group_put(group);
1571                return -EPERM;
1572        }
1573
1574        /* Do we need multiple instances of the group open?  Seems not. */
1575        opened = atomic_cmpxchg(&group->opened, 0, 1);
1576        if (opened) {
1577                vfio_group_put(group);
1578                return -EBUSY;
1579        }
1580
1581        /* Is something still in use from a previous open? */
1582        if (group->container) {
1583                atomic_dec(&group->opened);
1584                vfio_group_put(group);
1585                return -EBUSY;
1586        }
1587
1588        /* Warn if a previous user didn't clean up; re-init to drop stale entries */
1589        if (WARN_ON(group->notifier.head))
1590                BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1591
1592        filep->private_data = group;
1593
1594        return 0;
1595}
1596
1597static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1598{
1599        struct vfio_group *group = filep->private_data;
1600
1601        filep->private_data = NULL;
1602
1603        vfio_group_try_dissolve_container(group);
1604
1605        atomic_dec(&group->opened);
1606
1607        vfio_group_put(group);
1608
1609        return 0;
1610}
1611
1612static const struct file_operations vfio_group_fops = {
1613        .owner          = THIS_MODULE,
1614        .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1615        .compat_ioctl   = compat_ptr_ioctl,
1616        .open           = vfio_group_fops_open,
1617        .release        = vfio_group_fops_release,
1618};
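
/*
 * A minimal user-space sketch (not part of this file) of the group fd
 * protocol served by the ioctls above, following Documentation/vfio.rst.
 * The group number and device name are examples only and most error
 * checking is omitted.
 */
#if 0   /* user-space example, not kernel code */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int example_open_device(void)
{
        int container = open("/dev/vfio/vfio", O_RDWR);
        int group = open("/dev/vfio/26", O_RDWR);       /* example group number */
        struct vfio_group_status status = { .argsz = sizeof(status) };

        ioctl(group, VFIO_GROUP_GET_STATUS, &status);
        if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
                return -1;      /* not all devices in the group are bound to vfio */

        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
        ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

        /* returns a new fd served by vfio_device_fops below */
        return ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
}
#endif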
1619
1620/**
1621 * VFIO Device fd
1622 */
1623static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1624{
1625        struct vfio_device *device = filep->private_data;
1626
1627        device->ops->release(device->device_data);
1628
1629        vfio_group_try_dissolve_container(device->group);
1630
1631        vfio_device_put(device);
1632
1633        return 0;
1634}
1635
1636static long vfio_device_fops_unl_ioctl(struct file *filep,
1637                                       unsigned int cmd, unsigned long arg)
1638{
1639        struct vfio_device *device = filep->private_data;
1640
1641        if (unlikely(!device->ops->ioctl))
1642                return -EINVAL;
1643
1644        return device->ops->ioctl(device->device_data, cmd, arg);
1645}
1646
1647static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1648                                     size_t count, loff_t *ppos)
1649{
1650        struct vfio_device *device = filep->private_data;
1651
1652        if (unlikely(!device->ops->read))
1653                return -EINVAL;
1654
1655        return device->ops->read(device->device_data, buf, count, ppos);
1656}
1657
1658static ssize_t vfio_device_fops_write(struct file *filep,
1659                                      const char __user *buf,
1660                                      size_t count, loff_t *ppos)
1661{
1662        struct vfio_device *device = filep->private_data;
1663
1664        if (unlikely(!device->ops->write))
1665                return -EINVAL;
1666
1667        return device->ops->write(device->device_data, buf, count, ppos);
1668}
1669
1670static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1671{
1672        struct vfio_device *device = filep->private_data;
1673
1674        if (unlikely(!device->ops->mmap))
1675                return -EINVAL;
1676
1677        return device->ops->mmap(device->device_data, vma);
1678}
1679
1680static const struct file_operations vfio_device_fops = {
1681        .owner          = THIS_MODULE,
1682        .release        = vfio_device_fops_release,
1683        .read           = vfio_device_fops_read,
1684        .write          = vfio_device_fops_write,
1685        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1686        .compat_ioctl   = compat_ptr_ioctl,
1687        .mmap           = vfio_device_fops_mmap,
1688};
1689
1690/**
1691 * External user API, exported as symbols to be used by other kernel modules.
1692 *
1693 * The protocol includes:
1694 *  1. do normal VFIO init operation:
1695 *      - opening a new container;
1696 *      - attaching group(s) to it;
1697 *      - setting an IOMMU driver for a container.
1698 * When IOMMU is set for a container, all groups in it are
1699 * considered ready to use by an external user.
1700 *
1701 * 2. User space passes a group fd to an external user.
1702 * The external user calls vfio_group_get_external_user()
1703 * to verify that:
1704 *      - the group is initialized;
1705 *      - IOMMU is set for it.
1706 * If both checks passed, vfio_group_get_external_user()
1707 * increments the container user counter to prevent
1708 * the VFIO group from disposal before KVM exits.
1709 *
1710 * 3. The external user calls vfio_external_user_iommu_id()
1711 * to obtain the IOMMU group ID.
1712 *
1713 * 4. When the external KVM finishes, it calls
1714 * vfio_group_put_external_user() to release the VFIO group.
1715 * This call decrements the container user counter.
1716 */
1717struct vfio_group *vfio_group_get_external_user(struct file *filep)
1718{
1719        struct vfio_group *group = filep->private_data;
1720        int ret;
1721
1722        if (filep->f_op != &vfio_group_fops)
1723                return ERR_PTR(-EINVAL);
1724
1725        ret = vfio_group_add_container_user(group);
1726        if (ret)
1727                return ERR_PTR(ret);
1728
1729        vfio_group_get(group);
1730
1731        return group;
1732}
1733EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
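
/*
 * A minimal sketch, assuming an external module (KVM-like) is handed a
 * group fd from user space and follows the protocol above.  The function
 * name and message are hypothetical; only the vfio_* calls are exports
 * from this file.
 */
#if 0   /* example, not compiled */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/vfio.h>

static int example_use_group_fd(int fd)
{
        struct file *filep = fget(fd);
        struct vfio_group *group;

        if (!filep)
                return -EBADF;

        group = vfio_group_get_external_user(filep);
        fput(filep);            /* the group now holds its own references */
        if (IS_ERR(group))
                return PTR_ERR(group);

        pr_info("attached to IOMMU group %d\n",
                vfio_external_user_iommu_id(group));

        /* ... external use of the group ... */

        vfio_group_put_external_user(group);
        return 0;
}
#endif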
1734
1735/**
1736 * External user API, exported as symbols to be used by other kernel modules.
1737 * The external user passes in a device pointer
1738 * to verify that:
1739 *      - A VFIO group is associated with the device;
1740 *      - IOMMU is set for the group.
1741 * If both checks passed, vfio_group_get_external_user_from_dev()
1742 * increments the container user counter to prevent the VFIO group
1743 * from disposal before the external user exits, and returns a pointer
1744 * to the VFIO group.
1745 *
1746 * When the external user finishes using the VFIO group, it calls
1747 * vfio_group_put_external_user() to release the VFIO group and
1748 * decrement the container user counter.
1749 *
1750 * @dev [in]    : device
1751 * Return error PTR or pointer to VFIO group.
1752 */
1753
1754struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1755{
1756        struct vfio_group *group;
1757        int ret;
1758
1759        group = vfio_group_get_from_dev(dev);
1760        if (!group)
1761                return ERR_PTR(-ENODEV);
1762
1763        ret = vfio_group_add_container_user(group);
1764        if (ret) {
1765                vfio_group_put(group);
1766                return ERR_PTR(ret);
1767        }
1768
1769        return group;
1770}
1771EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
1772
1773void vfio_group_put_external_user(struct vfio_group *group)
1774{
1775        vfio_group_try_dissolve_container(group);
1776        vfio_group_put(group);
1777}
1778EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1779
1780bool vfio_external_group_match_file(struct vfio_group *test_group,
1781                                    struct file *filep)
1782{
1783        struct vfio_group *group = filep->private_data;
1784
1785        return (filep->f_op == &vfio_group_fops) && (group == test_group);
1786}
1787EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1788
1789int vfio_external_user_iommu_id(struct vfio_group *group)
1790{
1791        return iommu_group_id(group->iommu_group);
1792}
1793EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1794
1795long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1796{
1797        return vfio_ioctl_check_extension(group->container, arg);
1798}
1799EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1800
1801/**
1802 * Sub-module support
1803 */
1804/*
1805 * Helper for managing a buffer of info chain capabilities: allocate or
1806 * reallocate a buffer with additional @size, filling in @id and @version
1807 * of the capability.  A pointer to the new capability is returned.
1808 *
1809 * NB. The chain is based at the head of the buffer, so new entries are
1810 * added to the tail; vfio_info_cap_shift() should be called to fix up the
1811 * next offsets prior to copying to the user buffer.
1812 */
1813struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1814                                               size_t size, u16 id, u16 version)
1815{
1816        void *buf;
1817        struct vfio_info_cap_header *header, *tmp;
1818
1819        buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1820        if (!buf) {
1821                kfree(caps->buf);
1822                caps->size = 0;
1823                return ERR_PTR(-ENOMEM);
1824        }
1825
1826        caps->buf = buf;
1827        header = buf + caps->size;
1828
1829        /* Eventually copied to the user buffer, so zero it */
1830        memset(header, 0, size);
1831
1832        header->id = id;
1833        header->version = version;
1834
1835        /* Add to the end of the capability chain */
1836        for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1837                ; /* nothing */
1838
1839        tmp->next = caps->size;
1840        caps->size += size;
1841
1842        return header;
1843}
1844EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1845
1846void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1847{
1848        struct vfio_info_cap_header *tmp;
1849        void *buf = (void *)caps->buf;
1850
1851        for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1852                tmp->next += offset;
1853}
1854EXPORT_SYMBOL(vfio_info_cap_shift);
1855
1856int vfio_info_add_capability(struct vfio_info_cap *caps,
1857                             struct vfio_info_cap_header *cap, size_t size)
1858{
1859        struct vfio_info_cap_header *header;
1860
1861        header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1862        if (IS_ERR(header))
1863                return PTR_ERR(header);
1864
1865        memcpy(header + 1, cap + 1, size - sizeof(*header));
1866
1867        return 0;
1868}
1869EXPORT_SYMBOL(vfio_info_add_capability);
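
/*
 * A minimal sketch of how a bus/vendor driver's GET_REGION_INFO handler
 * might use these helpers, assuming "info" is its struct vfio_region_info
 * and "arg" the ioctl's user pointer; the type/subtype values below are
 * placeholders.
 */
#if 0   /* example fragment, not compiled */
        struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
        struct vfio_region_info_cap_type cap_type = {
                .header.id = VFIO_REGION_INFO_CAP_TYPE,
                .header.version = 1,
                .type = 1,      /* placeholder region type */
                .subtype = 1,   /* placeholder region subtype */
        };
        int ret;

        ret = vfio_info_add_capability(&caps, &cap_type.header,
                                       sizeof(cap_type));
        if (ret)
                return ret;

        if (caps.size) {
                info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
                if (info.argsz < sizeof(info) + caps.size) {
                        /* report the size needed to hold the chain */
                        info.argsz = sizeof(info) + caps.size;
                        info.cap_offset = 0;
                } else {
                        /* chain is copied out right after struct vfio_region_info */
                        vfio_info_cap_shift(&caps, sizeof(info));
                        if (copy_to_user((void __user *)arg + sizeof(info),
                                         caps.buf, caps.size)) {
                                kfree(caps.buf);
                                return -EFAULT;
                        }
                        info.cap_offset = sizeof(info);
                }
                kfree(caps.buf);
        }
#endif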
1870
1871int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1872                                       int max_irq_type, size_t *data_size)
1873{
1874        unsigned long minsz;
1875        size_t size;
1876
1877        minsz = offsetofend(struct vfio_irq_set, count);
1878
1879        if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1880            (hdr->count >= (U32_MAX - hdr->start)) ||
1881            (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1882                                VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1883                return -EINVAL;
1884
1885        if (data_size)
1886                *data_size = 0;
1887
1888        if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1889                return -EINVAL;
1890
1891        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1892        case VFIO_IRQ_SET_DATA_NONE:
1893                size = 0;
1894                break;
1895        case VFIO_IRQ_SET_DATA_BOOL:
1896                size = sizeof(uint8_t);
1897                break;
1898        case VFIO_IRQ_SET_DATA_EVENTFD:
1899                size = sizeof(int32_t);
1900                break;
1901        default:
1902                return -EINVAL;
1903        }
1904
1905        if (size) {
1906                if (hdr->argsz - minsz < hdr->count * size)
1907                        return -EINVAL;
1908
1909                if (!data_size)
1910                        return -EINVAL;
1911
1912                *data_size = hdr->count * size;
1913        }
1914
1915        return 0;
1916}
1917EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
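
/*
 * A minimal sketch of the VFIO_DEVICE_SET_IRQS handler pattern this helper
 * supports, for a vfio-pci-style device.  "arg" is assumed to be the ioctl
 * argument and "num_irqs" the device's interrupt count; error handling is
 * abbreviated.
 */
#if 0   /* example fragment, not compiled */
        unsigned long minsz = offsetofend(struct vfio_irq_set, count);
        struct vfio_irq_set hdr;
        size_t data_size = 0;
        u8 *data = NULL;
        int ret;

        if (copy_from_user(&hdr, (void __user *)arg, minsz))
                return -EFAULT;

        ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
                                                 VFIO_PCI_NUM_IRQS, &data_size);
        if (ret)
                return ret;

        if (data_size) {
                data = memdup_user((void __user *)(arg + minsz), data_size);
                if (IS_ERR(data))
                        return PTR_ERR(data);
        }

        /* ... program hdr.index/start/count using data ... */

        kfree(data);
        return ret;
#endif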
1918
1919/*
1920 * Pin a set of guest PFNs and return their associated host PFNs for the local
1921 * domain only.
1922 * @dev [in]     : device
1923 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1924 * @npage [in]   : count of elements in user_pfn array.  This count should not
1925 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1926 * @prot [in]    : protection flags
1927 * @phys_pfn[out]: array of host PFNs
1928 * Return error or number of pages pinned.
1929 */
1930int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1931                   int prot, unsigned long *phys_pfn)
1932{
1933        struct vfio_container *container;
1934        struct vfio_group *group;
1935        struct vfio_iommu_driver *driver;
1936        int ret;
1937
1938        if (!dev || !user_pfn || !phys_pfn || !npage)
1939                return -EINVAL;
1940
1941        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1942                return -E2BIG;
1943
1944        group = vfio_group_get_from_dev(dev);
1945        if (!group)
1946                return -ENODEV;
1947
1948        ret = vfio_group_add_container_user(group);
1949        if (ret)
1950                goto err_pin_pages;
1951
1952        container = group->container;
1953        driver = container->iommu_driver;
1954        if (likely(driver && driver->ops->pin_pages))
1955                ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1956                                             npage, prot, phys_pfn);
1957        else
1958                ret = -ENOTTY;
1959
1960        vfio_group_try_dissolve_container(group);
1961
1962err_pin_pages:
1963        vfio_group_put(group);
1964        return ret;
1965}
1966EXPORT_SYMBOL(vfio_pin_pages);
1967
1968/*
1969 * Unpin a set of host PFNs for the local domain only.
1970 * @dev [in]     : device
1971 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1972 *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1973 * @npage [in]   : count of elements in user_pfn array.  This count should not
1974 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1975 * Return error or number of pages unpinned.
1976 */
1977int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1978{
1979        struct vfio_container *container;
1980        struct vfio_group *group;
1981        struct vfio_iommu_driver *driver;
1982        int ret;
1983
1984        if (!dev || !user_pfn || !npage)
1985                return -EINVAL;
1986
1987        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1988                return -E2BIG;
1989
1990        group = vfio_group_get_from_dev(dev);
1991        if (!group)
1992                return -ENODEV;
1993
1994        ret = vfio_group_add_container_user(group);
1995        if (ret)
1996                goto err_unpin_pages;
1997
1998        container = group->container;
1999        driver = container->iommu_driver;
2000        if (likely(driver && driver->ops->unpin_pages))
2001                ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2002                                               npage);
2003        else
2004                ret = -ENOTTY;
2005
2006        vfio_group_try_dissolve_container(group);
2007
2008err_unpin_pages:
2009        vfio_group_put(group);
2010        return ret;
2011}
2012EXPORT_SYMBOL(vfio_unpin_pages);
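
/*
 * A minimal sketch of how a mediated-device vendor driver might pin a
 * single guest page for software access and unpin it afterwards.  "mdev"
 * and "gfn" are assumed to be in scope, mdev_dev() comes from
 * <linux/mdev.h>, and the prot flags from <linux/iommu.h>.
 */
#if 0   /* example fragment, not compiled */
        unsigned long user_pfn = gfn;   /* guest IOVA PFN */
        unsigned long phys_pfn;
        int ret;

        ret = vfio_pin_pages(mdev_dev(mdev), &user_pfn, 1,
                             IOMMU_READ | IOMMU_WRITE, &phys_pfn);
        if (ret != 1)
                return ret < 0 ? ret : -EFAULT;

        /* ... access the pinned page, e.g. via pfn_to_page(phys_pfn) ... */

        vfio_unpin_pages(mdev_dev(mdev), &user_pfn, 1);
#endif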
2013
2014/*
2015 * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
2016 * VFIO group.
2017 *
2018 * The caller needs to call vfio_group_get_external_user() or
2019 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2020 * so as to prevent the VFIO group from disposal in the middle of the call.
2021 * But it can keep the reference to the VFIO group for several calls into
2022 * this interface.
2023 * When done using the VFIO group, the caller needs to release the
2024 * VFIO group by calling vfio_group_put_external_user().
2025 *
2026 * @group [in]          : VFIO group
2027 * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be pinned.
2028 * @npage [in]          : count of elements in user_iova_pfn array.
2029 *                        This count should not be greater than
2030 *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2031 * @prot [in]           : protection flags
2032 * @phys_pfn [out]      : array of host PFNs
2033 * Return error or number of pages pinned.
2034 */
2035int vfio_group_pin_pages(struct vfio_group *group,
2036                         unsigned long *user_iova_pfn, int npage,
2037                         int prot, unsigned long *phys_pfn)
2038{
2039        struct vfio_container *container;
2040        struct vfio_iommu_driver *driver;
2041        int ret;
2042
2043        if (!group || !user_iova_pfn || !phys_pfn || !npage)
2044                return -EINVAL;
2045
2046        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2047                return -E2BIG;
2048
2049        container = group->container;
2050        driver = container->iommu_driver;
2051        if (likely(driver && driver->ops->pin_pages))
2052                ret = driver->ops->pin_pages(container->iommu_data,
2053                                             user_iova_pfn, npage,
2054                                             prot, phys_pfn);
2055        else
2056                ret = -ENOTTY;
2057
2058        return ret;
2059}
2060EXPORT_SYMBOL(vfio_group_pin_pages);
2061
2062/*
2063 * Unpin a set of guest IOVA PFNs for a VFIO group.
2064 *
2065 * The caller needs to call vfio_group_get_external_user() or
2066 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2067 * so as to prevent the VFIO group from disposal in the middle of the call.
2068 * But it can keep the reference to the VFIO group for several calls into
2069 * this interface.
2070 * When done using the VFIO group, the caller needs to release the
2071 * VFIO group by calling vfio_group_put_external_user().
2072 *
2073 * @group [in]          : vfio group
2074 * @user_iova_pfn [in]  : array of user/guest IOVA PFNs to be unpinned.
2075 * @npage [in]          : count of elements in user_iova_pfn array.
2076 *                        This count should not be greater than
2077 *                        VFIO_PIN_PAGES_MAX_ENTRIES.
2078 * Return error or number of pages unpinned.
2079 */
2080int vfio_group_unpin_pages(struct vfio_group *group,
2081                           unsigned long *user_iova_pfn, int npage)
2082{
2083        struct vfio_container *container;
2084        struct vfio_iommu_driver *driver;
2085        int ret;
2086
2087        if (!group || !user_iova_pfn || !npage)
2088                return -EINVAL;
2089
2090        if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2091                return -E2BIG;
2092
2093        container = group->container;
2094        driver = container->iommu_driver;
2095        if (likely(driver && driver->ops->unpin_pages))
2096                ret = driver->ops->unpin_pages(container->iommu_data,
2097                                               user_iova_pfn, npage);
2098        else
2099                ret = -ENOTTY;
2100
2101        return ret;
2102}
2103EXPORT_SYMBOL(vfio_group_unpin_pages);
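
/*
 * A minimal sketch of the group-based variant, holding the external-user
 * reference that the comments above require.  "dev" and "iova_pfn" are
 * assumed to be in scope.
 */
#if 0   /* example fragment, not compiled */
        struct vfio_group *group;
        unsigned long phys_pfn;
        int ret;

        group = vfio_group_get_external_user_from_dev(dev);
        if (IS_ERR(group))
                return PTR_ERR(group);

        ret = vfio_group_pin_pages(group, &iova_pfn, 1,
                                   IOMMU_READ | IOMMU_WRITE, &phys_pfn);
        if (ret == 1) {
                /* ... use phys_pfn ... */
                vfio_group_unpin_pages(group, &iova_pfn, 1);
        }

        vfio_group_put_external_user(group);
        return ret == 1 ? 0 : ret;
#endif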
2104
2105
2106/*
2107 * This interface allows the CPUs to perform DMA on behalf of the device,
2108 * i.e. a software-emulated, CPU-driven DMA.
2109 *
2110 * CPUs read/write from/into a range of IOVAs pointing to user space memory
2111 * into/from a kernel buffer.
2112 *
2113 * As the read/write of user space memory is conducted via the CPUs and is
2114 * not a real device DMA, it is not necessary to pin the user space memory.
2115 *
2116 * The caller needs to call vfio_group_get_external_user() or
2117 * vfio_group_get_external_user_from_dev() prior to calling this interface,
2118 * so as to prevent the VFIO group from disposal in the middle of the call.
2119 * But it can keep the reference to the VFIO group for several calls into
2120 * this interface.
2121 * When done using the VFIO group, the caller needs to release the
2122 * VFIO group by calling vfio_group_put_external_user().
2123 *
2124 * @group [in]          : VFIO group
2125 * @user_iova [in]      : base IOVA of a user space buffer
2126 * @data [in]           : pointer to kernel buffer
2127 * @len [in]            : kernel buffer length
2128 * @write               : indicate read or write
2129 * Return error code on failure or 0 on success.
2130 */
2131int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2132                void *data, size_t len, bool write)
2133{
2134        struct vfio_container *container;
2135        struct vfio_iommu_driver *driver;
2136        int ret = 0;
2137
2138        if (!group || !data || len <= 0)
2139                return -EINVAL;
2140
2141        container = group->container;
2142        driver = container->iommu_driver;
2143
2144        if (likely(driver && driver->ops->dma_rw))
2145                ret = driver->ops->dma_rw(container->iommu_data,
2146                                          user_iova, data, len, write);
2147        else
2148                ret = -ENOTTY;
2149
2150        return ret;
2151}
2152EXPORT_SYMBOL(vfio_dma_rw);
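
/*
 * A minimal sketch: read a 4-byte value from guest memory at "guest_iova"
 * while holding an external-user group reference.  "dev" and "guest_iova"
 * are assumed to be in scope.
 */
#if 0   /* example fragment, not compiled */
        struct vfio_group *group;
        u32 val;
        int ret;

        group = vfio_group_get_external_user_from_dev(dev);
        if (IS_ERR(group))
                return PTR_ERR(group);

        /* write == false: copy from the IOVA range into the kernel buffer */
        ret = vfio_dma_rw(group, guest_iova, &val, sizeof(val), false);

        vfio_group_put_external_user(group);
        return ret;
#endif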
2153
2154static int vfio_register_iommu_notifier(struct vfio_group *group,
2155                                        unsigned long *events,
2156                                        struct notifier_block *nb)
2157{
2158        struct vfio_container *container;
2159        struct vfio_iommu_driver *driver;
2160        int ret;
2161
2162        ret = vfio_group_add_container_user(group);
2163        if (ret)
2164                return -EINVAL;
2165
2166        container = group->container;
2167        driver = container->iommu_driver;
2168        if (likely(driver && driver->ops->register_notifier))
2169                ret = driver->ops->register_notifier(container->iommu_data,
2170                                                     events, nb);
2171        else
2172                ret = -ENOTTY;
2173
2174        vfio_group_try_dissolve_container(group);
2175
2176        return ret;
2177}
2178
2179static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2180                                          struct notifier_block *nb)
2181{
2182        struct vfio_container *container;
2183        struct vfio_iommu_driver *driver;
2184        int ret;
2185
2186        ret = vfio_group_add_container_user(group);
2187        if (ret)
2188                return -EINVAL;
2189
2190        container = group->container;
2191        driver = container->iommu_driver;
2192        if (likely(driver && driver->ops->unregister_notifier))
2193                ret = driver->ops->unregister_notifier(container->iommu_data,
2194                                                       nb);
2195        else
2196                ret = -ENOTTY;
2197
2198        vfio_group_try_dissolve_container(group);
2199
2200        return ret;
2201}
2202
2203void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2204{
2205        group->kvm = kvm;
2206        blocking_notifier_call_chain(&group->notifier,
2207                                VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2208}
2209EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2210
2211static int vfio_register_group_notifier(struct vfio_group *group,
2212                                        unsigned long *events,
2213                                        struct notifier_block *nb)
2214{
2215        int ret;
2216        bool set_kvm = false;
2217
2218        if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2219                set_kvm = true;
2220
2221        /* clear known events */
2222        *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2223
2224        /* refuse to continue if any unknown events remain */
2225        if (*events)
2226                return -EINVAL;
2227
2228        ret = vfio_group_add_container_user(group);
2229        if (ret)
2230                return -EINVAL;
2231
2232        ret = blocking_notifier_chain_register(&group->notifier, nb);
2233
2234        /*
2235         * The attach of kvm and vfio_group may have already happened, so
2236         * replay it once upon registration.
2237         */
2238        if (!ret && set_kvm && group->kvm)
2239                blocking_notifier_call_chain(&group->notifier,
2240                                        VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2241
2242        vfio_group_try_dissolve_container(group);
2243
2244        return ret;
2245}
2246
2247static int vfio_unregister_group_notifier(struct vfio_group *group,
2248                                         struct notifier_block *nb)
2249{
2250        int ret;
2251
2252        ret = vfio_group_add_container_user(group);
2253        if (ret)
2254                return -EINVAL;
2255
2256        ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2257
2258        vfio_group_try_dissolve_container(group);
2259
2260        return ret;
2261}
2262
2263int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2264                           unsigned long *events, struct notifier_block *nb)
2265{
2266        struct vfio_group *group;
2267        int ret;
2268
2269        if (!dev || !nb || !events || (*events == 0))
2270                return -EINVAL;
2271
2272        group = vfio_group_get_from_dev(dev);
2273        if (!group)
2274                return -ENODEV;
2275
2276        switch (type) {
2277        case VFIO_IOMMU_NOTIFY:
2278                ret = vfio_register_iommu_notifier(group, events, nb);
2279                break;
2280        case VFIO_GROUP_NOTIFY:
2281                ret = vfio_register_group_notifier(group, events, nb);
2282                break;
2283        default:
2284                ret = -EINVAL;
2285        }
2286
2287        vfio_group_put(group);
2288        return ret;
2289}
2290EXPORT_SYMBOL(vfio_register_notifier);
2291
2292int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2293                             struct notifier_block *nb)
2294{
2295        struct vfio_group *group;
2296        int ret;
2297
2298        if (!dev || !nb)
2299                return -EINVAL;
2300
2301        group = vfio_group_get_from_dev(dev);
2302        if (!group)
2303                return -ENODEV;
2304
2305        switch (type) {
2306        case VFIO_IOMMU_NOTIFY:
2307                ret = vfio_unregister_iommu_notifier(group, nb);
2308                break;
2309        case VFIO_GROUP_NOTIFY:
2310                ret = vfio_unregister_group_notifier(group, nb);
2311                break;
2312        default:
2313                ret = -EINVAL;
2314        }
2315
2316        vfio_group_put(group);
2317        return ret;
2318}
2319EXPORT_SYMBOL(vfio_unregister_notifier);
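
/*
 * A minimal sketch of how a vendor driver might subscribe to DMA unmap
 * events so it can drop its pinned pages.  The callback and function
 * names are hypothetical; the event bit and unmap structure come from
 * <linux/vfio.h>.
 */
#if 0   /* example, not compiled */
#include <linux/notifier.h>
#include <linux/vfio.h>

static int example_iommu_notify(struct notifier_block *nb,
                                unsigned long action, void *data)
{
        if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
                struct vfio_iommu_type1_dma_unmap *unmap = data;

                /* unpin anything pinned in [unmap->iova, unmap->iova + unmap->size) */
        }
        return NOTIFY_OK;
}

static struct notifier_block example_iommu_nb = {
        .notifier_call = example_iommu_notify,
};

static int example_register(struct device *dev)
{
        unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

        /* unsupported event bits make registration fail */
        return vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events,
                                      &example_iommu_nb);
}

static void example_unregister(struct device *dev)
{
        vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, &example_iommu_nb);
}
#endif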
2320
2321/**
2322 * Module/class support
2323 */
2324static char *vfio_devnode(struct device *dev, umode_t *mode)
2325{
2326        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2327}
2328
2329static struct miscdevice vfio_dev = {
2330        .minor = VFIO_MINOR,
2331        .name = "vfio",
2332        .fops = &vfio_fops,
2333        .nodename = "vfio/vfio",
2334        .mode = S_IRUGO | S_IWUGO,
2335};
2336
2337static int __init vfio_init(void)
2338{
2339        int ret;
2340
2341        idr_init(&vfio.group_idr);
2342        mutex_init(&vfio.group_lock);
2343        mutex_init(&vfio.iommu_drivers_lock);
2344        INIT_LIST_HEAD(&vfio.group_list);
2345        INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2346        init_waitqueue_head(&vfio.release_q);
2347
2348        ret = misc_register(&vfio_dev);
2349        if (ret) {
2350                pr_err("vfio: misc device register failed\n");
2351                return ret;
2352        }
2353
2354        /* /dev/vfio/$GROUP */
2355        vfio.class = class_create(THIS_MODULE, "vfio");
2356        if (IS_ERR(vfio.class)) {
2357                ret = PTR_ERR(vfio.class);
2358                goto err_class;
2359        }
2360
2361        vfio.class->devnode = vfio_devnode;
2362
2363        ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2364        if (ret)
2365                goto err_alloc_chrdev;
2366
2367        cdev_init(&vfio.group_cdev, &vfio_group_fops);
2368        ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2369        if (ret)
2370                goto err_cdev_add;
2371
2372        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2373
2374#ifdef CONFIG_VFIO_NOIOMMU
2375        vfio_register_iommu_driver(&vfio_noiommu_ops);
2376#endif
2377        return 0;
2378
2379err_cdev_add:
2380        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2381err_alloc_chrdev:
2382        class_destroy(vfio.class);
2383        vfio.class = NULL;
2384err_class:
2385        misc_deregister(&vfio_dev);
2386        return ret;
2387}
2388
2389static void __exit vfio_cleanup(void)
2390{
2391        WARN_ON(!list_empty(&vfio.group_list));
2392
2393#ifdef CONFIG_VFIO_NOIOMMU
2394        vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2395#endif
2396        idr_destroy(&vfio.group_idr);
2397        cdev_del(&vfio.group_cdev);
2398        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2399        class_destroy(vfio.class);
2400        vfio.class = NULL;
2401        misc_deregister(&vfio_dev);
2402}
2403
2404module_init(vfio_init);
2405module_exit(vfio_cleanup);
2406
2407MODULE_VERSION(DRIVER_VERSION);
2408MODULE_LICENSE("GPL v2");
2409MODULE_AUTHOR(DRIVER_AUTHOR);
2410MODULE_DESCRIPTION(DRIVER_DESC);
2411MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2412MODULE_ALIAS("devname:vfio/vfio");
2413MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2414