LXR linux/drivers/vfio/vfio_iommu

   1/*
   2 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   3 *
   4 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5 *     Author: Alex Williamson <alex.williamson@redhat.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio:
  12 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13 * Author: Tom Lyon, pugs@cisco.com
  14 *
  15 * We arbitrarily define a Type1 IOMMU as one matching the below code.
  16 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  17 * VT-d, but that makes it harder to re-use as theoretically anyone
  18 * implementing a similar IOMMU could make use of this.  We expect the
  19 * IOMMU to support the IOMMU API and have few to no restrictions around
  20 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  21 * optimized for relatively static mappings of a userspace process with
  22 * userspace pages pinned into memory.  We also assume devices and IOMMU
  23 * domains are PCI based as the IOMMU API is still centered around a
  24 * device/bus interface rather than a group interface.
  25 */
  26
  27#include <linux/compat.h>
  28#include <linux/device.h>
  29#include <linux/fs.h>
  30#include <linux/iommu.h>
  31#include <linux/module.h>
  32#include <linux/mm.h>
  33#include <linux/pci.h>          /* pci_bus_type */
  34#include <linux/rbtree.h>
  35#include <linux/sched.h>
  36#include <linux/slab.h>
  37#include <linux/uaccess.h>
  38#include <linux/vfio.h>
  39#include <linux/workqueue.h>
  40
  41#define DRIVER_VERSION  "0.2"
  42#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  43#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  44
  45static bool allow_unsafe_interrupts;
  46module_param_named(allow_unsafe_interrupts,
  47                   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
  48MODULE_PARM_DESC(allow_unsafe_interrupts,
  49                 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
  50
  51static bool disable_hugepages;
  52module_param_named(disable_hugepages,
  53                   disable_hugepages, bool, S_IRUGO | S_IWUSR);
  54MODULE_PARM_DESC(disable_hugepages,
  55                 "Disable VFIO IOMMU support for IOMMU hugepages.");
  56
  57struct vfio_iommu {
  58        struct iommu_domain     *domain;
  59        struct mutex            lock;
  60        struct rb_root          dma_list;
  61        struct list_head        group_list;
  62        bool                    cache;
  63};
  64
  65struct vfio_dma {
  66        struct rb_node          node;
  67        dma_addr_t              iova;           /* Device address */
  68        unsigned long           vaddr;          /* Process virtual addr */
  69        size_t                  size;           /* Map size (bytes) */
  70        int                     prot;           /* IOMMU_READ/WRITE */
  71};
  72
  73struct vfio_group {
  74        struct iommu_group      *iommu_group;
  75        struct list_head        next;
  76};
  77
  78/*
  79 * This code handles mapping and unmapping of user data buffers
  80 * into DMA'ble space using the IOMMU
  81 */
  82
  83static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
  84                                      dma_addr_t start, size_t size)
  85{
  86        struct rb_node *node = iommu->dma_list.rb_node;
  87
  88        while (node) {
  89                struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
  90
  91                if (start + size <= dma->iova)
  92                        node = node->rb_left;
  93                else if (start >= dma->iova + dma->size)
  94                        node = node->rb_right;
  95                else
  96                        return dma;
  97        }
  98
  99        return NULL;
 100}
 101
 102static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 103{
 104        struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 105        struct vfio_dma *dma;
 106
 107        while (*link) {
 108                parent = *link;
 109                dma = rb_entry(parent, struct vfio_dma, node);
 110
 111                if (new->iova + new->size <= dma->iova)
 112                        link = &(*link)->rb_left;
 113                else
 114                        link = &(*link)->rb_right;
 115        }
 116
 117        rb_link_node(&new->node, parent, link);
 118        rb_insert_color(&new->node, &iommu->dma_list);
 119}
 120
 121static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 122{
 123        rb_erase(&old->node, &iommu->dma_list);
 124}
 125
 126struct vwork {
 127        struct mm_struct        *mm;
 128        long                    npage;
 129        struct work_struct      work;
 130};
 131
 132/* delayed decrement/increment for locked_vm */
 133static void vfio_lock_acct_bg(struct work_struct *work)
 134{
 135        struct vwork *vwork = container_of(work, struct vwork, work);
 136        struct mm_struct *mm;
 137
 138        mm = vwork->mm;
 139        down_write(&mm->mmap_sem);
 140        mm->locked_vm += vwork->npage;
 141        up_write(&mm->mmap_sem);
 142        mmput(mm);
 143        kfree(vwork);
 144}
 145
 146static void vfio_lock_acct(long npage)
 147{
 148        struct vwork *vwork;
 149        struct mm_struct *mm;
 150
 151        if (!current->mm || !npage)
 152                return; /* process exited or nothing to do */
 153
 154        if (down_write_trylock(&current->mm->mmap_sem)) {
 155                current->mm->locked_vm += npage;
 156                up_write(&current->mm->mmap_sem);
 157                return;
 158        }
 159
 160        /*
 161         * Couldn't get mmap_sem lock, so must setup to update
 162         * mm->locked_vm later. If locked_vm were atomic, we
 163         * wouldn't need this silliness
 164         */
 165        vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 166        if (!vwork)
 167                return;
 168        mm = get_task_mm(current);
 169        if (!mm) {
 170                kfree(vwork);
 171                return;
 172        }
 173        INIT_WORK(&vwork->work, vfio_lock_acct_bg);
 174        vwork->mm = mm;
 175        vwork->npage = npage;
 176        schedule_work(&vwork->work);
 177}
 178
 179/*
 180 * Some mappings aren't backed by a struct page, for example an mmap'd
 181 * MMIO range for our own or another device.  These use a different
 182 * pfn conversion and shouldn't be tracked as locked pages.
 183 */
 184static bool is_invalid_reserved_pfn(unsigned long pfn)
 185{
 186        if (pfn_valid(pfn)) {
 187                bool reserved;
 188                struct page *tail = pfn_to_page(pfn);
 189                struct page *head = compound_trans_head(tail);
 190                reserved = !!(PageReserved(head));
 191                if (head != tail) {
 192                        /*
 193                         * "head" is not a dangling pointer
 194                         * (compound_trans_head takes care of that)
 195                         * but the hugepage may have been split
 196                         * from under us (and we may not hold a
 197                         * reference count on the head page so it can
 198                         * be reused before we run PageReferenced), so
 199                         * we've to check PageTail before returning
 200                         * what we just read.
 201                         */
 202                        smp_rmb();
 203                        if (PageTail(tail))
 204                                return reserved;
 205                }
 206                return PageReserved(tail);
 207        }
 208
 209        return true;
 210}
 211
 212static int put_pfn(unsigned long pfn, int prot)
 213{
 214        if (!is_invalid_reserved_pfn(pfn)) {
 215                struct page *page = pfn_to_page(pfn);
 216                if (prot & IOMMU_WRITE)
 217                        SetPageDirty(page);
 218                put_page(page);
 219                return 1;
 220        }
 221        return 0;
 222}
 223
 224static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 225{
 226        struct page *page[1];
 227        struct vm_area_struct *vma;
 228        int ret = -EFAULT;
 229
 230        if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
 231                *pfn = page_to_pfn(page[0]);
 232                return 0;
 233        }
 234
 235        down_read(&current->mm->mmap_sem);
 236
 237        vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
 238
 239        if (vma && vma->vm_flags & VM_PFNMAP) {
 240                *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 241                if (is_invalid_reserved_pfn(*pfn))
 242                        ret = 0;
 243        }
 244
 245        up_read(&current->mm->mmap_sem);
 246
 247        return ret;
 248}
 249
 250/*
 251 * Attempt to pin pages.  We really don't want to track all the pfns and
 252 * the iommu can only map chunks of consecutive pfns anyway, so get the
 253 * first page and all consecutive pages with the same locking.
 254 */
 255static long vfio_pin_pages(unsigned long vaddr, long npage,
 256                           int prot, unsigned long *pfn_base)
 257{
 258        unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 259        bool lock_cap = capable(CAP_IPC_LOCK);
 260        long ret, i;
 261
 262        if (!current->mm)
 263                return -ENODEV;
 264
 265        ret = vaddr_get_pfn(vaddr, prot, pfn_base);
 266        if (ret)
 267                return ret;
 268
 269        if (is_invalid_reserved_pfn(*pfn_base))
 270                return 1;
 271
 272        if (!lock_cap && current->mm->locked_vm + 1 > limit) {
 273                put_pfn(*pfn_base, prot);
 274                pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 275                        limit << PAGE_SHIFT);
 276                return -ENOMEM;
 277        }
 278
 279        if (unlikely(disable_hugepages)) {
 280                vfio_lock_acct(1);
 281                return 1;
 282        }
 283
 284        /* Lock all the consecutive pages from pfn_base */
 285        for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 286                unsigned long pfn = 0;
 287
 288                ret = vaddr_get_pfn(vaddr, prot, &pfn);
 289                if (ret)
 290                        break;
 291
 292                if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
 293                        put_pfn(pfn, prot);
 294                        break;
 295                }
 296
 297                if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
 298                        put_pfn(pfn, prot);
 299                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 300                                __func__, limit << PAGE_SHIFT);
 301                        break;
 302                }
 303        }
 304
 305        vfio_lock_acct(i);
 306
 307        return i;
 308}
 309
 310static long vfio_unpin_pages(unsigned long pfn, long npage,
 311                             int prot, bool do_accounting)
 312{
 313        unsigned long unlocked = 0;
 314        long i;
 315
 316        for (i = 0; i < npage; i++)
 317                unlocked += put_pfn(pfn++, prot);
 318
 319        if (do_accounting)
 320                vfio_lock_acct(-unlocked);
 321
 322        return unlocked;
 323}
 324
 325static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 326                            dma_addr_t iova, size_t *size)
 327{
 328        dma_addr_t start = iova, end = iova + *size;
 329        long unlocked = 0;
 330
 331        while (iova < end) {
 332                size_t unmapped;
 333                phys_addr_t phys;
 334
 335                /*
 336                 * We use the IOMMU to track the physical address.  This
 337                 * saves us from having a lot more entries in our mapping
 338                 * tree.  The downside is that we don't track the size
 339                 * used to do the mapping.  We request unmap of a single
 340                 * page, but expect IOMMUs that support large pages to
 341                 * unmap a larger chunk.
 342                 */
 343                phys = iommu_iova_to_phys(iommu->domain, iova);
 344                if (WARN_ON(!phys)) {
 345                        iova += PAGE_SIZE;
 346                        continue;
 347                }
 348
 349                unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 350                if (!unmapped)
 351                        break;
 352
 353                unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
 354                                             unmapped >> PAGE_SHIFT,
 355                                             dma->prot, false);
 356                iova += unmapped;
 357        }
 358
 359        vfio_lock_acct(-unlocked);
 360
 361        *size = iova - start;
 362
 363        return 0;
 364}
 365
 366static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
 367                                   size_t *size, struct vfio_dma *dma)
 368{
 369        size_t offset, overlap, tmp;
 370        struct vfio_dma *split;
 371        int ret;
 372
 373        if (!*size)
 374                return 0;
 375
 376        /*
 377         * Existing dma region is completely covered, unmap all.  This is
 378         * the likely case since userspace tends to map and unmap buffers
 379         * in one shot rather than multiple mappings within a buffer.
 380         */
 381        if (likely(start <= dma->iova &&
 382                   start + *size >= dma->iova + dma->size)) {
 383                *size = dma->size;
 384                ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
 385                if (ret)
 386                        return ret;
 387
 388                /*
 389                 * Did we remove more than we have?  Should never happen
 390                 * since a vfio_dma is contiguous in iova and vaddr.
 391                 */
 392                WARN_ON(*size != dma->size);
 393
 394                vfio_remove_dma(iommu, dma);
 395                kfree(dma);
 396                return 0;
 397        }
 398
 399        /* Overlap low address of existing range */
 400        if (start <= dma->iova) {
 401                overlap = start + *size - dma->iova;
 402                ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
 403                if (ret)
 404                        return ret;
 405
 406                vfio_remove_dma(iommu, dma);
 407
 408                /*
 409                 * Check, we may have removed to whole vfio_dma.  If not
 410                 * fixup and re-insert.
 411                 */
 412                if (overlap < dma->size) {
 413                        dma->iova += overlap;
 414                        dma->vaddr += overlap;
 415                        dma->size -= overlap;
 416                        vfio_insert_dma(iommu, dma);
 417                } else
 418                        kfree(dma);
 419
 420                *size = overlap;
 421                return 0;
 422        }
 423
 424        /* Overlap high address of existing range */
 425        if (start + *size >= dma->iova + dma->size) {
 426                offset = start - dma->iova;
 427                overlap = dma->size - offset;
 428
 429                ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
 430                if (ret)
 431                        return ret;
 432
 433                dma->size -= overlap;
 434                *size = overlap;
 435                return 0;
 436        }
 437
 438        /* Split existing */
 439
 440        /*
 441         * Allocate our tracking structure early even though it may not
 442         * be used.  An Allocation failure later loses track of pages and
 443         * is more difficult to unwind.
 444         */
 445        split = kzalloc(sizeof(*split), GFP_KERNEL);
 446        if (!split)
 447                return -ENOMEM;
 448
 449        offset = start - dma->iova;
 450
 451        ret = vfio_unmap_unpin(iommu, dma, start, size);
 452        if (ret || !*size) {
 453                kfree(split);
 454                return ret;
 455        }
 456
 457        tmp = dma->size;
 458
 459        /* Resize the lower vfio_dma in place, before the below insert */
 460        dma->size = offset;
 461
 462        /* Insert new for remainder, assuming it didn't all get unmapped */
 463        if (likely(offset + *size < tmp)) {
 464                split->size = tmp - offset - *size;
 465                split->iova = dma->iova + offset + *size;
 466                split->vaddr = dma->vaddr + offset + *size;
 467                split->prot = dma->prot;
 468                vfio_insert_dma(iommu, split);
 469        } else
 470                kfree(split);
 471
 472        return 0;
 473}
 474
 475static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 476                             struct vfio_iommu_type1_dma_unmap *unmap)
 477{
 478        uint64_t mask;
 479        struct vfio_dma *dma;
 480        size_t unmapped = 0, size;
 481        int ret = 0;
 482
 483        mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 484
 485        if (unmap->iova & mask)
 486                return -EINVAL;
 487        if (!unmap->size || unmap->size & mask)
 488                return -EINVAL;
 489
 490        WARN_ON(mask & PAGE_MASK);
 491
 492        mutex_lock(&iommu->lock);
 493
 494        while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 495                size = unmap->size;
 496                ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
 497                if (ret || !size)
 498                        break;
 499                unmapped += size;
 500        }
 501
 502        mutex_unlock(&iommu->lock);
 503
 504        /*
 505         * We may unmap more than requested, update the unmap struct so
 506         * userspace can know.
 507         */
 508        unmap->size = unmapped;
 509
 510        return ret;
 511}
 512
 513/*
 514 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 515 * to a region that previously mapped smaller pages.  This should be fixed
 516 * soon, so this is just a temporary workaround to break mappings down into
 517 * PAGE_SIZE.  Better to map smaller pages than nothing.
 518 */
 519static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
 520                          unsigned long pfn, long npage, int prot)
 521{
 522        long i;
 523        int ret;
 524
 525        for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 526                ret = iommu_map(iommu->domain, iova,
 527                                (phys_addr_t)pfn << PAGE_SHIFT,
 528                                PAGE_SIZE, prot);
 529                if (ret)
 530                        break;
 531        }
 532
 533        for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
 534                iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 535
 536        return ret;
 537}
 538
 539static int vfio_dma_do_map(struct vfio_iommu *iommu,
 540                           struct vfio_iommu_type1_dma_map *map)
 541{
 542        dma_addr_t end, iova;
 543        unsigned long vaddr = map->vaddr;
 544        size_t size = map->size;
 545        long npage;
 546        int ret = 0, prot = 0;
 547        uint64_t mask;
 548
 549        end = map->iova + map->size;
 550
 551        mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 552
 553        /* READ/WRITE from device perspective */
 554        if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
 555                prot |= IOMMU_WRITE;
 556        if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 557                prot |= IOMMU_READ;
 558
 559        if (!prot)
 560                return -EINVAL; /* No READ/WRITE? */
 561
 562        if (iommu->cache)
 563                prot |= IOMMU_CACHE;
 564
 565        if (vaddr & mask)
 566                return -EINVAL;
 567        if (map->iova & mask)
 568                return -EINVAL;
 569        if (!map->size || map->size & mask)
 570                return -EINVAL;
 571
 572        WARN_ON(mask & PAGE_MASK);
 573
 574        /* Don't allow IOVA wrap */
 575        if (end && end < map->iova)
 576                return -EINVAL;
 577
 578        /* Don't allow virtual address wrap */
 579        if (vaddr + map->size && vaddr + map->size < vaddr)
 580                return -EINVAL;
 581
 582        mutex_lock(&iommu->lock);
 583
 584        if (vfio_find_dma(iommu, map->iova, map->size)) {
 585                mutex_unlock(&iommu->lock);
 586                return -EEXIST;
 587        }
 588
 589        for (iova = map->iova; iova < end; iova += size, vaddr += size) {
 590                struct vfio_dma *dma = NULL;
 591                unsigned long pfn;
 592                long i;
 593
 594                /* Pin a contiguous chunk of memory */
 595                npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
 596                                       prot, &pfn);
 597                if (npage <= 0) {
 598                        WARN_ON(!npage);
 599                        ret = (int)npage;
 600                        break;
 601                }
 602
 603                /* Verify pages are not already mapped */
 604                for (i = 0; i < npage; i++) {
 605                        if (iommu_iova_to_phys(iommu->domain,
 606                                               iova + (i << PAGE_SHIFT))) {
 607                                vfio_unpin_pages(pfn, npage, prot, true);
 608                                ret = -EBUSY;
 609                                break;
 610                        }
 611                }
 612
 613                ret = iommu_map(iommu->domain, iova,
 614                                (phys_addr_t)pfn << PAGE_SHIFT,
 615                                npage << PAGE_SHIFT, prot);
 616                if (ret) {
 617                        if (ret != -EBUSY ||
 618                            map_try_harder(iommu, iova, pfn, npage, prot)) {
 619                                vfio_unpin_pages(pfn, npage, prot, true);
 620                                break;
 621                        }
 622                }
 623
 624                size = npage << PAGE_SHIFT;
 625
 626                /*
 627                 * Check if we abut a region below - nothing below 0.
 628                 * This is the most likely case when mapping chunks of
 629                 * physically contiguous regions within a virtual address
 630                 * range.  Update the abutting entry in place since iova
 631                 * doesn't change.
 632                 */
 633                if (likely(iova)) {
 634                        struct vfio_dma *tmp;
 635                        tmp = vfio_find_dma(iommu, iova - 1, 1);
 636                        if (tmp && tmp->prot == prot &&
 637                            tmp->vaddr + tmp->size == vaddr) {
 638                                tmp->size += size;
 639                                iova = tmp->iova;
 640                                size = tmp->size;
 641                                vaddr = tmp->vaddr;
 642                                dma = tmp;
 643                        }
 644                }
 645
 646                /*
 647                 * Check if we abut a region above - nothing above ~0 + 1.
 648                 * If we abut above and below, remove and free.  If only
 649                 * abut above, remove, modify, reinsert.
 650                 */
 651                if (likely(iova + size)) {
 652                        struct vfio_dma *tmp;
 653                        tmp = vfio_find_dma(iommu, iova + size, 1);
 654                        if (tmp && tmp->prot == prot &&
 655                            tmp->vaddr == vaddr + size) {
 656                                vfio_remove_dma(iommu, tmp);
 657                                if (dma) {
 658                                        dma->size += tmp->size;
 659                                        kfree(tmp);
 660                                } else {
 661                                        size += tmp->size;
 662                                        tmp->size = size;
 663                                        tmp->iova = iova;
 664                                        tmp->vaddr = vaddr;
 665                                        vfio_insert_dma(iommu, tmp);
 666                                        dma = tmp;
 667                                }
 668                        }
 669                }
 670
 671                if (!dma) {
 672                        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 673                        if (!dma) {
 674                                iommu_unmap(iommu->domain, iova, size);
 675                                vfio_unpin_pages(pfn, npage, prot, true);
 676                                ret = -ENOMEM;
 677                                break;
 678                        }
 679
 680                        dma->size = size;
 681                        dma->iova = iova;
 682                        dma->vaddr = vaddr;
 683                        dma->prot = prot;
 684                        vfio_insert_dma(iommu, dma);
 685                }
 686        }
 687
 688        if (ret) {
 689                struct vfio_dma *tmp;
 690                iova = map->iova;
 691                size = map->size;
 692                while ((tmp = vfio_find_dma(iommu, iova, size))) {
 693                        int r = vfio_remove_dma_overlap(iommu, iova,
 694                                                        &size, tmp);
 695                        if (WARN_ON(r || !size))
 696                                break;
 697                }
 698        }
 699
 700        mutex_unlock(&iommu->lock);
 701        return ret;
 702}
 703
 704static int vfio_iommu_type1_attach_group(void *iommu_data,
 705                                         struct iommu_group *iommu_group)
 706{
 707        struct vfio_iommu *iommu = iommu_data;
 708        struct vfio_group *group, *tmp;
 709        int ret;
 710
 711        group = kzalloc(sizeof(*group), GFP_KERNEL);
 712        if (!group)
 713                return -ENOMEM;
 714
 715        mutex_lock(&iommu->lock);
 716
 717        list_for_each_entry(tmp, &iommu->group_list, next) {
 718                if (tmp->iommu_group == iommu_group) {
 719                        mutex_unlock(&iommu->lock);
 720                        kfree(group);
 721                        return -EINVAL;
 722                }
 723        }
 724
 725        /*
 726         * TODO: Domain have capabilities that might change as we add
 727         * groups (see iommu->cache, currently never set).  Check for
 728         * them and potentially disallow groups to be attached when it
 729         * would change capabilities (ugh).
 730         */
 731        ret = iommu_attach_group(iommu->domain, iommu_group);
 732        if (ret) {
 733                mutex_unlock(&iommu->lock);
 734                kfree(group);
 735                return ret;
 736        }
 737
 738        group->iommu_group = iommu_group;
 739        list_add(&group->next, &iommu->group_list);
 740
 741        mutex_unlock(&iommu->lock);
 742
 743        return 0;
 744}
 745
 746static void vfio_iommu_type1_detach_group(void *iommu_data,
 747                                          struct iommu_group *iommu_group)
 748{
 749        struct vfio_iommu *iommu = iommu_data;
 750        struct vfio_group *group;
 751
 752        mutex_lock(&iommu->lock);
 753
 754        list_for_each_entry(group, &iommu->group_list, next) {
 755                if (group->iommu_group == iommu_group) {
 756                        iommu_detach_group(iommu->domain, iommu_group);
 757                        list_del(&group->next);
 758                        kfree(group);
 759                        break;
 760                }
 761        }
 762
 763        mutex_unlock(&iommu->lock);
 764}
 765
 766static void *vfio_iommu_type1_open(unsigned long arg)
 767{
 768        struct vfio_iommu *iommu;
 769
 770        if (arg != VFIO_TYPE1_IOMMU)
 771                return ERR_PTR(-EINVAL);
 772
 773        iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 774        if (!iommu)
 775                return ERR_PTR(-ENOMEM);
 776
 777        INIT_LIST_HEAD(&iommu->group_list);
 778        iommu->dma_list = RB_ROOT;
 779        mutex_init(&iommu->lock);
 780
 781        /*
 782         * Wish we didn't have to know about bus_type here.
 783         */
 784        iommu->domain = iommu_domain_alloc(&pci_bus_type);
 785        if (!iommu->domain) {
 786                kfree(iommu);
 787                return ERR_PTR(-EIO);
 788        }
 789
 790        /*
 791         * Wish we could specify required capabilities rather than create
 792         * a domain, see what comes out and hope it doesn't change along
 793         * the way.  Fortunately we know interrupt remapping is global for
 794         * our iommus.
 795         */
 796        if (!allow_unsafe_interrupts &&
 797            !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
 798                pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
 799                       __func__);
 800                iommu_domain_free(iommu->domain);
 801                kfree(iommu);
 802                return ERR_PTR(-EPERM);
 803        }
 804
 805        return iommu;
 806}
 807
 808static void vfio_iommu_type1_release(void *iommu_data)
 809{
 810        struct vfio_iommu *iommu = iommu_data;
 811        struct vfio_group *group, *group_tmp;
 812        struct rb_node *node;
 813
 814        list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
 815                iommu_detach_group(iommu->domain, group->iommu_group);
 816                list_del(&group->next);
 817                kfree(group);
 818        }
 819
 820        while ((node = rb_first(&iommu->dma_list))) {
 821                struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 822                size_t size = dma->size;
 823                vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
 824                if (WARN_ON(!size))
 825                        break;
 826        }
 827
 828        iommu_domain_free(iommu->domain);
 829        iommu->domain = NULL;
 830        kfree(iommu);
 831}
 832
 833static long vfio_iommu_type1_ioctl(void *iommu_data,
 834                                   unsigned int cmd, unsigned long arg)
 835{
 836        struct vfio_iommu *iommu = iommu_data;
 837        unsigned long minsz;
 838
 839        if (cmd == VFIO_CHECK_EXTENSION) {
 840                switch (arg) {
 841                case VFIO_TYPE1_IOMMU:
 842                        return 1;
 843                default:
 844                        return 0;
 845                }
 846        } else if (cmd == VFIO_IOMMU_GET_INFO) {
 847                struct vfio_iommu_type1_info info;
 848
 849                minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 850
 851                if (copy_from_user(&info, (void __user *)arg, minsz))
 852                        return -EFAULT;
 853
 854                if (info.argsz < minsz)
 855                        return -EINVAL;
 856
 857                info.flags = 0;
 858
 859                info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
 860
 861                return copy_to_user((void __user *)arg, &info, minsz);
 862
 863        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
 864                struct vfio_iommu_type1_dma_map map;
 865                uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
 866                                VFIO_DMA_MAP_FLAG_WRITE;
 867
 868                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 869
 870                if (copy_from_user(&map, (void __user *)arg, minsz))
 871                        return -EFAULT;
 872
 873                if (map.argsz < minsz || map.flags & ~mask)
 874                        return -EINVAL;
 875
 876                return vfio_dma_do_map(iommu, &map);
 877
 878        } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 879                struct vfio_iommu_type1_dma_unmap unmap;
 880                long ret;
 881
 882                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 883
 884                if (copy_from_user(&unmap, (void __user *)arg, minsz))
 885                        return -EFAULT;
 886
 887                if (unmap.argsz < minsz || unmap.flags)
 888                        return -EINVAL;
 889
 890                ret = vfio_dma_do_unmap(iommu, &unmap);
 891                if (ret)
 892                        return ret;
 893
 894                return copy_to_user((void __user *)arg, &unmap, minsz);
 895        }
 896
 897        return -ENOTTY;
 898}
 899
 900static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 901        .name           = "vfio-iommu-type1",
 902        .owner          = THIS_MODULE,
 903        .open           = vfio_iommu_type1_open,
 904        .release        = vfio_iommu_type1_release,
 905        .ioctl          = vfio_iommu_type1_ioctl,
 906        .attach_group   = vfio_iommu_type1_attach_group,
 907        .detach_group   = vfio_iommu_type1_detach_group,
 908};
 909
 910static int __init vfio_iommu_type1_init(void)
 911{
 912        if (!iommu_present(&pci_bus_type))
 913                return -ENODEV;
 914
 915        return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
 916}
 917
 918static void __exit vfio_iommu_type1_cleanup(void)
 919{
 920        vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
 921}
 922
 923module_init(vfio_iommu_type1_init);
 924module_exit(vfio_iommu_type1_cleanup);
 925
 926MODULE_VERSION(DRIVER_VERSION);
 927MODULE_LICENSE("GPL v2");
 928MODULE_AUTHOR(DRIVER_AUTHOR);
 929MODULE_DESCRIPTION(DRIVER_DESC);
 930