linux/drivers/vfio/vfio_iommu_type1.c
   1/*
   2 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   3 *
   4 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5 *     Author: Alex Williamson <alex.williamson@redhat.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio:
  12 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13 * Author: Tom Lyon, pugs@cisco.com
  14 *
  15 * We arbitrarily define a Type1 IOMMU as one matching the below code.
  16 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
   17 * VT-d, but that makes it harder to re-use, as theoretically anyone
  18 * implementing a similar IOMMU could make use of this.  We expect the
  19 * IOMMU to support the IOMMU API and have few to no restrictions around
  20 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  21 * optimized for relatively static mappings of a userspace process with
   22 * userspace pages pinned into memory.  We also assume devices and IOMMU
  23 * domains are PCI based as the IOMMU API is still centered around a
  24 * device/bus interface rather than a group interface.
  25 */
  26
  27#include <linux/compat.h>
  28#include <linux/device.h>
  29#include <linux/fs.h>
  30#include <linux/iommu.h>
  31#include <linux/module.h>
  32#include <linux/mm.h>
  33#include <linux/rbtree.h>
  34#include <linux/sched.h>
  35#include <linux/slab.h>
  36#include <linux/uaccess.h>
  37#include <linux/vfio.h>
  38#include <linux/workqueue.h>
  39
  40#define DRIVER_VERSION  "0.2"
  41#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  42#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  43
  44static bool allow_unsafe_interrupts;
  45module_param_named(allow_unsafe_interrupts,
  46                   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
  47MODULE_PARM_DESC(allow_unsafe_interrupts,
   48                 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");
  49
  50static bool disable_hugepages;
  51module_param_named(disable_hugepages,
  52                   disable_hugepages, bool, S_IRUGO | S_IWUSR);
  53MODULE_PARM_DESC(disable_hugepages,
  54                 "Disable VFIO IOMMU support for IOMMU hugepages.");
  55
  56struct vfio_iommu {
  57        struct list_head        domain_list;
  58        struct mutex            lock;
  59        struct rb_root          dma_list;
  60        bool                    v2;
  61        bool                    nesting;
  62};
  63
  64struct vfio_domain {
  65        struct iommu_domain     *domain;
  66        struct list_head        next;
  67        struct list_head        group_list;
  68        int                     prot;           /* IOMMU_CACHE */
  69        bool                    fgsp;           /* Fine-grained super pages */
  70};
  71
  72struct vfio_dma {
  73        struct rb_node          node;
  74        dma_addr_t              iova;           /* Device address */
  75        unsigned long           vaddr;          /* Process virtual addr */
  76        size_t                  size;           /* Map size (bytes) */
  77        int                     prot;           /* IOMMU_READ/WRITE */
  78};
  79
  80struct vfio_group {
  81        struct iommu_group      *iommu_group;
  82        struct list_head        next;
  83};
  84
  85/*
  86 * This code handles mapping and unmapping of user data buffers
  87 * into DMA'ble space using the IOMMU
  88 */
  89
  90static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
  91                                      dma_addr_t start, size_t size)
  92{
  93        struct rb_node *node = iommu->dma_list.rb_node;
  94
  95        while (node) {
  96                struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
  97
  98                if (start + size <= dma->iova)
  99                        node = node->rb_left;
 100                else if (start >= dma->iova + dma->size)
 101                        node = node->rb_right;
 102                else
 103                        return dma;
 104        }
 105
 106        return NULL;
 107}
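
/*
 * Illustration (not part of the driver logic): the lookup above treats
 * [start, start + size) as an interval and returns any tracked vfio_dma
 * that overlaps it.  With size == 0, as used by the v2 bisection checks
 * in vfio_dma_do_unmap(), only a mapping whose interior contains @start
 * is a hit; a mapping that begins exactly at @start is not.  Assuming a
 * single tracked mapping [0x100000, 0x300000):
 *
 *	vfio_find_dma(iommu, 0x100000, 0x1000)  -> returns the mapping
 *	vfio_find_dma(iommu, 0x101000, 0)       -> returns the mapping
 *	vfio_find_dma(iommu, 0x100000, 0)       -> returns NULL
 */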
 108
 109static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 110{
 111        struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 112        struct vfio_dma *dma;
 113
 114        while (*link) {
 115                parent = *link;
 116                dma = rb_entry(parent, struct vfio_dma, node);
 117
 118                if (new->iova + new->size <= dma->iova)
 119                        link = &(*link)->rb_left;
 120                else
 121                        link = &(*link)->rb_right;
 122        }
 123
 124        rb_link_node(&new->node, parent, link);
 125        rb_insert_color(&new->node, &iommu->dma_list);
 126}
 127
 128static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 129{
 130        rb_erase(&old->node, &iommu->dma_list);
 131}
 132
 133struct vwork {
 134        struct mm_struct        *mm;
 135        long                    npage;
 136        struct work_struct      work;
 137};
 138
 139/* delayed decrement/increment for locked_vm */
 140static void vfio_lock_acct_bg(struct work_struct *work)
 141{
 142        struct vwork *vwork = container_of(work, struct vwork, work);
 143        struct mm_struct *mm;
 144
 145        mm = vwork->mm;
 146        down_write(&mm->mmap_sem);
 147        mm->locked_vm += vwork->npage;
 148        up_write(&mm->mmap_sem);
 149        mmput(mm);
 150        kfree(vwork);
 151}
 152
 153static void vfio_lock_acct(long npage)
 154{
 155        struct vwork *vwork;
 156        struct mm_struct *mm;
 157
 158        if (!current->mm || !npage)
 159                return; /* process exited or nothing to do */
 160
 161        if (down_write_trylock(&current->mm->mmap_sem)) {
 162                current->mm->locked_vm += npage;
 163                up_write(&current->mm->mmap_sem);
 164                return;
 165        }
 166
 167        /*
  168         * Couldn't get the mmap_sem lock, so we must set up to update
  169         * mm->locked_vm later.  If locked_vm were atomic, we
  170         * wouldn't need this silliness.
 171         */
 172        vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 173        if (!vwork)
 174                return;
 175        mm = get_task_mm(current);
 176        if (!mm) {
 177                kfree(vwork);
 178                return;
 179        }
 180        INIT_WORK(&vwork->work, vfio_lock_acct_bg);
 181        vwork->mm = mm;
 182        vwork->npage = npage;
 183        schedule_work(&vwork->work);
 184}
 185
 186/*
 187 * Some mappings aren't backed by a struct page, for example an mmap'd
 188 * MMIO range for our own or another device.  These use a different
 189 * pfn conversion and shouldn't be tracked as locked pages.
 190 */
 191static bool is_invalid_reserved_pfn(unsigned long pfn)
 192{
 193        if (pfn_valid(pfn)) {
 194                bool reserved;
 195                struct page *tail = pfn_to_page(pfn);
 196                struct page *head = compound_head(tail);
 197                reserved = !!(PageReserved(head));
 198                if (head != tail) {
 199                        /*
 200                         * "head" is not a dangling pointer
 201                         * (compound_head takes care of that)
 202                         * but the hugepage may have been split
 203                         * from under us (and we may not hold a
 204                         * reference count on the head page so it can
 205                         * be reused before we run PageReferenced), so
 206                         * we've to check PageTail before returning
  207                         * we have to check PageTail before returning
 208                         */
 209                        smp_rmb();
 210                        if (PageTail(tail))
 211                                return reserved;
 212                }
 213                return PageReserved(tail);
 214        }
 215
 216        return true;
 217}
 218
 219static int put_pfn(unsigned long pfn, int prot)
 220{
 221        if (!is_invalid_reserved_pfn(pfn)) {
 222                struct page *page = pfn_to_page(pfn);
 223                if (prot & IOMMU_WRITE)
 224                        SetPageDirty(page);
 225                put_page(page);
 226                return 1;
 227        }
 228        return 0;
 229}
 230
 231static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 232{
 233        struct page *page[1];
 234        struct vm_area_struct *vma;
 235        int ret = -EFAULT;
 236
 237        if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
 238                *pfn = page_to_pfn(page[0]);
 239                return 0;
 240        }
 241
 242        down_read(&current->mm->mmap_sem);
 243
 244        vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
 245
 246        if (vma && vma->vm_flags & VM_PFNMAP) {
 247                *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 248                if (is_invalid_reserved_pfn(*pfn))
 249                        ret = 0;
 250        }
 251
 252        up_read(&current->mm->mmap_sem);
 253
 254        return ret;
 255}
 256
 257/*
 258 * Attempt to pin pages.  We really don't want to track all the pfns and
 259 * the iommu can only map chunks of consecutive pfns anyway, so get the
 260 * first page and all consecutive pages with the same locking.
 261 */
 262static long vfio_pin_pages(unsigned long vaddr, long npage,
 263                           int prot, unsigned long *pfn_base)
 264{
 265        unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 266        bool lock_cap = capable(CAP_IPC_LOCK);
 267        long ret, i;
 268        bool rsvd;
 269
 270        if (!current->mm)
 271                return -ENODEV;
 272
 273        ret = vaddr_get_pfn(vaddr, prot, pfn_base);
 274        if (ret)
 275                return ret;
 276
 277        rsvd = is_invalid_reserved_pfn(*pfn_base);
 278
 279        if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
 280                put_pfn(*pfn_base, prot);
 281                pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 282                        limit << PAGE_SHIFT);
 283                return -ENOMEM;
 284        }
 285
 286        if (unlikely(disable_hugepages)) {
 287                if (!rsvd)
 288                        vfio_lock_acct(1);
 289                return 1;
 290        }
 291
 292        /* Lock all the consecutive pages from pfn_base */
 293        for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 294                unsigned long pfn = 0;
 295
 296                ret = vaddr_get_pfn(vaddr, prot, &pfn);
 297                if (ret)
 298                        break;
 299
 300                if (pfn != *pfn_base + i ||
 301                    rsvd != is_invalid_reserved_pfn(pfn)) {
 302                        put_pfn(pfn, prot);
 303                        break;
 304                }
 305
 306                if (!rsvd && !lock_cap &&
 307                    current->mm->locked_vm + i + 1 > limit) {
 308                        put_pfn(pfn, prot);
 309                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 310                                __func__, limit << PAGE_SHIFT);
 311                        break;
 312                }
 313        }
 314
 315        if (!rsvd)
 316                vfio_lock_acct(i);
 317
 318        return i;
 319}
 320
 321static long vfio_unpin_pages(unsigned long pfn, long npage,
 322                             int prot, bool do_accounting)
 323{
 324        unsigned long unlocked = 0;
 325        long i;
 326
 327        for (i = 0; i < npage; i++)
 328                unlocked += put_pfn(pfn++, prot);
 329
 330        if (do_accounting)
 331                vfio_lock_acct(-unlocked);
 332
 333        return unlocked;
 334}
 335
 336static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 337{
 338        dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 339        struct vfio_domain *domain, *d;
 340        long unlocked = 0;
 341
 342        if (!dma->size)
 343                return;
 344        /*
 345         * We use the IOMMU to track the physical addresses, otherwise we'd
 346         * need a much more complicated tracking system.  Unfortunately that
 347         * means we need to use one of the iommu domains to figure out the
 348         * pfns to unpin.  The rest need to be unmapped in advance so we have
 349         * no iommu translations remaining when the pages are unpinned.
 350         */
 351        domain = d = list_first_entry(&iommu->domain_list,
 352                                      struct vfio_domain, next);
 353
 354        list_for_each_entry_continue(d, &iommu->domain_list, next) {
 355                iommu_unmap(d->domain, dma->iova, dma->size);
 356                cond_resched();
 357        }
 358
 359        while (iova < end) {
 360                size_t unmapped, len;
 361                phys_addr_t phys, next;
 362
 363                phys = iommu_iova_to_phys(domain->domain, iova);
 364                if (WARN_ON(!phys)) {
 365                        iova += PAGE_SIZE;
 366                        continue;
 367                }
 368
 369                /*
 370                 * To optimize for fewer iommu_unmap() calls, each of which
 371                 * may require hardware cache flushing, try to find the
 372                 * largest contiguous physical memory chunk to unmap.
 373                 */
 374                for (len = PAGE_SIZE;
 375                     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
 376                        next = iommu_iova_to_phys(domain->domain, iova + len);
 377                        if (next != phys + len)
 378                                break;
 379                }
 380
 381                unmapped = iommu_unmap(domain->domain, iova, len);
 382                if (WARN_ON(!unmapped))
 383                        break;
 384
 385                unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
 386                                             unmapped >> PAGE_SHIFT,
 387                                             dma->prot, false);
 388                iova += unmapped;
 389
 390                cond_resched();
 391        }
 392
 393        vfio_lock_acct(-unlocked);
 394}
 395
 396static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 397{
 398        vfio_unmap_unpin(iommu, dma);
 399        vfio_unlink_dma(iommu, dma);
 400        kfree(dma);
 401}
 402
 403static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 404{
 405        struct vfio_domain *domain;
 406        unsigned long bitmap = ULONG_MAX;
 407
 408        mutex_lock(&iommu->lock);
 409        list_for_each_entry(domain, &iommu->domain_list, next)
 410                bitmap &= domain->domain->pgsize_bitmap;
 411        mutex_unlock(&iommu->lock);
 412
 413        /*
  414         * If the IOMMU supports page sizes smaller than PAGE_SIZE, we
  415         * pretend PAGE_SIZE is supported and hide the sub-PAGE_SIZE sizes.
  416         * That way the user will be able to map/unmap buffers whose size
  417         * and start address are aligned with PAGE_SIZE.  The pinning code
  418         * uses that granularity while the IOMMU driver can use the
  419         * sub-PAGE_SIZE sizes to map the buffer.
 420         */
 421        if (bitmap & ~PAGE_MASK) {
 422                bitmap &= PAGE_MASK;
 423                bitmap |= PAGE_SIZE;
 424        }
 425
 426        return bitmap;
 427}
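
/*
 * Illustration (assumed values): with a 4KiB PAGE_SIZE and an IOMMU that
 * also supports 2MiB and 1GiB pages, the bitmap returned above is
 * 0x40201000.  The map and unmap paths below only care about the smallest
 * supported size and derive an alignment mask from it:
 *
 *	mask = ((uint64_t)1 << __ffs(0x40201000)) - 1;	== 0xfff
 *
 * so any iova, vaddr or size that is not 4KiB aligned is rejected with
 * -EINVAL.
 */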
 428
 429static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 430                             struct vfio_iommu_type1_dma_unmap *unmap)
 431{
 432        uint64_t mask;
 433        struct vfio_dma *dma;
 434        size_t unmapped = 0;
 435        int ret = 0;
 436
 437        mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
 438
 439        if (unmap->iova & mask)
 440                return -EINVAL;
 441        if (!unmap->size || unmap->size & mask)
 442                return -EINVAL;
 443
 444        WARN_ON(mask & PAGE_MASK);
 445
 446        mutex_lock(&iommu->lock);
 447
 448        /*
 449         * vfio-iommu-type1 (v1) - User mappings were coalesced together to
 450         * avoid tracking individual mappings.  This means that the granularity
 451         * of the original mapping was lost and the user was allowed to attempt
 452         * to unmap any range.  Depending on the contiguousness of physical
 453         * memory and page sizes supported by the IOMMU, arbitrary unmaps may
 454         * or may not have worked.  We only guaranteed unmap granularity
 455         * matching the original mapping; even though it was untracked here,
 456         * the original mappings are reflected in IOMMU mappings.  This
  457         * resulted in a couple of unusual behaviors.  First, if a range cannot
  458         * be unmapped, e.g. a set of 4k pages that was mapped as a
  459         * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
  460         * a zero-sized unmap.  Also, if an unmap request overlaps the first
 461         * address of a hugepage, the IOMMU will unmap the entire hugepage.
 462         * This also returns success and the returned unmap size reflects the
 463         * actual size unmapped.
 464         *
 465         * We attempt to maintain compatibility with this "v1" interface, but
 466         * we take control out of the hands of the IOMMU.  Therefore, an unmap
 467         * request offset from the beginning of the original mapping will
  468         * return success with a zero-sized unmap.  And an unmap request covering
  469         * the first iova of the mapping will unmap the entire range.
 470         *
 471         * The v2 version of this interface intends to be more deterministic.
 472         * Unmap requests must fully cover previous mappings.  Multiple
  473         * mappings may still be unmapped by specifying large ranges, but there
 474         * must not be any previous mappings bisected by the range.  An error
 475         * will be returned if these conditions are not met.  The v2 interface
 476         * will only return success and a size of zero if there were no
 477         * mappings within the range.
 478         */
 479        if (iommu->v2) {
 480                dma = vfio_find_dma(iommu, unmap->iova, 0);
 481                if (dma && dma->iova != unmap->iova) {
 482                        ret = -EINVAL;
 483                        goto unlock;
 484                }
 485                dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
 486                if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
 487                        ret = -EINVAL;
 488                        goto unlock;
 489                }
 490        }
 491
 492        while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 493                if (!iommu->v2 && unmap->iova > dma->iova)
 494                        break;
 495                unmapped += dma->size;
 496                vfio_remove_dma(iommu, dma);
 497        }
 498
 499unlock:
 500        mutex_unlock(&iommu->lock);
 501
 502        /* Report how much was unmapped */
 503        unmap->size = unmapped;
 504
 505        return ret;
 506}
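
/*
 * Illustration of the v2 semantics above (all values are arbitrary): after
 * a single 2MB mapping at iova 0x100000, i.e. [0x100000, 0x300000):
 *
 *	unmap { .iova = 0x100000, .size = 0x001000 }  -> -EINVAL (bisects it)
 *	unmap { .iova = 0x180000, .size = 0x180000 }  -> -EINVAL (bisects it)
 *	unmap { .iova = 0x100000, .size = 0x200000 }  -> 0, unmap->size = 0x200000
 *	unmap { .iova = 0x000000, .size = 0x400000 }  -> 0, unmap->size = 0x200000
 *
 * With the v1 interface the first request would instead remove the whole
 * mapping (it covers the first iova), and the second would return success
 * with a zero-sized unmap.
 */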
 507
 508/*
 509 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 510 * to a region that previously mapped smaller pages.  This should be fixed
 511 * soon, so this is just a temporary workaround to break mappings down into
 512 * PAGE_SIZE.  Better to map smaller pages than nothing.
 513 */
 514static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 515                          unsigned long pfn, long npage, int prot)
 516{
 517        long i;
 518        int ret = 0;
 519
 520        for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 521                ret = iommu_map(domain->domain, iova,
 522                                (phys_addr_t)pfn << PAGE_SHIFT,
 523                                PAGE_SIZE, prot | domain->prot);
 524                if (ret)
 525                        break;
 526        }
 527
 528        for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
 529                iommu_unmap(domain->domain, iova, PAGE_SIZE);
 530
 531        return ret;
 532}
 533
 534static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 535                          unsigned long pfn, long npage, int prot)
 536{
 537        struct vfio_domain *d;
 538        int ret;
 539
 540        list_for_each_entry(d, &iommu->domain_list, next) {
 541                ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
 542                                npage << PAGE_SHIFT, prot | d->prot);
 543                if (ret) {
 544                        if (ret != -EBUSY ||
 545                            map_try_harder(d, iova, pfn, npage, prot))
 546                                goto unwind;
 547                }
 548
 549                cond_resched();
 550        }
 551
 552        return 0;
 553
 554unwind:
 555        list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
 556                iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
 557
 558        return ret;
 559}
 560
 561static int vfio_dma_do_map(struct vfio_iommu *iommu,
 562                           struct vfio_iommu_type1_dma_map *map)
 563{
 564        dma_addr_t iova = map->iova;
 565        unsigned long vaddr = map->vaddr;
 566        size_t size = map->size;
 567        long npage;
 568        int ret = 0, prot = 0;
 569        uint64_t mask;
 570        struct vfio_dma *dma;
 571        unsigned long pfn;
 572
 573        /* Verify that none of our __u64 fields overflow */
 574        if (map->size != size || map->vaddr != vaddr || map->iova != iova)
 575                return -EINVAL;
 576
 577        mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
 578
 579        WARN_ON(mask & PAGE_MASK);
 580
 581        /* READ/WRITE from device perspective */
 582        if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
 583                prot |= IOMMU_WRITE;
 584        if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 585                prot |= IOMMU_READ;
 586
 587        if (!prot || !size || (size | iova | vaddr) & mask)
 588                return -EINVAL;
 589
 590        /* Don't allow IOVA or virtual address wrap */
 591        if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
 592                return -EINVAL;
 593
 594        mutex_lock(&iommu->lock);
 595
 596        if (vfio_find_dma(iommu, iova, size)) {
 597                mutex_unlock(&iommu->lock);
 598                return -EEXIST;
 599        }
 600
 601        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 602        if (!dma) {
 603                mutex_unlock(&iommu->lock);
 604                return -ENOMEM;
 605        }
 606
 607        dma->iova = iova;
 608        dma->vaddr = vaddr;
 609        dma->prot = prot;
 610
 611        /* Insert zero-sized and grow as we map chunks of it */
 612        vfio_link_dma(iommu, dma);
 613
 614        while (size) {
 615                /* Pin a contiguous chunk of memory */
 616                npage = vfio_pin_pages(vaddr + dma->size,
 617                                       size >> PAGE_SHIFT, prot, &pfn);
 618                if (npage <= 0) {
 619                        WARN_ON(!npage);
 620                        ret = (int)npage;
 621                        break;
 622                }
 623
 624                /* Map it! */
 625                ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
 626                if (ret) {
 627                        vfio_unpin_pages(pfn, npage, prot, true);
 628                        break;
 629                }
 630
 631                size -= npage << PAGE_SHIFT;
 632                dma->size += npage << PAGE_SHIFT;
 633        }
 634
 635        if (ret)
 636                vfio_remove_dma(iommu, dma);
 637
 638        mutex_unlock(&iommu->lock);
 639        return ret;
 640}
 641
 642static int vfio_bus_type(struct device *dev, void *data)
 643{
 644        struct bus_type **bus = data;
 645
 646        if (*bus && *bus != dev->bus)
 647                return -EINVAL;
 648
 649        *bus = dev->bus;
 650
 651        return 0;
 652}
 653
 654static int vfio_iommu_replay(struct vfio_iommu *iommu,
 655                             struct vfio_domain *domain)
 656{
 657        struct vfio_domain *d;
 658        struct rb_node *n;
 659        int ret;
 660
 661        /* Arbitrarily pick the first domain in the list for lookups */
 662        d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
 663        n = rb_first(&iommu->dma_list);
 664
 665        /* If there's not a domain, there better not be any mappings */
 666        if (WARN_ON(n && !d))
 667                return -EINVAL;
 668
 669        for (; n; n = rb_next(n)) {
 670                struct vfio_dma *dma;
 671                dma_addr_t iova;
 672
 673                dma = rb_entry(n, struct vfio_dma, node);
 674                iova = dma->iova;
 675
 676                while (iova < dma->iova + dma->size) {
 677                        phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
 678                        size_t size;
 679
 680                        if (WARN_ON(!phys)) {
 681                                iova += PAGE_SIZE;
 682                                continue;
 683                        }
 684
 685                        size = PAGE_SIZE;
 686
 687                        while (iova + size < dma->iova + dma->size &&
 688                               phys + size == iommu_iova_to_phys(d->domain,
 689                                                                 iova + size))
 690                                size += PAGE_SIZE;
 691
 692                        ret = iommu_map(domain->domain, iova, phys,
 693                                        size, dma->prot | domain->prot);
 694                        if (ret)
 695                                return ret;
 696
 697                        iova += size;
 698                }
 699        }
 700
 701        return 0;
 702}
 703
 704/*
 705 * We change our unmap behavior slightly depending on whether the IOMMU
 706 * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
 707 * for practically any contiguous power-of-two mapping we give it.  This means
 708 * we don't need to look for contiguous chunks ourselves to make unmapping
 709 * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
 710 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
 711 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
 712 * hugetlbfs is in use.
 713 */
 714static void vfio_test_domain_fgsp(struct vfio_domain *domain)
 715{
 716        struct page *pages;
 717        int ret, order = get_order(PAGE_SIZE * 2);
 718
 719        pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 720        if (!pages)
 721                return;
 722
 723        ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
 724                        IOMMU_READ | IOMMU_WRITE | domain->prot);
 725        if (!ret) {
 726                size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
 727
 728                if (unmapped == PAGE_SIZE)
 729                        iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
 730                else
 731                        domain->fgsp = true;
 732        }
 733
 734        __free_pages(pages, order);
 735}
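
/*
 * Illustrative outcome of the probe above, assuming a 4KiB PAGE_SIZE: an
 * IOMMU with fine-grained superpages (e.g. AMD-Vi) maps the two pages with
 * a single 8KiB entry, so the partial iommu_unmap() of 4KiB has to tear
 * down the whole entry and returns 8192, setting domain->fgsp.  An IOMMU
 * with discrete superpage sizes (e.g. VT-d) installs two 4KiB entries, the
 * partial unmap returns 4096, the second page is unmapped explicitly and
 * fgsp stays false.
 */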
 736
 737static int vfio_iommu_type1_attach_group(void *iommu_data,
 738                                         struct iommu_group *iommu_group)
 739{
 740        struct vfio_iommu *iommu = iommu_data;
 741        struct vfio_group *group, *g;
 742        struct vfio_domain *domain, *d;
 743        struct bus_type *bus = NULL;
 744        int ret;
 745
 746        mutex_lock(&iommu->lock);
 747
 748        list_for_each_entry(d, &iommu->domain_list, next) {
 749                list_for_each_entry(g, &d->group_list, next) {
 750                        if (g->iommu_group != iommu_group)
 751                                continue;
 752
 753                        mutex_unlock(&iommu->lock);
 754                        return -EINVAL;
 755                }
 756        }
 757
 758        group = kzalloc(sizeof(*group), GFP_KERNEL);
 759        domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 760        if (!group || !domain) {
 761                ret = -ENOMEM;
 762                goto out_free;
 763        }
 764
 765        group->iommu_group = iommu_group;
 766
 767        /* Determine bus_type in order to allocate a domain */
 768        ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
 769        if (ret)
 770                goto out_free;
 771
 772        domain->domain = iommu_domain_alloc(bus);
 773        if (!domain->domain) {
 774                ret = -EIO;
 775                goto out_free;
 776        }
 777
 778        if (iommu->nesting) {
 779                int attr = 1;
 780
 781                ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
 782                                            &attr);
 783                if (ret)
 784                        goto out_domain;
 785        }
 786
 787        ret = iommu_attach_group(domain->domain, iommu_group);
 788        if (ret)
 789                goto out_domain;
 790
 791        INIT_LIST_HEAD(&domain->group_list);
 792        list_add(&group->next, &domain->group_list);
 793
 794        if (!allow_unsafe_interrupts &&
 795            !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) {
 796                pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
 797                       __func__);
 798                ret = -EPERM;
 799                goto out_detach;
 800        }
 801
 802        if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
 803                domain->prot |= IOMMU_CACHE;
 804
 805        /*
 806         * Try to match an existing compatible domain.  We don't want to
 807         * preclude an IOMMU driver supporting multiple bus_types and being
 808         * able to include different bus_types in the same IOMMU domain, so
 809         * we test whether the domains use the same iommu_ops rather than
 810         * testing if they're on the same bus_type.
 811         */
 812        list_for_each_entry(d, &iommu->domain_list, next) {
 813                if (d->domain->ops == domain->domain->ops &&
 814                    d->prot == domain->prot) {
 815                        iommu_detach_group(domain->domain, iommu_group);
 816                        if (!iommu_attach_group(d->domain, iommu_group)) {
 817                                list_add(&group->next, &d->group_list);
 818                                iommu_domain_free(domain->domain);
 819                                kfree(domain);
 820                                mutex_unlock(&iommu->lock);
 821                                return 0;
 822                        }
 823
 824                        ret = iommu_attach_group(domain->domain, iommu_group);
 825                        if (ret)
 826                                goto out_domain;
 827                }
 828        }
 829
 830        vfio_test_domain_fgsp(domain);
 831
 832        /* replay mappings on new domains */
 833        ret = vfio_iommu_replay(iommu, domain);
 834        if (ret)
 835                goto out_detach;
 836
 837        list_add(&domain->next, &iommu->domain_list);
 838
 839        mutex_unlock(&iommu->lock);
 840
 841        return 0;
 842
 843out_detach:
 844        iommu_detach_group(domain->domain, iommu_group);
 845out_domain:
 846        iommu_domain_free(domain->domain);
 847out_free:
 848        kfree(domain);
 849        kfree(group);
 850        mutex_unlock(&iommu->lock);
 851        return ret;
 852}
 853
 854static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
 855{
 856        struct rb_node *node;
 857
 858        while ((node = rb_first(&iommu->dma_list)))
 859                vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
 860}
 861
 862static void vfio_iommu_type1_detach_group(void *iommu_data,
 863                                          struct iommu_group *iommu_group)
 864{
 865        struct vfio_iommu *iommu = iommu_data;
 866        struct vfio_domain *domain;
 867        struct vfio_group *group;
 868
 869        mutex_lock(&iommu->lock);
 870
 871        list_for_each_entry(domain, &iommu->domain_list, next) {
 872                list_for_each_entry(group, &domain->group_list, next) {
 873                        if (group->iommu_group != iommu_group)
 874                                continue;
 875
 876                        iommu_detach_group(domain->domain, iommu_group);
 877                        list_del(&group->next);
 878                        kfree(group);
 879                        /*
  880                         * Group ownership provides privilege; if the group
 881                         * list is empty, the domain goes away.  If it's the
 882                         * last domain, then all the mappings go away too.
 883                         */
 884                        if (list_empty(&domain->group_list)) {
 885                                if (list_is_singular(&iommu->domain_list))
 886                                        vfio_iommu_unmap_unpin_all(iommu);
 887                                iommu_domain_free(domain->domain);
 888                                list_del(&domain->next);
 889                                kfree(domain);
 890                        }
 891                        goto done;
 892                }
 893        }
 894
 895done:
 896        mutex_unlock(&iommu->lock);
 897}
 898
 899static void *vfio_iommu_type1_open(unsigned long arg)
 900{
 901        struct vfio_iommu *iommu;
 902
 903        iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 904        if (!iommu)
 905                return ERR_PTR(-ENOMEM);
 906
 907        switch (arg) {
 908        case VFIO_TYPE1_IOMMU:
 909                break;
 910        case VFIO_TYPE1_NESTING_IOMMU:
  911                iommu->nesting = true;  /* fall through */
 912        case VFIO_TYPE1v2_IOMMU:
 913                iommu->v2 = true;
 914                break;
 915        default:
 916                kfree(iommu);
 917                return ERR_PTR(-EINVAL);
 918        }
 919
 920        INIT_LIST_HEAD(&iommu->domain_list);
 921        iommu->dma_list = RB_ROOT;
 922        mutex_init(&iommu->lock);
 923
 924        return iommu;
 925}
 926
 927static void vfio_iommu_type1_release(void *iommu_data)
 928{
 929        struct vfio_iommu *iommu = iommu_data;
 930        struct vfio_domain *domain, *domain_tmp;
 931        struct vfio_group *group, *group_tmp;
 932
 933        vfio_iommu_unmap_unpin_all(iommu);
 934
 935        list_for_each_entry_safe(domain, domain_tmp,
 936                                 &iommu->domain_list, next) {
 937                list_for_each_entry_safe(group, group_tmp,
 938                                         &domain->group_list, next) {
 939                        iommu_detach_group(domain->domain, group->iommu_group);
 940                        list_del(&group->next);
 941                        kfree(group);
 942                }
 943                iommu_domain_free(domain->domain);
 944                list_del(&domain->next);
 945                kfree(domain);
 946        }
 947
 948        kfree(iommu);
 949}
 950
 951static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
 952{
 953        struct vfio_domain *domain;
 954        int ret = 1;
 955
 956        mutex_lock(&iommu->lock);
 957        list_for_each_entry(domain, &iommu->domain_list, next) {
 958                if (!(domain->prot & IOMMU_CACHE)) {
 959                        ret = 0;
 960                        break;
 961                }
 962        }
 963        mutex_unlock(&iommu->lock);
 964
 965        return ret;
 966}
 967
 968static long vfio_iommu_type1_ioctl(void *iommu_data,
 969                                   unsigned int cmd, unsigned long arg)
 970{
 971        struct vfio_iommu *iommu = iommu_data;
 972        unsigned long minsz;
 973
 974        if (cmd == VFIO_CHECK_EXTENSION) {
 975                switch (arg) {
 976                case VFIO_TYPE1_IOMMU:
 977                case VFIO_TYPE1v2_IOMMU:
 978                case VFIO_TYPE1_NESTING_IOMMU:
 979                        return 1;
 980                case VFIO_DMA_CC_IOMMU:
 981                        if (!iommu)
 982                                return 0;
 983                        return vfio_domains_have_iommu_cache(iommu);
 984                default:
 985                        return 0;
 986                }
 987        } else if (cmd == VFIO_IOMMU_GET_INFO) {
 988                struct vfio_iommu_type1_info info;
 989
 990                minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 991
 992                if (copy_from_user(&info, (void __user *)arg, minsz))
 993                        return -EFAULT;
 994
 995                if (info.argsz < minsz)
 996                        return -EINVAL;
 997
 998                info.flags = VFIO_IOMMU_INFO_PGSIZES;
 999
1000                info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
1001
1002                return copy_to_user((void __user *)arg, &info, minsz) ?
1003                        -EFAULT : 0;
1004
1005        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
1006                struct vfio_iommu_type1_dma_map map;
1007                uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
1008                                VFIO_DMA_MAP_FLAG_WRITE;
1009
1010                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
1011
1012                if (copy_from_user(&map, (void __user *)arg, minsz))
1013                        return -EFAULT;
1014
1015                if (map.argsz < minsz || map.flags & ~mask)
1016                        return -EINVAL;
1017
1018                return vfio_dma_do_map(iommu, &map);
1019
1020        } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
1021                struct vfio_iommu_type1_dma_unmap unmap;
1022                long ret;
1023
1024                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
1025
1026                if (copy_from_user(&unmap, (void __user *)arg, minsz))
1027                        return -EFAULT;
1028
1029                if (unmap.argsz < minsz || unmap.flags)
1030                        return -EINVAL;
1031
1032                ret = vfio_dma_do_unmap(iommu, &unmap);
1033                if (ret)
1034                        return ret;
1035
1036                return copy_to_user((void __user *)arg, &unmap, minsz) ?
1037                        -EFAULT : 0;
1038        }
1039
1040        return -ENOTTY;
1041}
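
/*
 * Illustrative userspace sketch (not part of this driver) of the ioctl flow
 * handled above.  The group number, iova and buffer size are arbitrary and
 * error checking is omitted:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);	// the device's group
 *
 *	ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *	ioctl(container, VFIO_IOMMU_GET_INFO, &info);	// info.iova_pgsizes
 *
 *	void *buf = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(unsigned long)buf,
 *		.iova  = 0,
 *		.size  = 1024 * 1024,
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0,
 *		.size  = 1024 * 1024,
 *	};
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */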
1042
1043static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
1044        .name           = "vfio-iommu-type1",
1045        .owner          = THIS_MODULE,
1046        .open           = vfio_iommu_type1_open,
1047        .release        = vfio_iommu_type1_release,
1048        .ioctl          = vfio_iommu_type1_ioctl,
1049        .attach_group   = vfio_iommu_type1_attach_group,
1050        .detach_group   = vfio_iommu_type1_detach_group,
1051};
1052
1053static int __init vfio_iommu_type1_init(void)
1054{
1055        return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
1056}
1057
1058static void __exit vfio_iommu_type1_cleanup(void)
1059{
1060        vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
1061}
1062
1063module_init(vfio_iommu_type1_init);
1064module_exit(vfio_iommu_type1_cleanup);
1065
1066MODULE_VERSION(DRIVER_VERSION);
1067MODULE_LICENSE("GPL v2");
1068MODULE_AUTHOR(DRIVER_AUTHOR);
1069MODULE_DESCRIPTION(DRIVER_DESC);
1070