linux/drivers/vfio/vfio_iommu_spapr_tce.c
   1/*
   2 * VFIO: IOMMU DMA mapping support for TCE on POWER
   3 *
   4 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   5 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio_iommu_type1.c:
  12 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  13 *     Author: Alex Williamson <alex.williamson@redhat.com>
  14 */
  15
  16#include <linux/module.h>
  17#include <linux/pci.h>
  18#include <linux/slab.h>
  19#include <linux/uaccess.h>
  20#include <linux/err.h>
  21#include <linux/vfio.h>
  22#include <linux/vmalloc.h>
  23#include <linux/sched/mm.h>
  24#include <linux/sched/signal.h>
  25
  26#include <asm/iommu.h>
  27#include <asm/tce.h>
  28#include <asm/mmu_context.h>
  29
  30#define DRIVER_VERSION  "0.1"
  31#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  32#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  33
  34static void tce_iommu_detach_group(void *iommu_data,
  35                struct iommu_group *iommu_group);
  36
  37static long try_increment_locked_vm(struct mm_struct *mm, long npages)
  38{
  39        long ret = 0, locked, lock_limit;
  40
  41        if (WARN_ON_ONCE(!mm))
  42                return -EPERM;
  43
  44        if (!npages)
  45                return 0;
  46
  47        down_write(&mm->mmap_sem);
  48        locked = mm->locked_vm + npages;
  49        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
  50        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
  51                ret = -ENOMEM;
  52        else
  53                mm->locked_vm += npages;
  54
  55        pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
  56                        npages << PAGE_SHIFT,
  57                        mm->locked_vm << PAGE_SHIFT,
  58                        rlimit(RLIMIT_MEMLOCK),
  59                        ret ? " - exceeded" : "");
  60
  61        up_write(&mm->mmap_sem);
  62
  63        return ret;
  64}
  65
  66static void decrement_locked_vm(struct mm_struct *mm, long npages)
  67{
  68        if (!mm || !npages)
  69                return;
  70
  71        down_write(&mm->mmap_sem);
  72        if (WARN_ON_ONCE(npages > mm->locked_vm))
  73                npages = mm->locked_vm;
  74        mm->locked_vm -= npages;
  75        pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
  76                        npages << PAGE_SHIFT,
  77                        mm->locked_vm << PAGE_SHIFT,
  78                        rlimit(RLIMIT_MEMLOCK));
  79        up_write(&mm->mmap_sem);
  80}
  81
  82/*
  83 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  84 *
  85 * This code handles mapping and unmapping of user data buffers
  86 * into DMA'ble space using the IOMMU
  87 */
  88
  89struct tce_iommu_group {
  90        struct list_head next;
  91        struct iommu_group *grp;
  92};
  93
  94/*
  95 * A container needs to remember which preregistered regions it has
  96 * referenced so it can do proper cleanup at userspace process exit.
  97 */
  98struct tce_iommu_prereg {
  99        struct list_head next;
 100        struct mm_iommu_table_group_mem_t *mem;
 101};
 102
 103/*
 104 * The container descriptor supports only a single group per container.
 105 * Required by the API as the container is not supplied with the IOMMU group
 106 * at the moment of initialization.
 107 */
 108struct tce_container {
 109        struct mutex lock;
 110        bool enabled;
 111        bool v2;
 112        bool def_window_pending;
 113        unsigned long locked_pages;
 114        struct mm_struct *mm;
 115        struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
 116        struct list_head group_list;
 117        struct list_head prereg_list;
 118};
 119
 120static long tce_iommu_mm_set(struct tce_container *container)
 121{
 122        if (container->mm) {
 123                if (container->mm == current->mm)
 124                        return 0;
 125                return -EPERM;
 126        }
 127        BUG_ON(!current->mm);
 128        container->mm = current->mm;
 129        atomic_inc(&container->mm->mm_count);
 130
 131        return 0;
 132}
 133
 134static long tce_iommu_prereg_free(struct tce_container *container,
 135                struct tce_iommu_prereg *tcemem)
 136{
 137        long ret;
 138
 139        ret = mm_iommu_put(container->mm, tcemem->mem);
 140        if (ret)
 141                return ret;
 142
 143        list_del(&tcemem->next);
 144        kfree(tcemem);
 145
 146        return 0;
 147}
 148
 149static long tce_iommu_unregister_pages(struct tce_container *container,
 150                __u64 vaddr, __u64 size)
 151{
 152        struct mm_iommu_table_group_mem_t *mem;
 153        struct tce_iommu_prereg *tcemem;
 154        bool found = false;
 155        long ret;
 156
 157        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 158                return -EINVAL;
 159
 160        mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 161        if (!mem)
 162                return -ENOENT;
 163
 164        list_for_each_entry(tcemem, &container->prereg_list, next) {
 165                if (tcemem->mem == mem) {
 166                        found = true;
 167                        break;
 168                }
 169        }
 170
 171        if (!found)
 172                ret = -ENOENT;
 173        else
 174                ret = tce_iommu_prereg_free(container, tcemem);
 175
 176        mm_iommu_put(container->mm, mem);
 177
 178        return ret;
 179}
 180
 181static long tce_iommu_register_pages(struct tce_container *container,
 182                __u64 vaddr, __u64 size)
 183{
 184        long ret = 0;
 185        struct mm_iommu_table_group_mem_t *mem = NULL;
 186        struct tce_iommu_prereg *tcemem;
 187        unsigned long entries = size >> PAGE_SHIFT;
 188
 189        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 190                        ((vaddr + size) < vaddr))
 191                return -EINVAL;
 192
 193        mem = mm_iommu_get(container->mm, vaddr, entries);
 194        if (mem) {
 195                list_for_each_entry(tcemem, &container->prereg_list, next) {
 196                        if (tcemem->mem == mem) {
 197                                ret = -EBUSY;
 198                                goto put_exit;
 199                        }
 200                }
 201        } else {
 202                ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
 203                if (ret)
 204                        return ret;
 205        }
 206
 207        tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 208        if (!tcemem) {
 209                ret = -ENOMEM;
 210                goto put_exit;
 211        }
 212
 213        tcemem->mem = mem;
 214        list_add(&tcemem->next, &container->prereg_list);
 215
 216        container->enabled = true;
 217
 218        return 0;
 219
 220put_exit:
 221        mm_iommu_put(container->mm, mem);
 222        return ret;
 223}
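
/*
 * Example (illustrative sketch, not part of this driver and not built here):
 * how the two preregistration paths above are typically driven from
 * userspace on a v2 container.  The container fd, the buffer and its size
 * are assumptions; the ioctl numbers and the structure layout come from the
 * VFIO uAPI in <linux/vfio.h>.  Both vaddr and size must be page aligned,
 * matching the checks in tce_iommu_register_pages().
 */
#if 0   /* illustrative userspace code */
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int prereg_example(int container, void *buf, unsigned long size)
{
        struct vfio_iommu_spapr_register_memory reg = {
                .argsz = sizeof(reg),
                .flags = 0,
                .vaddr = (__u64)(unsigned long)buf,
                .size = size,
        };

        /* Pins the memory and makes it usable as a DMA target */
        if (ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg))
                return -1;

        /* ... VFIO_IOMMU_MAP_DMA may now map within this region ... */

        /* Drops the reference taken by the registration above */
        return ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
}
#endif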
 224
 225static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
 226                unsigned int page_shift)
 227{
 228        struct page *page;
 229        unsigned long size = 0;
 230
 231        if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
 232                return size == (1UL << page_shift);
 233
 234        page = pfn_to_page(hpa >> PAGE_SHIFT);
 235        /*
 236         * Check that the TCE table granularity is not bigger than the size of
 237         * a page we just found. Otherwise the hardware can get access to
 238         * a bigger memory chunk than it should.
 239         */
 240        return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 241}
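
/*
 * Worked example of the check above, assuming a 64K kernel page size
 * (PAGE_SHIFT == 16): a window with it_page_shift == 24 (16MB IOMMU pages)
 * may only map memory backed by compound pages of order 8 or more (16MB
 * huge pages), since PAGE_SHIFT + compound_order >= page_shift must hold,
 * i.e. 16 + 8 >= 24.  Mapping plain 64K pages through such a window makes
 * the callers fail with -EPERM.
 */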
 242
 243static inline bool tce_groups_attached(struct tce_container *container)
 244{
 245        return !list_empty(&container->group_list);
 246}
 247
 248static long tce_iommu_find_table(struct tce_container *container,
 249                phys_addr_t ioba, struct iommu_table **ptbl)
 250{
 251        long i;
 252
 253        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 254                struct iommu_table *tbl = container->tables[i];
 255
 256                if (tbl) {
 257                        unsigned long entry = ioba >> tbl->it_page_shift;
 258                        unsigned long start = tbl->it_offset;
 259                        unsigned long end = start + tbl->it_size;
 260
 261                        if ((start <= entry) && (entry < end)) {
 262                                *ptbl = tbl;
 263                                return i;
 264                        }
 265                }
 266        }
 267
 268        return -1;
 269}
 270
 271static int tce_iommu_find_free_table(struct tce_container *container)
 272{
 273        int i;
 274
 275        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 276                if (!container->tables[i])
 277                        return i;
 278        }
 279
 280        return -ENOSPC;
 281}
 282
 283static int tce_iommu_enable(struct tce_container *container)
 284{
 285        int ret = 0;
 286        unsigned long locked;
 287        struct iommu_table_group *table_group;
 288        struct tce_iommu_group *tcegrp;
 289
 290        if (container->enabled)
 291                return -EBUSY;
 292
 293        /*
 294         * When userspace pages are mapped into the IOMMU, they are effectively
 295         * locked memory, so, theoretically, we need to update the accounting
 296         * of locked pages on each map and unmap.  On powerpc, however, the
 297         * map/unmap paths can be very hot and the accounting would kill
 298         * performance, especially since it would be difficult, if not
 299         * impossible, to handle the accounting in real mode only.
 300         *
 301         * To address that, rather than precisely accounting every page, we
 302         * instead account for a worst case on locked memory when the iommu is
 303         * enabled and disabled.  The worst case upper bound on locked memory
 304         * is the size of the whole iommu window, which is usually relatively
 305         * small (compared to total memory sizes) on POWER hardware.
 306         *
 307         * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
 308         * that would effectively kill the guest at random points, so it is much
 309         * better to enforce the limit based on the maximum the guest can map.
 310         *
 311         * Unfortunately, at the moment this counts whole tables, no matter how
 312         * much memory the guest has, i.e. for a 4GB guest and 4 IOMMU groups,
 313         * each with a 2GB DMA window, 8GB will be counted here. The reason for
 314         * this is that we cannot tell here the amount of RAM used by the guest
 315         * as this information is only available from KVM and VFIO is
 316         * KVM agnostic.
 317         *
 318         * So we do not allow enabling a container without a group attached
 319         * as there is no way to know how much we should increment
 320         * the locked_vm counter.
 321         */
 322        if (!tce_groups_attached(container))
 323                return -ENODEV;
 324
 325        tcegrp = list_first_entry(&container->group_list,
 326                        struct tce_iommu_group, next);
 327        table_group = iommu_group_get_iommudata(tcegrp->grp);
 328        if (!table_group)
 329                return -ENODEV;
 330
 331        if (!table_group->tce32_size)
 332                return -EPERM;
 333
 334        ret = tce_iommu_mm_set(container);
 335        if (ret)
 336                return ret;
 337
 338        locked = table_group->tce32_size >> PAGE_SHIFT;
 339        ret = try_increment_locked_vm(container->mm, locked);
 340        if (ret)
 341                return ret;
 342
 343        container->locked_pages = locked;
 344
 345        container->enabled = true;
 346
 347        return ret;
 348}
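
/*
 * Example (illustrative sketch, not part of this driver and not built here):
 * the enable path above as seen from userspace on a v1 container that is
 * already set up with one group attached (an assumption).  Enabling charges
 * the whole 32-bit window against RLIMIT_MEMLOCK up front, e.g. a 2GB
 * default window on a 64K-page kernel accounts 32768 locked pages.
 */
#if 0   /* illustrative userspace code */
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int enable_example(int container)
{
        struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };

        if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info))
                return -1;

        /* Accounts info.dma32_window_size >> PAGE_SHIFT locked pages */
        if (ioctl(container, VFIO_IOMMU_ENABLE))
                return -1;      /* e.g. RLIMIT_MEMLOCK is too small */

        /* ... VFIO_IOMMU_MAP_DMA / VFIO_IOMMU_UNMAP_DMA ... */

        /* Returns the locked memory accounted at enable time */
        return ioctl(container, VFIO_IOMMU_DISABLE);
}
#endif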
 349
 350static void tce_iommu_disable(struct tce_container *container)
 351{
 352        if (!container->enabled)
 353                return;
 354
 355        container->enabled = false;
 356
 357        BUG_ON(!container->mm);
 358        decrement_locked_vm(container->mm, container->locked_pages);
 359}
 360
 361static void *tce_iommu_open(unsigned long arg)
 362{
 363        struct tce_container *container;
 364
 365        if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 366                pr_err("tce_vfio: Wrong IOMMU type\n");
 367                return ERR_PTR(-EINVAL);
 368        }
 369
 370        container = kzalloc(sizeof(*container), GFP_KERNEL);
 371        if (!container)
 372                return ERR_PTR(-ENOMEM);
 373
 374        mutex_init(&container->lock);
 375        INIT_LIST_HEAD_RCU(&container->group_list);
 376        INIT_LIST_HEAD_RCU(&container->prereg_list);
 377
 378        container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 379
 380        return container;
 381}
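
/*
 * Example (illustrative sketch, not part of this driver and not built here):
 * how tce_iommu_open() ends up being called.  Userspace creates a container
 * from /dev/vfio/vfio, attaches an IOMMU group to it and then selects the
 * backend with VFIO_SET_IOMMU; the requested type is the "arg" checked
 * above.  The group path "/dev/vfio/42" and the missing error handling are
 * assumptions made for brevity.
 */
#if 0   /* illustrative userspace code */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int container_example(void)
{
        int container = open("/dev/vfio/vfio", O_RDWR);
        int group = open("/dev/vfio/42", O_RDWR);       /* hypothetical group */

        /* Answered by the VFIO_CHECK_EXTENSION case in tce_iommu_ioctl() */
        if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU))
                return -1;

        /* A group must be attached before an IOMMU backend can be set */
        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);

        /* Calls tce_iommu_open(VFIO_SPAPR_TCE_v2_IOMMU) via the VFIO core */
        ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);

        return container;
}
#endif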
 382
 383static int tce_iommu_clear(struct tce_container *container,
 384                struct iommu_table *tbl,
 385                unsigned long entry, unsigned long pages);
 386static void tce_iommu_free_table(struct tce_container *container,
 387                struct iommu_table *tbl);
 388
 389static void tce_iommu_release(void *iommu_data)
 390{
 391        struct tce_container *container = iommu_data;
 392        struct tce_iommu_group *tcegrp;
 393        struct tce_iommu_prereg *tcemem, *tmtmp;
 394        long i;
 395
 396        while (tce_groups_attached(container)) {
 397                tcegrp = list_first_entry(&container->group_list,
 398                                struct tce_iommu_group, next);
 399                tce_iommu_detach_group(iommu_data, tcegrp->grp);
 400        }
 401
 402        /*
 403         * If VFIO created a table, it was not disposed of
 404         * by tce_iommu_detach_group(), so do it now.
 405         */
 406        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 407                struct iommu_table *tbl = container->tables[i];
 408
 409                if (!tbl)
 410                        continue;
 411
 412                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 413                tce_iommu_free_table(container, tbl);
 414        }
 415
 416        list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
 417                WARN_ON(tce_iommu_prereg_free(container, tcemem));
 418
 419        tce_iommu_disable(container);
 420        if (container->mm)
 421                mmdrop(container->mm);
 422        mutex_destroy(&container->lock);
 423
 424        kfree(container);
 425}
 426
 427static void tce_iommu_unuse_page(struct tce_container *container,
 428                unsigned long hpa)
 429{
 430        struct page *page;
 431
 432        page = pfn_to_page(hpa >> PAGE_SHIFT);
 433        put_page(page);
 434}
 435
 436static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 437                unsigned long tce, unsigned long shift,
 438                unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 439{
 440        long ret = 0;
 441        struct mm_iommu_table_group_mem_t *mem;
 442
 443        mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
 444        if (!mem)
 445                return -EINVAL;
 446
 447        ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
 448        if (ret)
 449                return -EINVAL;
 450
 451        *pmem = mem;
 452
 453        return 0;
 454}
 455
 456static void tce_iommu_unuse_page_v2(struct tce_container *container,
 457                struct iommu_table *tbl, unsigned long entry)
 458{
 459        struct mm_iommu_table_group_mem_t *mem = NULL;
 460        int ret;
 461        unsigned long hpa = 0;
 462        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 463
 464        if (!pua)
 465                return;
 466
 467        ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
 468                        tbl->it_page_shift, &hpa, &mem);
 469        if (ret)
 470                pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
 471                                __func__, be64_to_cpu(*pua), entry, ret);
 472        if (mem)
 473                mm_iommu_mapped_dec(mem);
 474
 475        *pua = cpu_to_be64(0);
 476}
 477
 478static int tce_iommu_clear(struct tce_container *container,
 479                struct iommu_table *tbl,
 480                unsigned long entry, unsigned long pages)
 481{
 482        unsigned long oldhpa;
 483        long ret;
 484        enum dma_data_direction direction;
 485        unsigned long lastentry = entry + pages;
 486
 487        for ( ; entry < lastentry; ++entry) {
 488                if (tbl->it_indirect_levels && tbl->it_userspace) {
 489                        /*
 490                         * For multilevel tables, we can take a shortcut here
 491                         * and skip some TCEs as we know that the userspace
 492                         * address cache is a mirror of the real TCE table
 493                         * and if it is missing some indirect levels, then
 494                         * the hardware table does not have them allocated
 495                         * either and therefore does not require updating.
 496                         */
 497                        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
 498                                        entry);
 499                        if (!pua) {
 500                                /* align to level_size which is power of two */
 501                                entry |= tbl->it_level_size - 1;
 502                                continue;
 503                        }
 504                }
 505
 506                cond_resched();
 507
 508                direction = DMA_NONE;
 509                oldhpa = 0;
 510                ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
 511                                &direction);
 512                if (ret)
 513                        continue;
 514
 515                if (direction == DMA_NONE)
 516                        continue;
 517
 518                if (container->v2) {
 519                        tce_iommu_unuse_page_v2(container, tbl, entry);
 520                        continue;
 521                }
 522
 523                tce_iommu_unuse_page(container, oldhpa);
 524        }
 525
 526        return 0;
 527}
 528
 529static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 530{
 531        struct page *page = NULL;
 532        enum dma_data_direction direction = iommu_tce_direction(tce);
 533
 534        if (get_user_pages_fast(tce & PAGE_MASK, 1,
 535                        direction != DMA_TO_DEVICE, &page) != 1)
 536                return -EFAULT;
 537
 538        *hpa = __pa((unsigned long) page_address(page));
 539
 540        return 0;
 541}
 542
 543static long tce_iommu_build(struct tce_container *container,
 544                struct iommu_table *tbl,
 545                unsigned long entry, unsigned long tce, unsigned long pages,
 546                enum dma_data_direction direction)
 547{
 548        long i, ret = 0;
 549        unsigned long hpa;
 550        enum dma_data_direction dirtmp;
 551
 552        for (i = 0; i < pages; ++i) {
 553                unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 554
 555                ret = tce_iommu_use_page(tce, &hpa);
 556                if (ret)
 557                        break;
 558
 559                if (!tce_page_is_contained(container->mm, hpa,
 560                                tbl->it_page_shift)) {
 561                        ret = -EPERM;
 562                        break;
 563                }
 564
 565                hpa |= offset;
 566                dirtmp = direction;
 567                ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
 568                                &dirtmp);
 569                if (ret) {
 570                        tce_iommu_unuse_page(container, hpa);
 571                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 572                                        __func__, entry << tbl->it_page_shift,
 573                                        tce, ret);
 574                        break;
 575                }
 576
 577                if (dirtmp != DMA_NONE)
 578                        tce_iommu_unuse_page(container, hpa);
 579
 580                tce += IOMMU_PAGE_SIZE(tbl);
 581        }
 582
 583        if (ret)
 584                tce_iommu_clear(container, tbl, entry, i);
 585
 586        return ret;
 587}
 588
 589static long tce_iommu_build_v2(struct tce_container *container,
 590                struct iommu_table *tbl,
 591                unsigned long entry, unsigned long tce, unsigned long pages,
 592                enum dma_data_direction direction)
 593{
 594        long i, ret = 0;
 595        unsigned long hpa;
 596        enum dma_data_direction dirtmp;
 597
 598        for (i = 0; i < pages; ++i) {
 599                struct mm_iommu_table_group_mem_t *mem = NULL;
 600                __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
 601
 602                ret = tce_iommu_prereg_ua_to_hpa(container,
 603                                tce, tbl->it_page_shift, &hpa, &mem);
 604                if (ret)
 605                        break;
 606
 607                if (!tce_page_is_contained(container->mm, hpa,
 608                                tbl->it_page_shift)) {
 609                        ret = -EPERM;
 610                        break;
 611                }
 612
 613                /* Preserve offset within IOMMU page */
 614                hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 615                dirtmp = direction;
 616
 617                /* The registered region is being unregistered */
 618                if (mm_iommu_mapped_inc(mem))
 619                        break;
 620
 621                ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
 622                                &dirtmp);
 623                if (ret) {
 624                        /* dirtmp cannot be DMA_NONE here */
 625                        tce_iommu_unuse_page_v2(container, tbl, entry + i);
 626                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 627                                        __func__, entry << tbl->it_page_shift,
 628                                        tce, ret);
 629                        break;
 630                }
 631
 632                if (dirtmp != DMA_NONE)
 633                        tce_iommu_unuse_page_v2(container, tbl, entry + i);
 634
 635                *pua = cpu_to_be64(tce);
 636
 637                tce += IOMMU_PAGE_SIZE(tbl);
 638        }
 639
 640        if (ret)
 641                tce_iommu_clear(container, tbl, entry, i);
 642
 643        return ret;
 644}
 645
 646static long tce_iommu_create_table(struct tce_container *container,
 647                        struct iommu_table_group *table_group,
 648                        int num,
 649                        __u32 page_shift,
 650                        __u64 window_size,
 651                        __u32 levels,
 652                        struct iommu_table **ptbl)
 653{
 654        long ret, table_size;
 655
 656        table_size = table_group->ops->get_table_size(page_shift, window_size,
 657                        levels);
 658        if (!table_size)
 659                return -EINVAL;
 660
 661        ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
 662        if (ret)
 663                return ret;
 664
 665        ret = table_group->ops->create_table(table_group, num,
 666                        page_shift, window_size, levels, ptbl);
 667
 668        WARN_ON(!ret && !(*ptbl)->it_ops->free);
 669        WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 670
 671        return ret;
 672}
 673
 674static void tce_iommu_free_table(struct tce_container *container,
 675                struct iommu_table *tbl)
 676{
 677        unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 678
 679        iommu_tce_table_put(tbl);
 680        decrement_locked_vm(container->mm, pages);
 681}
 682
 683static long tce_iommu_create_window(struct tce_container *container,
 684                __u32 page_shift, __u64 window_size, __u32 levels,
 685                __u64 *start_addr)
 686{
 687        struct tce_iommu_group *tcegrp;
 688        struct iommu_table_group *table_group;
 689        struct iommu_table *tbl = NULL;
 690        long ret, num;
 691
 692        num = tce_iommu_find_free_table(container);
 693        if (num < 0)
 694                return num;
 695
 696        /* Get the first group for ops::create_table */
 697        tcegrp = list_first_entry(&container->group_list,
 698                        struct tce_iommu_group, next);
 699        table_group = iommu_group_get_iommudata(tcegrp->grp);
 700        if (!table_group)
 701                return -EFAULT;
 702
 703        if (!(table_group->pgsizes & (1ULL << page_shift)))
 704                return -EINVAL;
 705
 706        if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 707                        !table_group->ops->get_table_size ||
 708                        !table_group->ops->create_table)
 709                return -EPERM;
 710
 711        /* Create TCE table */
 712        ret = tce_iommu_create_table(container, table_group, num,
 713                        page_shift, window_size, levels, &tbl);
 714        if (ret)
 715                return ret;
 716
 717        BUG_ON(!tbl->it_ops->free);
 718
 719        /*
 720         * Program the table to every group.
 721         * Groups have been tested for compatibility at attach time.
 722         */
 723        list_for_each_entry(tcegrp, &container->group_list, next) {
 724                table_group = iommu_group_get_iommudata(tcegrp->grp);
 725
 726                ret = table_group->ops->set_window(table_group, num, tbl);
 727                if (ret)
 728                        goto unset_exit;
 729        }
 730
 731        container->tables[num] = tbl;
 732
 733        /* Return start address assigned by platform in create_table() */
 734        *start_addr = tbl->it_offset << tbl->it_page_shift;
 735
 736        return 0;
 737
 738unset_exit:
 739        list_for_each_entry(tcegrp, &container->group_list, next) {
 740                table_group = iommu_group_get_iommudata(tcegrp->grp);
 741                table_group->ops->unset_window(table_group, num);
 742        }
 743        tce_iommu_free_table(container, tbl);
 744
 745        return ret;
 746}
 747
 748static long tce_iommu_remove_window(struct tce_container *container,
 749                __u64 start_addr)
 750{
 751        struct iommu_table_group *table_group = NULL;
 752        struct iommu_table *tbl;
 753        struct tce_iommu_group *tcegrp;
 754        int num;
 755
 756        num = tce_iommu_find_table(container, start_addr, &tbl);
 757        if (num < 0)
 758                return -EINVAL;
 759
 760        BUG_ON(!tbl->it_size);
 761
 762        /* Detach groups from IOMMUs */
 763        list_for_each_entry(tcegrp, &container->group_list, next) {
 764                table_group = iommu_group_get_iommudata(tcegrp->grp);
 765
 766                /*
 767                 * SPAPR TCE IOMMU exposes the default DMA window to
 768                 * the guest via dma32_window_start/size of
 769                 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
 770                 * userspace to remove this window and some do not, so
 771                 * here we check for the platform capability.
 772                 */
 773                if (!table_group->ops || !table_group->ops->unset_window)
 774                        return -EPERM;
 775
 776                table_group->ops->unset_window(table_group, num);
 777        }
 778
 779        /* Free table */
 780        tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 781        tce_iommu_free_table(container, tbl);
 782        container->tables[num] = NULL;
 783
 784        return 0;
 785}
 786
 787static long tce_iommu_create_default_window(struct tce_container *container)
 788{
 789        long ret;
 790        __u64 start_addr = 0;
 791        struct tce_iommu_group *tcegrp;
 792        struct iommu_table_group *table_group;
 793
 794        if (!container->def_window_pending)
 795                return 0;
 796
 797        if (!tce_groups_attached(container))
 798                return -ENODEV;
 799
 800        tcegrp = list_first_entry(&container->group_list,
 801                        struct tce_iommu_group, next);
 802        table_group = iommu_group_get_iommudata(tcegrp->grp);
 803        if (!table_group)
 804                return -ENODEV;
 805
 806        ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 807                        table_group->tce32_size, 1, &start_addr);
 808        WARN_ON_ONCE(!ret && start_addr);
 809
 810        if (!ret)
 811                container->def_window_pending = false;
 812
 813        return ret;
 814}
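
/*
 * Example (illustrative sketch, not part of this driver and not built here):
 * creating and removing a dynamic DMA window (DDW) on a v2 container through
 * the ioctls handled below.  The page shift, window size and level count are
 * assumptions and have to be consistent with what
 * VFIO_IOMMU_SPAPR_TCE_GET_INFO reported in info.ddw for the attached group.
 */
#if 0   /* illustrative userspace code */
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int ddw_example(int container)
{
        struct vfio_iommu_spapr_tce_create create = {
                .argsz = sizeof(create),
                .page_shift = 16,               /* 64K IOMMU pages */
                .window_size = 1ULL << 30,      /* 1GB of IOMMU space */
                .levels = 1,
        };
        struct vfio_iommu_spapr_tce_remove remove = {
                .argsz = sizeof(remove),
        };

        /* On success the platform-assigned bus address is returned */
        if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
                return -1;

        /* ... map/unmap within [start_addr, start_addr + window_size) ... */

        remove.start_addr = create.start_addr;
        return ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
}
#endif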
 815
 816static long tce_iommu_ioctl(void *iommu_data,
 817                                 unsigned int cmd, unsigned long arg)
 818{
 819        struct tce_container *container = iommu_data;
 820        unsigned long minsz, ddwsz;
 821        long ret;
 822
 823        switch (cmd) {
 824        case VFIO_CHECK_EXTENSION:
 825                switch (arg) {
 826                case VFIO_SPAPR_TCE_IOMMU:
 827                case VFIO_SPAPR_TCE_v2_IOMMU:
 828                        ret = 1;
 829                        break;
 830                default:
 831                        ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
 832                        break;
 833                }
 834
 835                return (ret < 0) ? 0 : ret;
 836        }
 837
 838        /*
 839         * Sanity check to prevent one userspace process from manipulating
 840         * another process's mm.
 841         */
 842        BUG_ON(!container);
 843        if (container->mm && container->mm != current->mm)
 844                return -EPERM;
 845
 846        switch (cmd) {
 847        case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 848                struct vfio_iommu_spapr_tce_info info;
 849                struct tce_iommu_group *tcegrp;
 850                struct iommu_table_group *table_group;
 851
 852                if (!tce_groups_attached(container))
 853                        return -ENXIO;
 854
 855                tcegrp = list_first_entry(&container->group_list,
 856                                struct tce_iommu_group, next);
 857                table_group = iommu_group_get_iommudata(tcegrp->grp);
 858
 859                if (!table_group)
 860                        return -ENXIO;
 861
 862                minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 863                                dma32_window_size);
 864
 865                if (copy_from_user(&info, (void __user *)arg, minsz))
 866                        return -EFAULT;
 867
 868                if (info.argsz < minsz)
 869                        return -EINVAL;
 870
 871                info.dma32_window_start = table_group->tce32_start;
 872                info.dma32_window_size = table_group->tce32_size;
 873                info.flags = 0;
 874                memset(&info.ddw, 0, sizeof(info.ddw));
 875
 876                if (table_group->max_dynamic_windows_supported &&
 877                                container->v2) {
 878                        info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 879                        info.ddw.pgsizes = table_group->pgsizes;
 880                        info.ddw.max_dynamic_windows_supported =
 881                                table_group->max_dynamic_windows_supported;
 882                        info.ddw.levels = table_group->max_levels;
 883                }
 884
 885                ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 886
 887                if (info.argsz >= ddwsz)
 888                        minsz = ddwsz;
 889
 890                if (copy_to_user((void __user *)arg, &info, minsz))
 891                        return -EFAULT;
 892
 893                return 0;
 894        }
 895        case VFIO_IOMMU_MAP_DMA: {
 896                struct vfio_iommu_type1_dma_map param;
 897                struct iommu_table *tbl = NULL;
 898                long num;
 899                enum dma_data_direction direction;
 900
 901                if (!container->enabled)
 902                        return -EPERM;
 903
 904                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 905
 906                if (copy_from_user(&param, (void __user *)arg, minsz))
 907                        return -EFAULT;
 908
 909                if (param.argsz < minsz)
 910                        return -EINVAL;
 911
 912                if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 913                                VFIO_DMA_MAP_FLAG_WRITE))
 914                        return -EINVAL;
 915
 916                ret = tce_iommu_create_default_window(container);
 917                if (ret)
 918                        return ret;
 919
 920                num = tce_iommu_find_table(container, param.iova, &tbl);
 921                if (num < 0)
 922                        return -ENXIO;
 923
 924                if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 925                                (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 926                        return -EINVAL;
 927
 928                /* iova is checked by the IOMMU API */
 929                if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 930                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 931                                direction = DMA_BIDIRECTIONAL;
 932                        else
 933                                direction = DMA_TO_DEVICE;
 934                } else {
 935                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 936                                direction = DMA_FROM_DEVICE;
 937                        else
 938                                return -EINVAL;
 939                }
 940
 941                ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 942                if (ret)
 943                        return ret;
 944
 945                if (container->v2)
 946                        ret = tce_iommu_build_v2(container, tbl,
 947                                        param.iova >> tbl->it_page_shift,
 948                                        param.vaddr,
 949                                        param.size >> tbl->it_page_shift,
 950                                        direction);
 951                else
 952                        ret = tce_iommu_build(container, tbl,
 953                                        param.iova >> tbl->it_page_shift,
 954                                        param.vaddr,
 955                                        param.size >> tbl->it_page_shift,
 956                                        direction);
 957
 958                iommu_flush_tce(tbl);
 959
 960                return ret;
 961        }
 962        case VFIO_IOMMU_UNMAP_DMA: {
 963                struct vfio_iommu_type1_dma_unmap param;
 964                struct iommu_table *tbl = NULL;
 965                long num;
 966
 967                if (!container->enabled)
 968                        return -EPERM;
 969
 970                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 971                                size);
 972
 973                if (copy_from_user(&param, (void __user *)arg, minsz))
 974                        return -EFAULT;
 975
 976                if (param.argsz < minsz)
 977                        return -EINVAL;
 978
 979                /* No flags are supported for now */
 980                if (param.flags)
 981                        return -EINVAL;
 982
 983                ret = tce_iommu_create_default_window(container);
 984                if (ret)
 985                        return ret;
 986
 987                num = tce_iommu_find_table(container, param.iova, &tbl);
 988                if (num < 0)
 989                        return -ENXIO;
 990
 991                if (param.size & ~IOMMU_PAGE_MASK(tbl))
 992                        return -EINVAL;
 993
 994                ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
 995                                param.size >> tbl->it_page_shift);
 996                if (ret)
 997                        return ret;
 998
 999                ret = tce_iommu_clear(container, tbl,
1000                                param.iova >> tbl->it_page_shift,
1001                                param.size >> tbl->it_page_shift);
1002                iommu_flush_tce(tbl);
1003
1004                return ret;
1005        }
1006        case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
1007                struct vfio_iommu_spapr_register_memory param;
1008
1009                if (!container->v2)
1010                        break;
1011
1012                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1013                                size);
1014
1015                ret = tce_iommu_mm_set(container);
1016                if (ret)
1017                        return ret;
1018
1019                if (copy_from_user(&param, (void __user *)arg, minsz))
1020                        return -EFAULT;
1021
1022                if (param.argsz < minsz)
1023                        return -EINVAL;
1024
1025                /* No flags are supported for now */
1026                if (param.flags)
1027                        return -EINVAL;
1028
1029                mutex_lock(&container->lock);
1030                ret = tce_iommu_register_pages(container, param.vaddr,
1031                                param.size);
1032                mutex_unlock(&container->lock);
1033
1034                return ret;
1035        }
1036        case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
1037                struct vfio_iommu_spapr_register_memory param;
1038
1039                if (!container->v2)
1040                        break;
1041
1042                if (!container->mm)
1043                        return -EPERM;
1044
1045                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1046                                size);
1047
1048                if (copy_from_user(&param, (void __user *)arg, minsz))
1049                        return -EFAULT;
1050
1051                if (param.argsz < minsz)
1052                        return -EINVAL;
1053
1054                /* No flags are supported for now */
1055                if (param.flags)
1056                        return -EINVAL;
1057
1058                mutex_lock(&container->lock);
1059                ret = tce_iommu_unregister_pages(container, param.vaddr,
1060                                param.size);
1061                mutex_unlock(&container->lock);
1062
1063                return ret;
1064        }
1065        case VFIO_IOMMU_ENABLE:
1066                if (container->v2)
1067                        break;
1068
1069                mutex_lock(&container->lock);
1070                ret = tce_iommu_enable(container);
1071                mutex_unlock(&container->lock);
1072                return ret;
1073
1074
1075        case VFIO_IOMMU_DISABLE:
1076                if (container->v2)
1077                        break;
1078
1079                mutex_lock(&container->lock);
1080                tce_iommu_disable(container);
1081                mutex_unlock(&container->lock);
1082                return 0;
1083
1084        case VFIO_EEH_PE_OP: {
1085                struct tce_iommu_group *tcegrp;
1086
1087                ret = 0;
1088                list_for_each_entry(tcegrp, &container->group_list, next) {
1089                        ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
1090                                        cmd, arg);
1091                        if (ret)
1092                                return ret;
1093                }
1094                return ret;
1095        }
1096
1097        case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1098                struct vfio_iommu_spapr_tce_create create;
1099
1100                if (!container->v2)
1101                        break;
1102
1103                ret = tce_iommu_mm_set(container);
1104                if (ret)
1105                        return ret;
1106
1107                if (!tce_groups_attached(container))
1108                        return -ENXIO;
1109
1110                minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1111                                start_addr);
1112
1113                if (copy_from_user(&create, (void __user *)arg, minsz))
1114                        return -EFAULT;
1115
1116                if (create.argsz < minsz)
1117                        return -EINVAL;
1118
1119                if (create.flags)
1120                        return -EINVAL;
1121
1122                mutex_lock(&container->lock);
1123
1124                ret = tce_iommu_create_default_window(container);
1125                if (!ret)
1126                        ret = tce_iommu_create_window(container,
1127                                        create.page_shift,
1128                                        create.window_size, create.levels,
1129                                        &create.start_addr);
1130
1131                mutex_unlock(&container->lock);
1132
1133                if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1134                        ret = -EFAULT;
1135
1136                return ret;
1137        }
1138        case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1139                struct vfio_iommu_spapr_tce_remove remove;
1140
1141                if (!container->v2)
1142                        break;
1143
1144                ret = tce_iommu_mm_set(container);
1145                if (ret)
1146                        return ret;
1147
1148                if (!tce_groups_attached(container))
1149                        return -ENXIO;
1150
1151                minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1152                                start_addr);
1153
1154                if (copy_from_user(&remove, (void __user *)arg, minsz))
1155                        return -EFAULT;
1156
1157                if (remove.argsz < minsz)
1158                        return -EINVAL;
1159
1160                if (remove.flags)
1161                        return -EINVAL;
1162
1163                if (container->def_window_pending && !remove.start_addr) {
1164                        container->def_window_pending = false;
1165                        return 0;
1166                }
1167
1168                mutex_lock(&container->lock);
1169
1170                ret = tce_iommu_remove_window(container, remove.start_addr);
1171
1172                mutex_unlock(&container->lock);
1173
1174                return ret;
1175        }
1176        }
1177
1178        return -ENOTTY;
1179}
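
/*
 * Example (illustrative sketch, not part of this driver and not built here):
 * an end-to-end v2 map/unmap sequence against the ioctl handler above.  The
 * buffer, size and IOVA are assumptions; vaddr, iova and size must be
 * multiples of the IOMMU page size of the window being targeted, and the
 * memory must have been preregistered first (which also marks the container
 * enabled in the v2 case).
 */
#if 0   /* illustrative userspace code */
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int map_example(int container, void *buf, unsigned long size,
                unsigned long iova)
{
        struct vfio_iommu_spapr_register_memory reg = {
                .argsz = sizeof(reg),
                .vaddr = (__u64)(unsigned long)buf,
                .size = size,
        };
        struct vfio_iommu_type1_dma_map map = {
                .argsz = sizeof(map),
                .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
                .vaddr = (__u64)(unsigned long)buf,
                .iova = iova,
                .size = size,
        };
        struct vfio_iommu_type1_dma_unmap unmap = {
                .argsz = sizeof(unmap),
                .iova = iova,
                .size = size,
        };

        if (ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg))
                return -1;

        /* Programs TCEs; also creates the default window if still pending */
        if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
                return -1;

        /* ... device DMA to/from iova ... */

        ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
        return ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
}
#endif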
1180
1181static void tce_iommu_release_ownership(struct tce_container *container,
1182                struct iommu_table_group *table_group)
1183{
1184        int i;
1185
1186        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1187                struct iommu_table *tbl = container->tables[i];
1188
1189                if (!tbl)
1190                        continue;
1191
1192                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
1193                if (tbl->it_map)
1194                        iommu_release_ownership(tbl);
1195
1196                container->tables[i] = NULL;
1197        }
1198}
1199
1200static int tce_iommu_take_ownership(struct tce_container *container,
1201                struct iommu_table_group *table_group)
1202{
1203        int i, j, rc = 0;
1204
1205        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1206                struct iommu_table *tbl = table_group->tables[i];
1207
1208                if (!tbl || !tbl->it_map)
1209                        continue;
1210
1211                rc = iommu_take_ownership(tbl);
1212                if (rc) {
1213                        for (j = 0; j < i; ++j)
1214                                iommu_release_ownership(
1215                                                table_group->tables[j]);
1216
1217                        return rc;
1218                }
1219        }
1220
1221        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1222                container->tables[i] = table_group->tables[i];
1223
1224        return 0;
1225}
1226
1227static void tce_iommu_release_ownership_ddw(struct tce_container *container,
1228                struct iommu_table_group *table_group)
1229{
1230        long i;
1231
1232        if (!table_group->ops->unset_window) {
1233                WARN_ON_ONCE(1);
1234                return;
1235        }
1236
1237        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1238                table_group->ops->unset_window(table_group, i);
1239
1240        table_group->ops->release_ownership(table_group);
1241}
1242
1243static long tce_iommu_take_ownership_ddw(struct tce_container *container,
1244                struct iommu_table_group *table_group)
1245{
1246        long i, ret = 0;
1247
1248        if (!table_group->ops->create_table || !table_group->ops->set_window ||
1249                        !table_group->ops->release_ownership) {
1250                WARN_ON_ONCE(1);
1251                return -EFAULT;
1252        }
1253
1254        table_group->ops->take_ownership(table_group);
1255
1256        /* Set all windows to the new group */
1257        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1258                struct iommu_table *tbl = container->tables[i];
1259
1260                if (!tbl)
1261                        continue;
1262
1263                ret = table_group->ops->set_window(table_group, i, tbl);
1264                if (ret)
1265                        goto release_exit;
1266        }
1267
1268        return 0;
1269
1270release_exit:
1271        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1272                table_group->ops->unset_window(table_group, i);
1273
1274        table_group->ops->release_ownership(table_group);
1275
1276        return ret;
1277}
1278
1279static int tce_iommu_attach_group(void *iommu_data,
1280                struct iommu_group *iommu_group)
1281{
1282        int ret;
1283        struct tce_container *container = iommu_data;
1284        struct iommu_table_group *table_group;
1285        struct tce_iommu_group *tcegrp = NULL;
1286
1287        mutex_lock(&container->lock);
1288
1289        /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1290                        iommu_group_id(iommu_group), iommu_group); */
1291        table_group = iommu_group_get_iommudata(iommu_group);
1292        if (!table_group) {
1293                ret = -ENODEV;
1294                goto unlock_exit;
1295        }
1296
1297        if (tce_groups_attached(container) && (!table_group->ops ||
1298                        !table_group->ops->take_ownership ||
1299                        !table_group->ops->release_ownership)) {
1300                ret = -EBUSY;
1301                goto unlock_exit;
1302        }
1303
1304        /* Check if new group has the same iommu_ops (i.e. compatible) */
1305        list_for_each_entry(tcegrp, &container->group_list, next) {
1306                struct iommu_table_group *table_group_tmp;
1307
1308                if (tcegrp->grp == iommu_group) {
1309                        pr_warn("tce_vfio: Group %d is already attached\n",
1310                                        iommu_group_id(iommu_group));
1311                        ret = -EBUSY;
1312                        goto unlock_exit;
1313                }
1314                table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1315                if (table_group_tmp->ops->create_table !=
1316                                table_group->ops->create_table) {
1317                        pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1318                                        iommu_group_id(iommu_group),
1319                                        iommu_group_id(tcegrp->grp));
1320                        ret = -EPERM;
1321                        goto unlock_exit;
1322                }
1323        }
1324
1325        tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1326        if (!tcegrp) {
1327                ret = -ENOMEM;
1328                goto unlock_exit;
1329        }
1330
1331        if (!table_group->ops || !table_group->ops->take_ownership ||
1332                        !table_group->ops->release_ownership) {
1333                if (container->v2) {
1334                        ret = -EPERM;
1335                        goto unlock_exit;
1336                }
1337                ret = tce_iommu_take_ownership(container, table_group);
1338        } else {
1339                if (!container->v2) {
1340                        ret = -EPERM;
1341                        goto unlock_exit;
1342                }
1343                ret = tce_iommu_take_ownership_ddw(container, table_group);
1344                if (!tce_groups_attached(container) && !container->tables[0])
1345                        container->def_window_pending = true;
1346        }
1347
1348        if (!ret) {
1349                tcegrp->grp = iommu_group;
1350                list_add(&tcegrp->next, &container->group_list);
1351        }
1352
1353unlock_exit:
1354        if (ret && tcegrp)
1355                kfree(tcegrp);
1356
1357        mutex_unlock(&container->lock);
1358
1359        return ret;
1360}
1361
1362static void tce_iommu_detach_group(void *iommu_data,
1363                struct iommu_group *iommu_group)
1364{
1365        struct tce_container *container = iommu_data;
1366        struct iommu_table_group *table_group;
1367        bool found = false;
1368        struct tce_iommu_group *tcegrp;
1369
1370        mutex_lock(&container->lock);
1371
1372        list_for_each_entry(tcegrp, &container->group_list, next) {
1373                if (tcegrp->grp == iommu_group) {
1374                        found = true;
1375                        break;
1376                }
1377        }
1378
1379        if (!found) {
1380                pr_warn("tce_vfio: detaching unattached group #%u\n",
1381                                iommu_group_id(iommu_group));
1382                goto unlock_exit;
1383        }
1384
1385        list_del(&tcegrp->next);
1386        kfree(tcegrp);
1387
1388        table_group = iommu_group_get_iommudata(iommu_group);
1389        BUG_ON(!table_group);
1390
1391        if (!table_group->ops || !table_group->ops->release_ownership)
1392                tce_iommu_release_ownership(container, table_group);
1393        else
1394                tce_iommu_release_ownership_ddw(container, table_group);
1395
1396unlock_exit:
1397        mutex_unlock(&container->lock);
1398}
1399
1400const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1401        .name           = "iommu-vfio-powerpc",
1402        .owner          = THIS_MODULE,
1403        .open           = tce_iommu_open,
1404        .release        = tce_iommu_release,
1405        .ioctl          = tce_iommu_ioctl,
1406        .attach_group   = tce_iommu_attach_group,
1407        .detach_group   = tce_iommu_detach_group,
1408};
1409
1410static int __init tce_iommu_init(void)
1411{
1412        return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1413}
1414
1415static void __exit tce_iommu_cleanup(void)
1416{
1417        vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1418}
1419
1420module_init(tce_iommu_init);
1421module_exit(tce_iommu_cleanup);
1422
1423MODULE_VERSION(DRIVER_VERSION);
1424MODULE_LICENSE("GPL v2");
1425MODULE_AUTHOR(DRIVER_AUTHOR);
1426MODULE_DESCRIPTION(DRIVER_DESC);