linux/drivers/vfio/vfio_iommu_spapr_tce.c
/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group);

static long try_increment_locked_vm(struct mm_struct *mm, long npages)
{
        long ret = 0, locked, lock_limit;

        if (WARN_ON_ONCE(!mm))
                return -EPERM;

        if (!npages)
                return 0;

        down_write(&mm->mmap_sem);
        locked = mm->locked_vm + npages;
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                ret = -ENOMEM;
        else
                mm->locked_vm += npages;

        pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
                        npages << PAGE_SHIFT,
                        mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK),
                        ret ? " - exceeded" : "");

        up_write(&mm->mmap_sem);

        return ret;
}

static void decrement_locked_vm(struct mm_struct *mm, long npages)
{
        if (!mm || !npages)
                return;

        down_write(&mm->mmap_sem);
        if (WARN_ON_ONCE(npages > mm->locked_vm))
                npages = mm->locked_vm;
        mm->locked_vm -= npages;
        pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
                        npages << PAGE_SHIFT,
                        mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK));
        up_write(&mm->mmap_sem);
}

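/*
 * Worked example of the accounting above (illustration only, values are
 * hypothetical): with a default RLIMIT_MEMLOCK of 64KB and 4KB system
 * pages, lock_limit is 16 pages, so an attempt to account a 2GB window
 * (2GB >> 12 == 524288 pages) fails with -ENOMEM unless the task has
 * CAP_IPC_LOCK or the limit was raised first:
 *
 *	locked = mm->locked_vm + ((2UL << 30) >> PAGE_SHIFT);
 *	if (locked > (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT) &&
 *			!capable(CAP_IPC_LOCK))
 *		return -ENOMEM;	// caller must raise the rlimit first
 *
 * Management tools typically raise the memlock limit for VFIO users
 * before the container is enabled.
 */
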
/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

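/*
 * A minimal userspace sketch of driving this fd (illustration only;
 * error handling omitted, the group number and buffer are assumptions):
 *
 *	int cfd = open("/dev/vfio/vfio", O_RDWR);
 *	int gfd = open("/dev/vfio/42", O_RDWR);	// some IOMMU group
 *
 *	ioctl(gfd, VFIO_GROUP_SET_CONTAINER, &cfd);
 *	ioctl(cfd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(cfd, VFIO_IOMMU_ENABLE);		// v1 locked-memory accounting
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova = 0,
 *		.size = bufsz,
 *	};
 *	ioctl(cfd, VFIO_IOMMU_MAP_DMA, &map);
 */
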
struct tce_iommu_group {
        struct list_head next;
        struct iommu_group *grp;
};

/*
 * A container needs to remember which preregistered regions it has
 * referenced so it can do proper cleanup when the userspace process
 * exits.
 */
struct tce_iommu_prereg {
        struct list_head next;
        struct mm_iommu_table_group_mem_t *mem;
};

/*
 * The container descriptor. The VFIO API does not supply the container
 * with an IOMMU group at the moment of initialization, so attached
 * groups are tracked in group_list and may be added or removed later.
 */
struct tce_container {
        struct mutex lock;
        bool enabled;
        bool v2;
        bool def_window_pending;
        unsigned long locked_pages;
        struct mm_struct *mm;
        struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
        struct list_head group_list;
        struct list_head prereg_list;
};

static long tce_iommu_mm_set(struct tce_container *container)
{
        if (container->mm) {
                if (container->mm == current->mm)
                        return 0;
                return -EPERM;
        }
        BUG_ON(!current->mm);
        container->mm = current->mm;
        atomic_inc(&container->mm->mm_count);

        return 0;
}

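/*
 * Note on lifetime (descriptive, not normative): tce_iommu_mm_set()
 * latches the first caller's mm and takes a reference with
 * atomic_inc(&mm->mm_count) (i.e. mmgrab() semantics). That keeps the
 * struct mm_struct itself alive until mmdrop() in tce_iommu_release(),
 * but does not pin the address space the way mmget()/get_task_mm()
 * would. Later ioctls issued from a different mm get -EPERM.
 */
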
static long tce_iommu_prereg_free(struct tce_container *container,
                struct tce_iommu_prereg *tcemem)
{
        long ret;

        ret = mm_iommu_put(container->mm, tcemem->mem);
        if (ret)
                return ret;

        list_del(&tcemem->next);
        kfree(tcemem);

        return 0;
}

static long tce_iommu_unregister_pages(struct tce_container *container,
                __u64 vaddr, __u64 size)
{
        struct mm_iommu_table_group_mem_t *mem;
        struct tce_iommu_prereg *tcemem;
        bool found = false;

        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
                return -EINVAL;

        mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
        if (!mem)
                return -ENOENT;

        list_for_each_entry(tcemem, &container->prereg_list, next) {
                if (tcemem->mem == mem) {
                        found = true;
                        break;
                }
        }

        if (!found)
                return -ENOENT;

        return tce_iommu_prereg_free(container, tcemem);
}

static long tce_iommu_register_pages(struct tce_container *container,
                __u64 vaddr, __u64 size)
{
        long ret = 0;
        struct mm_iommu_table_group_mem_t *mem = NULL;
        struct tce_iommu_prereg *tcemem;
        unsigned long entries = size >> PAGE_SHIFT;

        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
                        ((vaddr + size) < vaddr))
                return -EINVAL;

        mem = mm_iommu_find(container->mm, vaddr, entries);
        if (mem) {
                list_for_each_entry(tcemem, &container->prereg_list, next) {
                        if (tcemem->mem == mem)
                                return -EBUSY;
                }
        }

        ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
        if (ret)
                return ret;

        tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
        if (!tcemem) {
                mm_iommu_put(container->mm, mem);
                return -ENOMEM;
        }

        tcemem->mem = mem;
        list_add(&tcemem->next, &container->prereg_list);

        container->enabled = true;

        return 0;
}

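/*
 * Userspace reaches the function above through
 * VFIO_IOMMU_SPAPR_REGISTER_MEMORY on a v2 container. A hedged sketch
 * (illustration only, values assumed):
 *
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.flags = 0,
 *		.vaddr = (__u64)(uintptr_t)buf,	// page aligned
 *		.size  = 16UL << 20,		// multiple of PAGE_SIZE
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *
 * Registering the same region twice on one container returns -EBUSY;
 * overlap handling is left to mm_iommu_get()/mm_iommu_find().
 */
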
static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
                struct mm_struct *mm)
{
        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
                        tbl->it_size, PAGE_SIZE);
        unsigned long *uas;
        long ret;

        BUG_ON(tbl->it_userspace);

        ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
        if (ret)
                return ret;

        uas = vzalloc(cb);
        if (!uas) {
                decrement_locked_vm(mm, cb >> PAGE_SHIFT);
                return -ENOMEM;
        }
        tbl->it_userspace = uas;

        return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
                struct mm_struct *mm)
{
        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
                        tbl->it_size, PAGE_SIZE);

        if (!tbl->it_userspace)
                return;

        vfree(tbl->it_userspace);
        tbl->it_userspace = NULL;
        decrement_locked_vm(mm, cb >> PAGE_SHIFT);
}

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
        /*
         * Check that the TCE table granularity is not bigger than the size of
         * the page we just found. Otherwise the hardware can get access to
         * a bigger memory chunk than it should.
         */
        return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

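/*
 * Worked example (assuming 4K system pages, PAGE_SHIFT == 12): a 16MB
 * compound page has compound_order() == 12, so it "contains" any TCE
 * page up to shift 12 + 12 == 24 (16MB). A normal 4K page (order 0)
 * only satisfies page_shift <= 12, so backing a 64K or 16MB TCE with
 * it would let the device reach neighbouring pages and is rejected.
 */
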
static inline bool tce_groups_attached(struct tce_container *container)
{
        return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
                phys_addr_t ioba, struct iommu_table **ptbl)
{
        long i;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = container->tables[i];

                if (tbl) {
                        unsigned long entry = ioba >> tbl->it_page_shift;
                        unsigned long start = tbl->it_offset;
                        unsigned long end = start + tbl->it_size;

                        if ((start <= entry) && (entry < end)) {
                                *ptbl = tbl;
                                return i;
                        }
                }
        }

        return -1;
}

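/*
 * Example of the lookup math (hypothetical numbers): for a table with
 * it_page_shift == 16 (64K TCEs), it_offset == 0 and it_size == 0x8000,
 * ioba 0x10000000 yields entry 0x1000, which falls in [0, 0x8000) and
 * matches; ioba 0x80000000 yields entry 0x8000 and misses, so the
 * caller gets -1 (mapped to -ENXIO or -EINVAL by the ioctl handlers).
 */
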
static int tce_iommu_find_free_table(struct tce_container *container)
{
        int i;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                if (!container->tables[i])
                        return i;
        }

        return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
        int ret = 0;
        unsigned long locked;
        struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp;

        if (container->enabled)
                return -EBUSY;

        /*
         * When userspace pages are mapped into the IOMMU, they are effectively
         * locked memory, so, theoretically, we need to update the accounting
         * of locked pages on each map and unmap. For powerpc, the map/unmap
         * paths can be very hot, though, and the accounting would kill
         * performance, especially since it would be difficult or impossible
         * to handle the accounting in real mode only.
         *
         * To address that, rather than precisely accounting every page, we
         * instead account for a worst case on locked memory when the iommu is
         * enabled and disabled. The worst case upper bound on locked memory
         * is the size of the whole iommu window, which is usually relatively
         * small (compared to total memory sizes) on POWER hardware.
         *
         * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
         * that would effectively kill the guest at random points, so it is
         * much better to enforce the limit based on the maximum that the
         * guest can map.
         *
         * Unfortunately, at the moment this counts whole tables no matter how
         * much memory the guest actually has. I.e. for a 4GB guest and
         * 4 IOMMU groups, each with a 2GB DMA window, 8GB will be counted
         * here. The reason is that we cannot tell from here the amount of RAM
         * used by the guest, as this information is only available from KVM
         * and VFIO is KVM agnostic.
         *
         * So we do not allow enabling a container without a group attached,
         * as there is no way to know by how much we should increment
         * the locked_vm counter.
         */
        if (!tce_groups_attached(container))
                return -ENODEV;

        tcegrp = list_first_entry(&container->group_list,
                        struct tce_iommu_group, next);
        table_group = iommu_group_get_iommudata(tcegrp->grp);
        if (!table_group)
                return -ENODEV;

        if (!table_group->tce32_size)
                return -EPERM;

        ret = tce_iommu_mm_set(container);
        if (ret)
                return ret;

        locked = table_group->tce32_size >> PAGE_SHIFT;
        ret = try_increment_locked_vm(container->mm, locked);
        if (ret)
                return ret;

        container->locked_pages = locked;

        container->enabled = true;

        return ret;
}

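/*
 * Worst-case accounting example from the comment above (illustration,
 * hypothetical sizes): with tce32_size == 2GB and 4K system pages, each
 * enabled container accounts 2GB >> 12 == 524288 pages against
 * RLIMIT_MEMLOCK, regardless of how much of the window the guest ever
 * maps; four such containers account 8GB in total.
 */
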
static void tce_iommu_disable(struct tce_container *container)
{
        if (!container->enabled)
                return;

        container->enabled = false;

        BUG_ON(!container->mm);
        decrement_locked_vm(container->mm, container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
        struct tce_container *container;

        if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
                pr_err("tce_vfio: Wrong IOMMU type\n");
                return ERR_PTR(-EINVAL);
        }

        container = kzalloc(sizeof(*container), GFP_KERNEL);
        if (!container)
                return ERR_PTR(-ENOMEM);

        mutex_init(&container->lock);
        INIT_LIST_HEAD_RCU(&container->group_list);
        INIT_LIST_HEAD_RCU(&container->prereg_list);

        container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

        return container;
}

static int tce_iommu_clear(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct tce_container *container,
                struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
        struct tce_container *container = iommu_data;
        struct tce_iommu_group *tcegrp;
        long i;

        while (tce_groups_attached(container)) {
                tcegrp = list_first_entry(&container->group_list,
                                struct tce_iommu_group, next);
                tce_iommu_detach_group(iommu_data, tcegrp->grp);
        }

        /*
         * If VFIO created a table, it was not disposed of by
         * tce_iommu_detach_group(), so do it now.
         */
        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = container->tables[i];

                if (!tbl)
                        continue;

                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
                tce_iommu_free_table(container, tbl);
        }

        while (!list_empty(&container->prereg_list)) {
                struct tce_iommu_prereg *tcemem;

                tcemem = list_first_entry(&container->prereg_list,
                                struct tce_iommu_prereg, next);
                WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
        }

        tce_iommu_disable(container);
        if (container->mm)
                mmdrop(container->mm);
        mutex_destroy(&container->lock);

        kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
                unsigned long hpa)
{
        struct page *page;

        page = pfn_to_page(hpa >> PAGE_SHIFT);
        put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
                unsigned long tce, unsigned long size,
                unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
        long ret = 0;
        struct mm_iommu_table_group_mem_t *mem;

        mem = mm_iommu_lookup(container->mm, tce, size);
        if (!mem)
                return -EINVAL;

        ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
        if (ret)
                return -EINVAL;

        *pmem = mem;

        return 0;
}

static void tce_iommu_unuse_page_v2(struct tce_container *container,
                struct iommu_table *tbl, unsigned long entry)
{
        struct mm_iommu_table_group_mem_t *mem = NULL;
        int ret;
        unsigned long hpa = 0;
        unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

        if (!pua)
                return;

        ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
                        &hpa, &mem);
        if (ret)
                pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
                                __func__, *pua, entry, ret);
        if (mem)
                mm_iommu_mapped_dec(mem);

        *pua = 0;
}

static int tce_iommu_clear(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long pages)
{
        unsigned long oldhpa;
        long ret;
        enum dma_data_direction direction;

        for ( ; pages; --pages, ++entry) {
                direction = DMA_NONE;
                oldhpa = 0;
                ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
                if (ret)
                        continue;

                if (direction == DMA_NONE)
                        continue;

                if (container->v2) {
                        tce_iommu_unuse_page_v2(container, tbl, entry);
                        continue;
                }

                tce_iommu_unuse_page(container, oldhpa);
        }

        return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
        struct page *page = NULL;
        enum dma_data_direction direction = iommu_tce_direction(tce);

        if (get_user_pages_fast(tce & PAGE_MASK, 1,
                        direction != DMA_TO_DEVICE, &page) != 1)
                return -EFAULT;

        *hpa = __pa((unsigned long) page_address(page));

        return 0;
}

static long tce_iommu_build(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long tce, unsigned long pages,
                enum dma_data_direction direction)
{
        long i, ret = 0;
        struct page *page;
        unsigned long hpa;
        enum dma_data_direction dirtmp;

        for (i = 0; i < pages; ++i) {
                unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

                ret = tce_iommu_use_page(tce, &hpa);
                if (ret)
                        break;

                page = pfn_to_page(hpa >> PAGE_SHIFT);
                if (!tce_page_is_contained(page, tbl->it_page_shift)) {
                        ret = -EPERM;
                        break;
                }

                hpa |= offset;
                dirtmp = direction;
                ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
                if (ret) {
                        tce_iommu_unuse_page(container, hpa);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                                        __func__, entry << tbl->it_page_shift,
                                        tce, ret);
                        break;
                }

                if (dirtmp != DMA_NONE)
                        tce_iommu_unuse_page(container, hpa);

                tce += IOMMU_PAGE_SIZE(tbl);
        }

        if (ret)
                tce_iommu_clear(container, tbl, entry, i);

        return ret;
}

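/*
 * Sketch of the mapping loop's arithmetic (illustrative values): for a
 * table with it_page_shift == 16, a MAP_DMA of size 1MB covers
 * 1MB >> 16 == 16 entries; each iteration pins one userspace page via
 * get_user_pages_fast(), checks containment, and exchanges the TCE with
 * iommu_tce_xchg(). On failure, the entries [entry, entry + i) built so
 * far are torn down by tce_iommu_clear().
 */
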
static long tce_iommu_build_v2(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long tce, unsigned long pages,
                enum dma_data_direction direction)
{
        long i, ret = 0;
        struct page *page;
        unsigned long hpa;
        enum dma_data_direction dirtmp;

        if (!tbl->it_userspace) {
                ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
                if (ret)
                        return ret;
        }

        for (i = 0; i < pages; ++i) {
                struct mm_iommu_table_group_mem_t *mem = NULL;
                unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
                                entry + i);

                ret = tce_iommu_prereg_ua_to_hpa(container,
                                tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
                if (ret)
                        break;

                page = pfn_to_page(hpa >> PAGE_SHIFT);
                if (!tce_page_is_contained(page, tbl->it_page_shift)) {
                        ret = -EPERM;
                        break;
                }

                /* Preserve offset within IOMMU page */
                hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
                dirtmp = direction;

                /* The registered region is being unregistered */
                if (mm_iommu_mapped_inc(mem))
                        break;

                ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
                if (ret) {
                        /* dirtmp cannot be DMA_NONE here */
                        tce_iommu_unuse_page_v2(container, tbl, entry + i);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                                        __func__, entry << tbl->it_page_shift,
                                        tce, ret);
                        break;
                }

                if (dirtmp != DMA_NONE)
                        tce_iommu_unuse_page_v2(container, tbl, entry + i);

                *pua = tce;

                tce += IOMMU_PAGE_SIZE(tbl);
        }

        if (ret)
                tce_iommu_clear(container, tbl, entry, i);

        return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
                        struct iommu_table_group *table_group,
                        int num,
                        __u32 page_shift,
                        __u64 window_size,
                        __u32 levels,
                        struct iommu_table **ptbl)
{
        long ret, table_size;

        table_size = table_group->ops->get_table_size(page_shift, window_size,
                        levels);
        if (!table_size)
                return -EINVAL;

        ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
        if (ret)
                return ret;

        ret = table_group->ops->create_table(table_group, num,
                        page_shift, window_size, levels, ptbl);

        WARN_ON(!ret && !(*ptbl)->it_ops->free);
        WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

        return ret;
}

static void tce_iommu_free_table(struct tce_container *container,
                struct iommu_table *tbl)
{
        unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

        tce_iommu_userspace_view_free(tbl, container->mm);
        tbl->it_ops->free(tbl);
        decrement_locked_vm(container->mm, pages);
}

static long tce_iommu_create_window(struct tce_container *container,
                __u32 page_shift, __u64 window_size, __u32 levels,
                __u64 *start_addr)
{
        struct tce_iommu_group *tcegrp;
        struct iommu_table_group *table_group;
        struct iommu_table *tbl = NULL;
        long ret, num;

        num = tce_iommu_find_free_table(container);
        if (num < 0)
                return num;

        /* Get the first group for ops::create_table */
        tcegrp = list_first_entry(&container->group_list,
                        struct tce_iommu_group, next);
        table_group = iommu_group_get_iommudata(tcegrp->grp);
        if (!table_group)
                return -EFAULT;

        if (!(table_group->pgsizes & (1ULL << page_shift)))
                return -EINVAL;

        if (!table_group->ops->set_window || !table_group->ops->unset_window ||
                        !table_group->ops->get_table_size ||
                        !table_group->ops->create_table)
                return -EPERM;

        /* Create TCE table */
        ret = tce_iommu_create_table(container, table_group, num,
                        page_shift, window_size, levels, &tbl);
        if (ret)
                return ret;

        BUG_ON(!tbl->it_ops->free);

        /*
         * Program the table to every group.
         * Groups have been tested for compatibility at attach time.
         */
        list_for_each_entry(tcegrp, &container->group_list, next) {
                table_group = iommu_group_get_iommudata(tcegrp->grp);

                ret = table_group->ops->set_window(table_group, num, tbl);
                if (ret)
                        goto unset_exit;
        }

        container->tables[num] = tbl;

        /* Return start address assigned by platform in create_table() */
        *start_addr = tbl->it_offset << tbl->it_page_shift;

        return 0;

unset_exit:
        list_for_each_entry(tcegrp, &container->group_list, next) {
                table_group = iommu_group_get_iommudata(tcegrp->grp);
                table_group->ops->unset_window(table_group, num);
        }
        tce_iommu_free_table(container, tbl);

        return ret;
}

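/*
 * Userspace creates an additional (DDW) window on a v2 container with
 * VFIO_IOMMU_SPAPR_TCE_CREATE. A hedged sketch (values assumed; the
 * platform advertises supported pgsizes and levels via
 * VFIO_IOMMU_SPAPR_TCE_GET_INFO):
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,		// 64K IOMMU pages
 *		.window_size = 1ULL << 32,	// 4GB window
 *		.levels = 1,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *	// create.start_addr now holds the bus address of the new window
 */
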
static long tce_iommu_remove_window(struct tce_container *container,
                __u64 start_addr)
{
        struct iommu_table_group *table_group = NULL;
        struct iommu_table *tbl;
        struct tce_iommu_group *tcegrp;
        int num;

        num = tce_iommu_find_table(container, start_addr, &tbl);
        if (num < 0)
                return -EINVAL;

        BUG_ON(!tbl->it_size);

        /* Detach groups from IOMMUs */
        list_for_each_entry(tcegrp, &container->group_list, next) {
                table_group = iommu_group_get_iommudata(tcegrp->grp);

                /*
                 * The SPAPR TCE IOMMU exposes the default DMA window to
                 * the guest via dma32_window_start/size of
                 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
                 * userspace to remove this window, some do not, so here
                 * we check for the platform capability.
                 */
                if (!table_group->ops || !table_group->ops->unset_window)
                        return -EPERM;

                table_group->ops->unset_window(table_group, num);
        }

        /* Free table */
        tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
        tce_iommu_free_table(container, tbl);
        container->tables[num] = NULL;

        return 0;
}

static long tce_iommu_create_default_window(struct tce_container *container)
{
        long ret;
        __u64 start_addr = 0;
        struct tce_iommu_group *tcegrp;
        struct iommu_table_group *table_group;

        if (!container->def_window_pending)
                return 0;

        if (!tce_groups_attached(container))
                return -ENODEV;

        tcegrp = list_first_entry(&container->group_list,
                        struct tce_iommu_group, next);
        table_group = iommu_group_get_iommudata(tcegrp->grp);
        if (!table_group)
                return -ENODEV;

        ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
                        table_group->tce32_size, 1, &start_addr);
        WARN_ON_ONCE(!ret && start_addr);

        if (!ret)
                container->def_window_pending = false;

        return ret;
}

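/*
 * Note (descriptive): the default window recreated above mirrors the
 * hardware default: 4K IOMMU pages (IOMMU_PAGE_SHIFT_4K), a window of
 * table_group->tce32_size bytes, and a single level. It is expected to
 * start at bus address 0, hence the WARN_ON_ONCE(!ret && start_addr).
 */
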
static long tce_iommu_ioctl(void *iommu_data,
                                 unsigned int cmd, unsigned long arg)
{
        struct tce_container *container = iommu_data;
        unsigned long minsz, ddwsz;
        long ret;

        switch (cmd) {
        case VFIO_CHECK_EXTENSION:
                switch (arg) {
                case VFIO_SPAPR_TCE_IOMMU:
                case VFIO_SPAPR_TCE_v2_IOMMU:
                        ret = 1;
                        break;
                default:
                        ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
                        break;
                }

                return (ret < 0) ? 0 : ret;
        }

        /*
         * Sanity check to prevent one userspace process from
         * manipulating another process's mm.
         */
        BUG_ON(!container);
        if (container->mm && container->mm != current->mm)
                return -EPERM;

        switch (cmd) {
        case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
                struct vfio_iommu_spapr_tce_info info;
                struct tce_iommu_group *tcegrp;
                struct iommu_table_group *table_group;

                if (!tce_groups_attached(container))
                        return -ENXIO;

                tcegrp = list_first_entry(&container->group_list,
                                struct tce_iommu_group, next);
                table_group = iommu_group_get_iommudata(tcegrp->grp);

                if (!table_group)
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
                                dma32_window_size);

                if (copy_from_user(&info, (void __user *)arg, minsz))
                        return -EFAULT;

                if (info.argsz < minsz)
                        return -EINVAL;

                info.dma32_window_start = table_group->tce32_start;
                info.dma32_window_size = table_group->tce32_size;
                info.flags = 0;
                memset(&info.ddw, 0, sizeof(info.ddw));

                if (table_group->max_dynamic_windows_supported &&
                                container->v2) {
                        info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
                        info.ddw.pgsizes = table_group->pgsizes;
                        info.ddw.max_dynamic_windows_supported =
                                table_group->max_dynamic_windows_supported;
                        info.ddw.levels = table_group->max_levels;
                }

                ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

                if (info.argsz >= ddwsz)
                        minsz = ddwsz;

                if (copy_to_user((void __user *)arg, &info, minsz))
                        return -EFAULT;

                return 0;
        }
        case VFIO_IOMMU_MAP_DMA: {
                struct vfio_iommu_type1_dma_map param;
                struct iommu_table *tbl = NULL;
                long num;
                enum dma_data_direction direction;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
                                VFIO_DMA_MAP_FLAG_WRITE))
                        return -EINVAL;

                ret = tce_iommu_create_default_window(container);
                if (ret)
                        return ret;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
                                (param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
                        return -EINVAL;

                /* iova is checked by the IOMMU API */
                if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                                direction = DMA_BIDIRECTIONAL;
                        else
                                direction = DMA_TO_DEVICE;
                } else {
                        if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
                                direction = DMA_FROM_DEVICE;
                        else
                                return -EINVAL;
                }

                ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
                if (ret)
                        return ret;

                if (container->v2)
                        ret = tce_iommu_build_v2(container, tbl,
                                        param.iova >> tbl->it_page_shift,
                                        param.vaddr,
                                        param.size >> tbl->it_page_shift,
                                        direction);
                else
                        ret = tce_iommu_build(container, tbl,
                                        param.iova >> tbl->it_page_shift,
                                        param.vaddr,
                                        param.size >> tbl->it_page_shift,
                                        direction);

                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_UNMAP_DMA: {
                struct vfio_iommu_type1_dma_unmap param;
                struct iommu_table *tbl = NULL;
                long num;

                if (!container->enabled)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
                                size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are supported for now */
                if (param.flags)
                        return -EINVAL;

                ret = tce_iommu_create_default_window(container);
                if (ret)
                        return ret;

                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;

                if (param.size & ~IOMMU_PAGE_MASK(tbl))
                        return -EINVAL;

                ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
                                param.size >> tbl->it_page_shift);
                if (ret)
                        return ret;

                ret = tce_iommu_clear(container, tbl,
                                param.iova >> tbl->it_page_shift,
                                param.size >> tbl->it_page_shift);
                iommu_flush_tce(tbl);

                return ret;
        }
        case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
                struct vfio_iommu_spapr_register_memory param;

                if (!container->v2)
                        break;

                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                                size);

                ret = tce_iommu_mm_set(container);
                if (ret)
                        return ret;

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are supported for now */
                if (param.flags)
                        return -EINVAL;

                mutex_lock(&container->lock);
                ret = tce_iommu_register_pages(container, param.vaddr,
                                param.size);
                mutex_unlock(&container->lock);

                return ret;
        }
        case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
                struct vfio_iommu_spapr_register_memory param;

                if (!container->v2)
                        break;

                if (!container->mm)
                        return -EPERM;

                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                                size);

                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;

                if (param.argsz < minsz)
                        return -EINVAL;

                /* No flags are supported for now */
                if (param.flags)
                        return -EINVAL;

                mutex_lock(&container->lock);
                ret = tce_iommu_unregister_pages(container, param.vaddr,
                                param.size);
                mutex_unlock(&container->lock);

                return ret;
        }
        case VFIO_IOMMU_ENABLE:
                if (container->v2)
                        break;

                mutex_lock(&container->lock);
                ret = tce_iommu_enable(container);
                mutex_unlock(&container->lock);
                return ret;

        case VFIO_IOMMU_DISABLE:
                if (container->v2)
                        break;

                mutex_lock(&container->lock);
                tce_iommu_disable(container);
                mutex_unlock(&container->lock);
                return 0;

        case VFIO_EEH_PE_OP: {
                struct tce_iommu_group *tcegrp;

                ret = 0;
                list_for_each_entry(tcegrp, &container->group_list, next) {
                        ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
                                        cmd, arg);
                        if (ret)
                                return ret;
                }
                return ret;
        }

        case VFIO_IOMMU_SPAPR_TCE_CREATE: {
                struct vfio_iommu_spapr_tce_create create;

                if (!container->v2)
                        break;

                ret = tce_iommu_mm_set(container);
                if (ret)
                        return ret;

                if (!tce_groups_attached(container))
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
                                start_addr);

                if (copy_from_user(&create, (void __user *)arg, minsz))
                        return -EFAULT;

                if (create.argsz < minsz)
                        return -EINVAL;

                if (create.flags)
                        return -EINVAL;

                mutex_lock(&container->lock);

                ret = tce_iommu_create_default_window(container);
                if (!ret)
                        ret = tce_iommu_create_window(container,
                                        create.page_shift,
                                        create.window_size, create.levels,
                                        &create.start_addr);

                mutex_unlock(&container->lock);

                if (!ret && copy_to_user((void __user *)arg, &create, minsz))
                        ret = -EFAULT;

                return ret;
        }
        case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
                struct vfio_iommu_spapr_tce_remove remove;

                if (!container->v2)
                        break;

                ret = tce_iommu_mm_set(container);
                if (ret)
                        return ret;

                if (!tce_groups_attached(container))
                        return -ENXIO;

                minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
                                start_addr);

                if (copy_from_user(&remove, (void __user *)arg, minsz))
                        return -EFAULT;

                if (remove.argsz < minsz)
                        return -EINVAL;

                if (remove.flags)
                        return -EINVAL;

                if (container->def_window_pending && !remove.start_addr) {
                        container->def_window_pending = false;
                        return 0;
                }

                mutex_lock(&container->lock);

                ret = tce_iommu_remove_window(container, remove.start_addr);

                mutex_unlock(&container->lock);

                return ret;
        }
        }

        return -ENOTTY;
}

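/*
 * End-to-end v2 flow as exercised by QEMU-like userspace (a sketch
 * under assumptions, not a definitive sequence; error handling and
 * struct setup omitted):
 *
 *	ioctl(cfd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *	ioctl(cfd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);	// pin RAM
 *	ioctl(cfd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);	// DDW
 *	ioctl(cfd, VFIO_IOMMU_MAP_DMA, &map);	// vaddr must lie in a
 *						// registered region
 *	...
 *	ioctl(cfd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
 *	ioctl(cfd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
 *
 * Note that v2 containers reject VFIO_IOMMU_ENABLE/DISABLE (they fall
 * through to -ENOTTY), relying on preregistration for locked memory
 * accounting instead.
 */
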
static void tce_iommu_release_ownership(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        int i;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = container->tables[i];

                if (!tbl)
                        continue;

                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
                tce_iommu_userspace_view_free(tbl, container->mm);
                if (tbl->it_map)
                        iommu_release_ownership(tbl);

                container->tables[i] = NULL;
        }
}

static int tce_iommu_take_ownership(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        int i, j, rc = 0;

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = table_group->tables[i];

                if (!tbl || !tbl->it_map)
                        continue;

                rc = iommu_take_ownership(tbl);
                if (rc) {
                        for (j = 0; j < i; ++j)
                                iommu_release_ownership(
                                                table_group->tables[j]);

                        return rc;
                }
        }

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
                container->tables[i] = table_group->tables[i];

        return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        long i;

        if (!table_group->ops->unset_window) {
                WARN_ON_ONCE(1);
                return;
        }

        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
                table_group->ops->unset_window(table_group, i);

        table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
                struct iommu_table_group *table_group)
{
        long i, ret = 0;

        if (!table_group->ops->create_table || !table_group->ops->set_window ||
                        !table_group->ops->release_ownership) {
                WARN_ON_ONCE(1);
                return -EFAULT;
        }

        table_group->ops->take_ownership(table_group);

        /* Set all windows to the new group */
        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
                struct iommu_table *tbl = container->tables[i];

                if (!tbl)
                        continue;

                ret = table_group->ops->set_window(table_group, i, tbl);
                if (ret)
                        goto release_exit;
        }

        return 0;

release_exit:
        for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
                table_group->ops->unset_window(table_group, i);

        table_group->ops->release_ownership(table_group);

        return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        int ret;
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp = NULL;

        mutex_lock(&container->lock);

        /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
                        iommu_group_id(iommu_group), iommu_group); */
        table_group = iommu_group_get_iommudata(iommu_group);

        if (tce_groups_attached(container) && (!table_group->ops ||
                        !table_group->ops->take_ownership ||
                        !table_group->ops->release_ownership)) {
                ret = -EBUSY;
                goto unlock_exit;
        }

        /* Check if the new group has the same iommu_ops (i.e. is compatible) */
        list_for_each_entry(tcegrp, &container->group_list, next) {
                struct iommu_table_group *table_group_tmp;

                if (tcegrp->grp == iommu_group) {
                        pr_warn("tce_vfio: Group %d is already attached\n",
                                        iommu_group_id(iommu_group));
                        ret = -EBUSY;
                        goto unlock_exit;
                }
                table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
                if (table_group_tmp->ops != table_group->ops) {
                        pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
                                        iommu_group_id(iommu_group),
                                        iommu_group_id(tcegrp->grp));
                        ret = -EPERM;
                        goto unlock_exit;
                }
        }

        tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
        if (!tcegrp) {
                ret = -ENOMEM;
                goto unlock_exit;
        }

        if (!table_group->ops || !table_group->ops->take_ownership ||
                        !table_group->ops->release_ownership) {
                ret = tce_iommu_take_ownership(container, table_group);
        } else {
                ret = tce_iommu_take_ownership_ddw(container, table_group);
                if (!tce_groups_attached(container) && !container->tables[0])
                        container->def_window_pending = true;
        }

        if (!ret) {
                tcegrp->grp = iommu_group;
                list_add(&tcegrp->next, &container->group_list);
        }

unlock_exit:
        if (ret && tcegrp)
                kfree(tcegrp);

        mutex_unlock(&container->lock);

        return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group)
{
        struct tce_container *container = iommu_data;
        struct iommu_table_group *table_group;
        bool found = false;
        struct tce_iommu_group *tcegrp;

        mutex_lock(&container->lock);

        list_for_each_entry(tcegrp, &container->group_list, next) {
                if (tcegrp->grp == iommu_group) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                pr_warn("tce_vfio: detaching unattached group #%u\n",
                                iommu_group_id(iommu_group));
                goto unlock_exit;
        }

        list_del(&tcegrp->next);
        kfree(tcegrp);

        table_group = iommu_group_get_iommudata(iommu_group);
        BUG_ON(!table_group);

        if (!table_group->ops || !table_group->ops->release_ownership)
                tce_iommu_release_ownership(container, table_group);
        else
                tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
        mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
        .name           = "iommu-vfio-powerpc",
        .owner          = THIS_MODULE,
        .open           = tce_iommu_open,
        .release        = tce_iommu_release,
        .ioctl          = tce_iommu_ioctl,
        .attach_group   = tce_iommu_attach_group,
        .detach_group   = tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
        return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
        vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);