linux/drivers/infiniband/core/umem_odp.c
/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/pagemap.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
				   const struct mmu_interval_notifier_ops *ops)
{
	int ret;

	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);

	if (!umem_odp->is_implicit_odp) {
		size_t page_size = 1UL << umem_odp->page_shift;
		unsigned long start;
		unsigned long end;
		size_t pages;

		start = ALIGN_DOWN(umem_odp->umem.address, page_size);
		if (check_add_overflow(umem_odp->umem.address,
				       (unsigned long)umem_odp->umem.length,
				       &end))
			return -EOVERFLOW;
		end = ALIGN(end, page_size);
		if (unlikely(end < page_size))
			return -EOVERFLOW;

		pages = (end - start) >> umem_odp->page_shift;
		if (!pages)
			return -EINVAL;

		umem_odp->page_list = kvcalloc(
			pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
		if (!umem_odp->page_list)
			return -ENOMEM;

		umem_odp->dma_list = kvcalloc(
			pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
		if (!umem_odp->dma_list) {
			ret = -ENOMEM;
			goto out_page_list;
		}

		ret = mmu_interval_notifier_insert(&umem_odp->notifier,
						   umem_odp->umem.owning_mm,
						   start, end - start, ops);
		if (ret)
			goto out_dma_list;
	}

	return 0;

out_dma_list:
	kvfree(umem_odp->dma_list);
out_page_list:
	kvfree(umem_odp->page_list);
	return ret;
}

/**
 * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
 *
 * Implicit ODP umems do not have a VA range and do not have any page lists.
 * They exist only to hold the per_mm reference to help the driver create
 * child umems.
 *
 * @udata: udata from the syscall being used to create the umem
 * @access: ib_reg_mr access flags
 */
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
					       int access)
{
	struct ib_ucontext *context =
		container_of(udata, struct uverbs_attr_bundle, driver_udata)
			->context;
	struct ib_umem *umem;
	struct ib_umem_odp *umem_odp;
	int ret;

	if (access & IB_ACCESS_HUGETLB)
		return ERR_PTR(-EINVAL);

	if (!context)
		return ERR_PTR(-EIO);

	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);
	umem = &umem_odp->umem;
	umem->ibdev = context->device;
	umem->writable = ib_access_writable(access);
	umem->owning_mm = current->mm;
	umem_odp->is_implicit_odp = 1;
	umem_odp->page_shift = PAGE_SHIFT;

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, NULL);
	if (ret) {
		put_pid(umem_odp->tgid);
		kfree(umem_odp);
		return ERR_PTR(ret);
	}
	return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);

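/*
 * Usage sketch (illustrative only, not part of this file): a driver that
 * supports implicit ODP might allocate the parent umem when userspace
 * registers a whole-address-space MR. The surrounding "imr" structure and
 * error handling are hypothetical; children are then created lazily on page
 * faults with ib_umem_odp_alloc_child().
 *
 *	struct ib_umem_odp *imr_umem;
 *
 *	imr_umem = ib_umem_odp_alloc_implicit(udata, access_flags);
 *	if (IS_ERR(imr_umem))
 *		return ERR_CAST(imr_umem);
 *	imr->umem = &imr_umem->umem;
 */
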
/**
 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 *                           parent ODP umem
 *
 * @root: The parent umem enclosing the child. This must be allocated using
 *        ib_umem_odp_alloc_implicit()
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 * @ops: MMU interval ops, currently only used to implement the invalidate
 *       method
 */
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
			size_t size,
			const struct mmu_interval_notifier_ops *ops)
{
	/*
	 * Caller must ensure that root cannot be freed during the call to
	 * ib_umem_odp_alloc_child().
	 */
	struct ib_umem_odp *odp_data;
	struct ib_umem *umem;
	int ret;

	if (WARN_ON(!root->is_implicit_odp))
		return ERR_PTR(-EINVAL);

	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
		return ERR_PTR(-ENOMEM);
	umem = &odp_data->umem;
	umem->ibdev = root->umem.ibdev;
	umem->length     = size;
	umem->address    = addr;
	umem->writable   = root->umem.writable;
	umem->owning_mm  = root->umem.owning_mm;
	odp_data->page_shift = PAGE_SHIFT;
	odp_data->notifier.ops = ops;

	odp_data->tgid = get_pid(root->tgid);
	ret = ib_init_umem_odp(odp_data, ops);
	if (ret) {
		put_pid(odp_data->tgid);
		kfree(odp_data);
		return ERR_PTR(ret);
	}
	return odp_data;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_child);

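/*
 * Usage sketch (illustrative only, not part of this file): on a page fault
 * inside an implicit MR, a driver would typically carve out a child umem
 * covering the faulting range. "root_odp", "fault_addr", "fault_len" and
 * "my_mmu_interval_ops" are hypothetical names.
 *
 *	struct ib_umem_odp *child;
 *
 *	child = ib_umem_odp_alloc_child(root_odp, fault_addr, fault_len,
 *					&my_mmu_interval_ops);
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 */
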
/**
 * ib_umem_odp_get - Create a umem_odp for a userspace va
 *
 * @udata: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @ops: MMU interval ops, currently only used to implement the invalidate
 *       method
 *
 * The driver should use this function when the access flags indicate ODP
 * memory. It avoids pinning; instead, it stores the mm for future page fault
 * handling in conjunction with MMU notifiers.
 */
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
				    size_t size, int access,
				    const struct mmu_interval_notifier_ops *ops)
{
	struct ib_umem_odp *umem_odp;
	struct ib_ucontext *context;
	struct mm_struct *mm;
	int ret;

	if (!udata)
		return ERR_PTR(-EIO);

	context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
			  ->context;
	if (!context)
		return ERR_PTR(-EIO);

	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);

	umem_odp->umem.ibdev = context->device;
	umem_odp->umem.length = size;
	umem_odp->umem.address = addr;
	umem_odp->umem.writable = ib_access_writable(access);
	umem_odp->umem.owning_mm = mm = current->mm;
	umem_odp->notifier.ops = ops;

	umem_odp->page_shift = PAGE_SHIFT;
	if (access & IB_ACCESS_HUGETLB) {
		struct vm_area_struct *vma;
		struct hstate *h;

		down_read(&mm->mmap_sem);
		vma = find_vma(mm, ib_umem_start(umem_odp));
		if (!vma || !is_vm_hugetlb_page(vma)) {
			up_read(&mm->mmap_sem);
			ret = -EINVAL;
			goto err_free;
		}
		h = hstate_vma(vma);
		umem_odp->page_shift = huge_page_shift(h);
		up_read(&mm->mmap_sem);
	}

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, ops);
	if (ret)
		goto err_put_pid;
	return umem_odp;

err_put_pid:
	put_pid(umem_odp->tgid);
err_free:
	kfree(umem_odp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_get);

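/*
 * Usage sketch (illustrative only, not part of this file): in a driver's
 * reg_user_mr path, ODP registration might look like the following.
 * "my_mmu_interval_ops" is a hypothetical mmu_interval_notifier_ops
 * implementation whose invalidate method unmaps the affected range.
 *
 *	struct ib_umem_odp *odp;
 *
 *	if (access_flags & IB_ACCESS_ON_DEMAND) {
 *		odp = ib_umem_odp_get(udata, start, length, access_flags,
 *				      &my_mmu_interval_ops);
 *		if (IS_ERR(odp))
 *			return ERR_CAST(odp);
 *		mr->umem = &odp->umem;
 *	}
 */
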
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	if (!umem_odp->is_implicit_odp) {
		mutex_lock(&umem_odp->umem_mutex);
		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
					    ib_umem_end(umem_odp));
		mutex_unlock(&umem_odp->umem_mutex);
		mmu_interval_notifier_remove(&umem_odp->notifier);
		kvfree(umem_odp->dma_list);
		kvfree(umem_odp->page_list);
		put_pid(umem_odp->tgid);
	}
	kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);

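/*
 * Teardown sketch (illustrative only, not part of this file): the driver must
 * fence hardware access before releasing the umem. "destroy_mr_hw_object" is
 * a hypothetical stand-in for whatever device-specific command revokes the MR
 * so that no further DMA can target the region.
 *
 *	destroy_mr_hw_object(dev, mr);
 *	ib_umem_odp_release(to_ib_umem_odp(mr->umem));
 */
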
/*
 * Map for DMA and insert a single page into the on-demand paging page tables.
 *
 * @umem_odp: the umem to insert the page into.
 * @page_index: index in the umem to add the page to.
 * @page: the page struct to map and add.
 * @access_mask: access permissions needed for this page.
 * @current_seq: sequence number for synchronization with invalidations.
 *               The sequence number is read by the caller with
 *               mmu_interval_read_begin() on umem_odp->notifier.
 *
 * The function returns -EFAULT if the DMA mapping operation fails. It returns
 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
 *
 * The page is released via put_user_page even if the operation failed. For
 * on-demand pinning, the page is released whenever it isn't stored in the
 * umem.
 */
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem_odp *umem_odp,
		unsigned int page_index,
		struct page *page,
		u64 access_mask,
		unsigned long current_seq)
{
	struct ib_device *dev = umem_odp->umem.ibdev;
	dma_addr_t dma_addr;
	int ret = 0;

	if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) {
		ret = -EAGAIN;
		goto out;
	}
	if (!(umem_odp->dma_list[page_index])) {
		dma_addr =
			ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift),
					DMA_BIDIRECTIONAL);
		if (ib_dma_mapping_error(dev, dma_addr)) {
			ret = -EFAULT;
			goto out;
		}
		umem_odp->dma_list[page_index] = dma_addr | access_mask;
		umem_odp->page_list[page_index] = page;
		umem_odp->npages++;
	} else if (umem_odp->page_list[page_index] == page) {
		umem_odp->dma_list[page_index] |= access_mask;
	} else {
		/*
		 * There is a race here, where we could have done:
		 *
		 *         CPU0                             CPU1
		 *   get_user_pages()
		 *                                       invalidate()
		 *                                       page_fault()
		 *   mutex_lock(umem_mutex)
		 *    page from GUP != page in ODP
		 *
		 * It should be prevented by the retry test above, as reading
		 * the seq number should be reliable under the
		 * umem_mutex. Thus something is really not working right if
		 * things get here.
		 */
		WARN(true,
		     "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
		     umem_odp->page_list[page_index], page);
		ret = -EAGAIN;
	}

out:
	put_user_page(page);
	return ret;
}

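/*
 * Note on the dma_list encoding (informational sketch): each dma_list entry
 * packs the DMA address together with the ODP access bits, so code inspecting
 * an entry would split it roughly as follows. The variable names here are
 * only for illustration.
 *
 *	dma_addr_t entry = umem_odp->dma_list[idx];
 *	dma_addr_t dma = entry & ODP_DMA_ADDR_MASK;
 *	bool writable = entry & ODP_WRITE_ALLOWED_BIT;
 */
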
/**
 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
 *
 * Pins the range of pages passed in the argument, and maps them to
 * DMA addresses. The DMA addresses of the mapped pages are updated in
 * umem_odp->dma_list.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
 * the function from completing its task.
 * An -ENOENT error code indicates that the userspace process is being
 * terminated and its mm was already destroyed.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @current_seq: the MMU notifier sequence value for synchronization with
 *               invalidations. The sequence number is read with
 *               mmu_interval_read_begin() before calling this function.
 */
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
			      u64 bcnt, u64 access_mask,
			      unsigned long current_seq)
{
	struct task_struct *owning_process  = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	struct page       **local_page_list = NULL;
	u64 page_mask, off;
	int j, k, ret = 0, start_idx, npages = 0;
	unsigned int flags = 0, page_shift;
	phys_addr_t p = 0;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;

	local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
	if (!local_page_list)
		return -ENOMEM;

	page_shift = umem_odp->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	off = user_virt & (~page_mask);
	user_virt = user_virt & page_mask;
	bcnt += off; /* Charge for the first page offset as well. */

	/*
	 * owning_process is allowed to be NULL; this means the mm is somehow
	 * living beyond the lifetime of the originating process. Presumably
	 * mmget_not_zero() will fail in this case.
	 */
	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
	}

	if (access_mask & ODP_WRITE_ALLOWED_BIT)
		flags |= FOLL_WRITE;

	start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift;
	k = start_idx;

	while (bcnt > 0) {
		const size_t gup_num_pages = min_t(size_t,
				(bcnt + BIT(page_shift) - 1) >> page_shift,
				PAGE_SIZE / sizeof(struct page *));

		down_read(&owning_mm->mmap_sem);
		/*
		 * Note: this might result in redundant page getting. We can
		 * avoid this by checking dma_list for 0 before calling
		 * get_user_pages. However, this makes the code much more
		 * complex (and doesn't gain us much performance in most use
		 * cases).
		 */
		npages = get_user_pages_remote(owning_process, owning_mm,
				user_virt, gup_num_pages,
				flags, local_page_list, NULL, NULL);
		up_read(&owning_mm->mmap_sem);

		if (npages < 0) {
			if (npages != -EAGAIN)
				pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
			else
				pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages);
			break;
		}

		bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
		mutex_lock(&umem_odp->umem_mutex);
		for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) {
			if (user_virt & ~page_mask) {
				p += PAGE_SIZE;
				if (page_to_phys(local_page_list[j]) != p) {
					ret = -EFAULT;
					break;
				}
				put_user_page(local_page_list[j]);
				continue;
			}

			ret = ib_umem_odp_map_dma_single_page(
					umem_odp, k, local_page_list[j],
					access_mask, current_seq);
			if (ret < 0) {
				if (ret != -EAGAIN)
					pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
				else
					pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
				break;
			}

			p = page_to_phys(local_page_list[j]);
			k++;
		}
		mutex_unlock(&umem_odp->umem_mutex);

		if (ret < 0) {
			/*
			 * Release pages, remembering that the first page
			 * to hit an error was already released by
			 * ib_umem_odp_map_dma_single_page().
			 */
			if (npages - (j + 1) > 0)
				put_user_pages(&local_page_list[j+1],
					       npages - (j + 1));
			break;
		}
	}

	if (ret >= 0) {
		if (npages < 0 && k == start_idx)
			ret = npages;
		else
			ret = k - start_idx;
	}

	mmput(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	free_page((unsigned long)local_page_list);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);

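/*
 * Caller-side sketch (illustrative only, not part of this file): a driver
 * page fault handler pairs this function with the mmu interval notifier
 * sequence count, retrying on -EAGAIN. "update_device_pagetable" is a
 * hypothetical, device-specific step that installs the new translations.
 *
 *	unsigned long seq;
 *	int npages;
 *
 *	seq = mmu_interval_read_begin(&odp->notifier);
 *	npages = ib_umem_odp_map_dma_pages(odp, user_va, bcnt,
 *					   ODP_READ_ALLOWED_BIT |
 *					   ODP_WRITE_ALLOWED_BIT, seq);
 *	if (npages < 0)
 *		return npages;	// -EAGAIN simply means: retry the fault
 *
 *	mutex_lock(&odp->umem_mutex);
 *	if (!mmu_interval_read_retry(&odp->notifier, seq))
 *		update_device_pagetable(mr, odp);
 *	mutex_unlock(&odp->umem_mutex);
 */
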
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	int idx;
	u64 addr;
	struct ib_device *dev = umem_odp->umem.ibdev;

	lockdep_assert_held(&umem_odp->umem_mutex);

	virt = max_t(u64, virt, ib_umem_start(umem_odp));
	bound = min_t(u64, bound, ib_umem_end(umem_odp));
	/*
	 * Note that during the run of this function, racing faults cannot
	 * complete: the caller holds umem_mutex and the fault path rechecks
	 * the notifier sequence count under that mutex. We might be racing
	 * with other invalidations, so we must make sure we free each page
	 * only once.
	 */
	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		if (umem_odp->page_list[idx]) {
			struct page *page = umem_odp->page_list[idx];
			dma_addr_t dma = umem_odp->dma_list[idx];
			dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;

			WARN_ON(!dma_addr);

			ib_dma_unmap_page(dev, dma_addr,
					  BIT(umem_odp->page_shift),
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
				 */
				set_page_dirty(head_page);
			}
			umem_odp->page_list[idx] = NULL;
			umem_odp->dma_list[idx] = 0;
			umem_odp->npages--;
		}
	}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

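/*
 * Invalidate-side sketch (illustrative only, not part of this file): the
 * mmu_interval_notifier_ops passed to ib_umem_odp_get() would typically bump
 * the sequence count and unmap under umem_mutex. "invalidate_device_range"
 * is a hypothetical, driver-specific step that must revoke the device's
 * access to the range before the pages are unmapped.
 *
 *	static bool my_invalidate(struct mmu_interval_notifier *mni,
 *				  const struct mmu_notifier_range *range,
 *				  unsigned long cur_seq)
 *	{
 *		struct ib_umem_odp *odp =
 *			container_of(mni, struct ib_umem_odp, notifier);
 *
 *		if (!mmu_notifier_range_blockable(range))
 *			return false;
 *
 *		mutex_lock(&odp->umem_mutex);
 *		mmu_interval_set_seq(mni, cur_seq);
 *		invalidate_device_range(odp, range->start, range->end);
 *		ib_umem_odp_unmap_dma_pages(odp, range->start, range->end);
 *		mutex_unlock(&odp->umem_mutex);
 *		return true;
 *	}
 */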