linux/drivers/infiniband/core/umem.c
/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

/*
 * Undo the pinning and mapping done by ib_umem_get() for a non-ODP umem:
 * DMA unmap the scatterlist, unpin every page (marking it dirty when the
 * umem is writable and @dirty is set) and free the scatter table.
 */
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
        struct sg_page_iter sg_iter;
        struct page *page;

        if (umem->nmap > 0)
                ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
                                DMA_BIDIRECTIONAL);

        for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
                page = sg_page_iter_page(&sg_iter);
                if (umem->writable && dirty)
                        put_user_pages_dirty_lock(&page, 1);
                else
                        put_user_page(page);
        }

        sg_free_table(&umem->sg_head);
}

/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
 *
 * sg: current scatterlist entry
 * page_list: array of struct page pointers (npages entries)
 * npages: number of pages in page_list
 * max_seg_sz: maximum segment size in bytes
 * nents: [out] number of entries in the scatterlist
 *
 * Return new end of scatterlist
 */
static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
                                                struct page **page_list,
                                                unsigned long npages,
                                                unsigned int max_seg_sz,
                                                int *nents)
{
        unsigned long first_pfn;
        unsigned long i = 0;
        bool update_cur_sg = false;
        bool first = !sg_page(sg);

        /* Check if new page_list is contiguous with end of previous page_list.
         * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
         */
        if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
                       page_to_pfn(page_list[0])))
                update_cur_sg = true;

        while (i != npages) {
                unsigned long len;
                struct page *first_page = page_list[i];

                first_pfn = page_to_pfn(first_page);

                /* Compute the number of contiguous pages we have starting
                 * at i
                 */
                for (len = 0; i != npages &&
                              first_pfn + len == page_to_pfn(page_list[i]) &&
                              len < (max_seg_sz >> PAGE_SHIFT);
                     len++)
                        i++;

                /* Squash N contiguous pages from page_list into current sge */
                if (update_cur_sg) {
                        if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
                                sg_set_page(sg, sg_page(sg),
                                            sg->length + (len << PAGE_SHIFT),
                                            0);
                                update_cur_sg = false;
                                continue;
                        }
                        update_cur_sg = false;
                }

                /* Squash N contiguous pages into next sge or first sge */
                if (!first)
                        sg = sg_next(sg);

                (*nents)++;
                sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
                first = false;
        }

        return sg;
}

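/*
 * Illustrative calling pattern (a sketch, not part of the upstream file),
 * mirroring how ib_umem_get() below drives this helper: the scatter table
 * must already be allocated, and sg_mark_end() is called on the returned
 * entry once all pages have been added. "nr" stands in for the number of
 * pages returned by each get_user_pages() call.
 *
 *      sg = umem->sg_head.sgl;
 *      while (npages) {
 *              nr = get_user_pages(cur_base, ..., page_list, NULL);
 *              sg = ib_umem_add_sg_table(sg, page_list, nr,
 *                      dma_get_max_seg_size(device->dma_device),
 *                      &umem->sg_nents);
 *              npages -= nr;
 *      }
 *      sg_mark_end(sg);
 */
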
/**
 * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
 *
 * @umem: umem struct
 * @pgsz_bitmap: bitmap of HW supported page sizes
 * @virt: IOVA
 *
 * This helper is intended for HW that supports multiple page
 * sizes but can do only a single page size in an MR.
 *
 * Returns 0 if the umem cannot be mapped using only page sizes
 * supported by the driver. Drivers that always support PAGE_SIZE
 * or smaller will never see a 0 result.
 */
unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
                                     unsigned long pgsz_bitmap,
                                     unsigned long virt)
{
        struct scatterlist *sg;
        unsigned int best_pg_bit;
        unsigned long va, pgoff;
        dma_addr_t mask;
        int i;

        /* At minimum, drivers must support PAGE_SIZE or smaller */
        if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
                return 0;

        va = virt;
        /* max page size not to exceed MR length */
        mask = roundup_pow_of_two(umem->length);
        /* offset into first SGL */
        pgoff = umem->address & ~PAGE_MASK;

        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
                /* Walk SGL and reduce max page size if VA/PA bits differ
                 * for any address.
                 */
                mask |= (sg_dma_address(sg) + pgoff) ^ va;
                if (i && i != (umem->nmap - 1))
                        /* restrict by length as well for interior SGEs */
                        mask |= sg_dma_len(sg);
                va += sg_dma_len(sg) - pgoff;
                pgoff = 0;
        }
        best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap);

        return BIT_ULL(best_pg_bit);
}
EXPORT_SYMBOL(ib_umem_find_best_pgsz);
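
/*
 * Example (an illustrative sketch, not part of the upstream file): a driver
 * whose HW supports 4K, 2M and 1G MR page sizes could pick the largest size
 * usable for a freshly pinned and mapped umem like this. The SZ_* constants
 * come from <linux/sizes.h>; "virt_addr" and "page_size" are the caller's
 * variables.
 *
 *      page_size = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G,
 *                                         virt_addr);
 *      if (!page_size)
 *              return -EINVAL;
 */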

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 *
 * If access flags indicate ODP memory, the pages are not pinned; instead,
 * the mm is stored for future page fault handling in conjunction with MMU
 * notifiers.
 *
 * @udata: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @dmasync: flush in-flight DMA when the memory region is written
 */
struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
                            size_t size, int access, int dmasync)
{
        struct ib_ucontext *context;
        struct ib_umem *umem;
        struct page **page_list;
        unsigned long lock_limit;
        unsigned long new_pinned;
        unsigned long cur_base;
        struct mm_struct *mm;
        unsigned long npages;
        int ret;
        unsigned long dma_attrs = 0;
        struct scatterlist *sg;
        unsigned int gup_flags = FOLL_WRITE;

        if (!udata)
                return ERR_PTR(-EIO);

        context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
                          ->context;
        if (!context)
                return ERR_PTR(-EIO);

        if (dmasync)
                dma_attrs |= DMA_ATTR_WRITE_BARRIER;

        /*
         * If the combination of the addr and size requested for this memory
         * region causes an integer overflow, return error.
         */
        if (((addr + size) < addr) ||
            PAGE_ALIGN(addr + size) < (addr + size))
                return ERR_PTR(-EINVAL);

        if (!can_do_mlock())
                return ERR_PTR(-EPERM);

        if (access & IB_ACCESS_ON_DEMAND) {
                umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
                if (!umem)
                        return ERR_PTR(-ENOMEM);
                umem->is_odp = 1;
        } else {
                umem = kzalloc(sizeof(*umem), GFP_KERNEL);
                if (!umem)
                        return ERR_PTR(-ENOMEM);
        }

        umem->context    = context;
        umem->length     = size;
        umem->address    = addr;
        umem->writable   = ib_access_writable(access);
        umem->owning_mm = mm = current->mm;
        mmgrab(mm);

        if (access & IB_ACCESS_ON_DEMAND) {
                if (WARN_ON_ONCE(!context->invalidate_range)) {
                        ret = -EINVAL;
                        goto umem_kfree;
                }

                ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
                if (ret)
                        goto umem_kfree;
                return umem;
        }

        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
                ret = -ENOMEM;
                goto umem_kfree;
        }

        npages = ib_umem_num_pages(umem);
        if (npages == 0 || npages > UINT_MAX) {
                ret = -EINVAL;
                goto out;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
        if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
                atomic64_sub(npages, &mm->pinned_vm);
                ret = -ENOMEM;
                goto out;
        }

        cur_base = addr & PAGE_MASK;

        ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
        if (ret)
                goto vma;

        if (!umem->writable)
                gup_flags |= FOLL_FORCE;

        sg = umem->sg_head.sgl;

        while (npages) {
                down_read(&mm->mmap_sem);
                ret = get_user_pages(cur_base,
                                     min_t(unsigned long, npages,
                                           PAGE_SIZE / sizeof (struct page *)),
                                     gup_flags | FOLL_LONGTERM,
                                     page_list, NULL);
                if (ret < 0) {
                        up_read(&mm->mmap_sem);
                        goto umem_release;
                }

                cur_base += ret * PAGE_SIZE;
                npages   -= ret;

                sg = ib_umem_add_sg_table(sg, page_list, ret,
                        dma_get_max_seg_size(context->device->dma_device),
                        &umem->sg_nents);

                up_read(&mm->mmap_sem);
        }

        sg_mark_end(sg);

        umem->nmap = ib_dma_map_sg_attrs(context->device,
                                  umem->sg_head.sgl,
                                  umem->sg_nents,
                                  DMA_BIDIRECTIONAL,
                                  dma_attrs);

        if (!umem->nmap) {
                ret = -ENOMEM;
                goto umem_release;
        }

        ret = 0;
        goto out;

umem_release:
        __ib_umem_release(context->device, umem, 0);
vma:
        atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
        free_page((unsigned long) page_list);
umem_kfree:
        if (ret) {
                mmdrop(umem->owning_mm);
                kfree(umem);
        }
        return ret ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);
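
/*
 * Example (an illustrative sketch, not part of the upstream file): a driver's
 * ->reg_user_mr() handler typically pins the user buffer with ib_umem_get()
 * and drops it with ib_umem_release() on failure (and again at
 * deregistration). "struct my_mr" (embedding a struct ib_mr as "ibmr"),
 * "my_hw_map()" and "my_reg_user_mr()" are hypothetical names.
 *
 *      static struct ib_mr *my_reg_user_mr(struct ib_pd *pd, u64 start,
 *                                          u64 length, u64 virt_addr,
 *                                          int access_flags,
 *                                          struct ib_udata *udata)
 *      {
 *              struct my_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 *              int err;
 *
 *              if (!mr)
 *                      return ERR_PTR(-ENOMEM);
 *
 *              mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
 *              if (IS_ERR(mr->umem)) {
 *                      err = PTR_ERR(mr->umem);
 *                      goto err_free;
 *              }
 *
 *              err = my_hw_map(pd, mr, virt_addr);
 *              if (err)
 *                      goto err_umem;
 *
 *              return &mr->ibmr;
 *
 *      err_umem:
 *              ib_umem_release(mr->umem);
 *      err_free:
 *              kfree(mr);
 *              return ERR_PTR(err);
 *      }
 */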

/*
 * Common tail for ib_umem_release(): drop the mm reference taken in
 * ib_umem_get() and free the umem (or its enclosing ib_umem_odp).
 */
static void __ib_umem_release_tail(struct ib_umem *umem)
{
        mmdrop(umem->owning_mm);
        if (umem->is_odp)
                kfree(to_ib_umem_odp(umem));
        else
                kfree(umem);
}

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
        if (!umem)
                return;

        if (umem->is_odp) {
                ib_umem_odp_release(to_ib_umem_odp(umem));
                __ib_umem_release_tail(umem);
                return;
        }

        __ib_umem_release(umem->context->device, umem, 1);

        atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
        __ib_umem_release_tail(umem);
}
EXPORT_SYMBOL(ib_umem_release);
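
/*
 * Example (an illustrative sketch, not part of the upstream file): the
 * matching deregistration path for the hypothetical driver sketched after
 * ib_umem_get() above; "my_hw_unmap()" and "struct my_mr" are hypothetical.
 *
 *      static int my_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 *      {
 *              struct my_mr *mr = container_of(ibmr, struct my_mr, ibmr);
 *
 *              my_hw_unmap(mr);
 *              ib_umem_release(mr->umem);
 *              kfree(mr);
 *              return 0;
 *      }
 */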

/*
 * ib_umem_page_count - return the number of PAGE_SIZE pages covered by
 * the umem's DMA-mapped scatterlist.
 */
int ib_umem_page_count(struct ib_umem *umem)
{
        int i, n = 0;
        struct scatterlist *sg;

        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
                n += sg_dma_len(sg) >> PAGE_SHIFT;

        return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
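
/*
 * Example (an illustrative sketch, not part of the upstream file): drivers
 * commonly size a HW page table from this count before walking the umem;
 * "hw_pas" is a hypothetical array of device page addresses.
 *
 *      int npages = ib_umem_page_count(mr->umem);
 *      __le64 *hw_pas = kcalloc(npages, sizeof(*hw_pas), GFP_KERNEL);
 *
 *      if (!hw_pas)
 *              return -ENOMEM;
 */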

/*
 * Copy from the given ib_umem's pages to the given buffer.
 *
 * dst - destination buffer
 * umem - the umem to copy from
 * offset - offset to start copying from
 * length - buffer length
 *
 * Returns 0 on success, or a negative error code.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
                      size_t length)
{
        size_t end = offset + length;
        int ret;

        if (offset > umem->length || length > umem->length - offset) {
                pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
                       offset, umem->length, end);
                return -EINVAL;
        }

        ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
                                 offset + ib_umem_offset(umem));

        if (ret < 0)
                return ret;
        else if (ret != length)
                return -EINVAL;
        else
                return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);
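
/*
 * Example (an illustrative sketch, not part of the upstream file): a driver
 * can use ib_umem_copy_from() to read a small, umem-backed structure that
 * userspace filled in, without mapping it into the kernel; "struct
 * my_wq_header" and "hdr" are hypothetical.
 *
 *      struct my_wq_header hdr;
 *      int err;
 *
 *      err = ib_umem_copy_from(&hdr, umem, 0, sizeof(hdr));
 *      if (err)
 *              return err;
 */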