linux/drivers/infiniband/core/umem.c
/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020 Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/count_zeros.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
        bool make_dirty = umem->writable && dirty;
        struct scatterlist *sg;
        unsigned int i;

        if (dirty)
                ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
                                           DMA_BIDIRECTIONAL, 0);

        for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i)
                unpin_user_page_range_dirty_lock(sg_page(sg),
                        DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);

        sg_free_append_table(&umem->sgt_append);
}

/**
 * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
 *
 * @umem: umem struct
 * @pgsz_bitmap: bitmap of HW supported page sizes
 * @virt: IOVA
 *
 * This helper is intended for HW that supports multiple page
 * sizes but can use only a single page size per MR.
 *
 * Returns 0 if mapping the umem would require page sizes the
 * driver does not support. Drivers that always support PAGE_SIZE
 * or smaller will never see a 0 result.
 */
unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
                                     unsigned long pgsz_bitmap,
                                     unsigned long virt)
{
        struct scatterlist *sg;
        unsigned long va, pgoff;
        dma_addr_t mask;
        int i;

        if (umem->is_odp) {
                unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift);

                /* ODP must always be self consistent. */
                if (!(pgsz_bitmap & page_size))
                        return 0;
                return page_size;
        }

        /* rdma_for_each_block() has a bug if the page size is smaller than the
         * page size used to build the umem. For now prevent smaller page sizes
         * from being returned.
         */
        pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT);

        umem->iova = va = virt;
        /* The best result is the smallest page size that results in the minimum
         * number of required pages. Compute the largest page size that could
         * work based on VA address bits that don't change.
         */
        mask = pgsz_bitmap &
               GENMASK(BITS_PER_LONG - 1,
                       bits_per((umem->length - 1 + virt) ^ virt));
        /* offset into first SGL */
        pgoff = umem->address & ~PAGE_MASK;

        for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
                /* Walk SGL and reduce max page size if VA/PA bits differ
                 * for any address.
                 */
                mask |= (sg_dma_address(sg) + pgoff) ^ va;
                va += sg_dma_len(sg) - pgoff;
                /* Except for the last entry, the ending iova alignment sets
                 * the maximum possible page size as the low bits of the iova
                 * must be zero when starting the next chunk.
                 */
                if (i != (umem->sgt_append.sgt.nents - 1))
                        mask |= va;
                pgoff = 0;
        }

        /* The mask accumulates 1's in each position where the VA and physical
         * address differ, thus the length of trailing 0 is the largest page
         * size that can pass the VA through to the physical.
         */
        if (mask)
                pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
        return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0;
}
EXPORT_SYMBOL(ib_umem_find_best_pgsz);
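
/*
 * Illustrative sketch (not part of the original file): one way a driver
 * might consume the page size returned by ib_umem_find_best_pgsz().  The
 * page-size bitmap and the mr_fill_pas() helper are hypothetical; the
 * iterators rdma_umem_for_each_dma_block() and rdma_block_iter_dma_address()
 * are the in-tree helpers from <rdma/ib_umem.h> and <rdma/ib_verbs.h>.
 *
 *      unsigned long pgsz;
 *      struct ib_block_iter biter;
 *
 *      pgsz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G, iova);
 *      if (!pgsz)
 *              return -EINVAL; // no supported page size can map this umem
 *
 *      // Walk the umem in pgsz-sized DMA blocks and hand each block
 *      // address to the (hypothetical) HW translation-table writer.
 *      rdma_umem_for_each_dma_block(umem, &biter, pgsz)
 *              mr_fill_pas(mr, rdma_block_iter_dma_address(&biter));
 */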

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 *
 * @device: IB device to connect UMEM
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 */
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
                            size_t size, int access)
{
        struct ib_umem *umem;
        struct page **page_list;
        unsigned long lock_limit;
        unsigned long new_pinned;
        unsigned long cur_base;
        unsigned long dma_attr = 0;
        struct mm_struct *mm;
        unsigned long npages;
        int pinned, ret;
        unsigned int gup_flags = FOLL_WRITE;

        /*
         * If the combination of the addr and size requested for this memory
         * region causes an integer overflow, return error.
         */
        if (((addr + size) < addr) ||
            PAGE_ALIGN(addr + size) < (addr + size))
                return ERR_PTR(-EINVAL);

        if (!can_do_mlock())
                return ERR_PTR(-EPERM);

        if (access & IB_ACCESS_ON_DEMAND)
                return ERR_PTR(-EOPNOTSUPP);

        umem = kzalloc(sizeof(*umem), GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);
        umem->ibdev      = device;
        umem->length     = size;
        umem->address    = addr;
        /*
         * Drivers should call ib_umem_find_best_pgsz() to set the iova
         * correctly.
         */
        umem->iova = addr;
        umem->writable   = ib_access_writable(access);
        umem->owning_mm = mm = current->mm;
        mmgrab(mm);

        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
                ret = -ENOMEM;
                goto umem_kfree;
        }

        npages = ib_umem_num_pages(umem);
        if (npages == 0 || npages > UINT_MAX) {
                ret = -EINVAL;
                goto out;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
        if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
                atomic64_sub(npages, &mm->pinned_vm);
                ret = -ENOMEM;
                goto out;
        }

        cur_base = addr & PAGE_MASK;

        if (!umem->writable)
                gup_flags |= FOLL_FORCE;

        while (npages) {
                cond_resched();
                pinned = pin_user_pages_fast(cur_base,
                                          min_t(unsigned long, npages,
                                                PAGE_SIZE /
                                                sizeof(struct page *)),
                                          gup_flags | FOLL_LONGTERM, page_list);
                if (pinned < 0) {
                        ret = pinned;
                        goto umem_release;
                }

                cur_base += pinned * PAGE_SIZE;
                npages -= pinned;
                ret = sg_alloc_append_table_from_pages(
                        &umem->sgt_append, page_list, pinned, 0,
                        pinned << PAGE_SHIFT, ib_dma_max_seg_size(device),
                        npages, GFP_KERNEL);
                if (ret) {
                        unpin_user_pages_dirty_lock(page_list, pinned, 0);
                        goto umem_release;
                }
        }

        if (access & IB_ACCESS_RELAXED_ORDERING)
                dma_attr |= DMA_ATTR_WEAK_ORDERING;

        ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt,
                                       DMA_BIDIRECTIONAL, dma_attr);
        if (ret)
                goto umem_release;
        goto out;

umem_release:
        __ib_umem_release(device, umem, 0);
        atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
        free_page((unsigned long) page_list);
umem_kfree:
        if (ret) {
                mmdrop(umem->owning_mm);
                kfree(umem);
        }
        return ret ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);
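
/*
 * Illustrative sketch (not part of the original file): the usual lifetime of
 * a umem in a driver's reg_user_mr path.  The hw_mr_map() helper and the mr
 * object are hypothetical; ib_umem_num_dma_blocks() and ib_umem_release()
 * are real helpers from <rdma/ib_umem.h>.
 *
 *      struct ib_umem *umem;
 *      size_t nblocks;
 *      int err;
 *
 *      umem = ib_umem_get(ibdev, start, length, access_flags);
 *      if (IS_ERR(umem))
 *              return PTR_ERR(umem);
 *
 *      // Size the HW translation table from the chosen DMA block size.
 *      nblocks = ib_umem_num_dma_blocks(umem, page_size);
 *
 *      err = hw_mr_map(mr, umem, page_size, nblocks);
 *      if (err) {
 *              ib_umem_release(umem); // unpins and unmaps on failure
 *              return err;
 *      }
 */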

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
        if (!umem)
                return;
        if (umem->is_dmabuf)
                return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
        if (umem->is_odp)
                return ib_umem_odp_release(to_ib_umem_odp(umem));

        __ib_umem_release(umem->ibdev, umem, 1);

        atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
        mmdrop(umem->owning_mm);
        kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

/*
 * Copy from the given ib_umem's pages to the given buffer.
 *
 * umem - the umem to copy from
 * offset - offset to start copying from
 * dst - destination buffer
 * length - buffer length
 *
 * Returns 0 on success, or an error code.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
                      size_t length)
{
        size_t end = offset + length;
        int ret;

        if (offset > umem->length || length > umem->length - offset) {
                pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n",
                       __func__, offset, umem->length, end);
                return -EINVAL;
        }

        ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl,
                                 umem->sgt_append.sgt.orig_nents, dst, length,
                                 offset + ib_umem_offset(umem));

        if (ret < 0)
                return ret;
        else if (ret != length)
                return -EINVAL;
        else
                return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);
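
/*
 * Illustrative sketch (not part of the original file): copying the first
 * bytes of a registered region into a kernel buffer, as a driver might do
 * when it needs to inspect user-initialized data.  The 64-byte size is an
 * arbitrary choice for the example.
 *
 *      u8 hdr[64];
 *      int err;
 *
 *      err = ib_umem_copy_from(hdr, umem, 0, sizeof(hdr));
 *      if (err)
 *              return err;
 */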