linux/drivers/infiniband/core/umem.c
/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/count_zeros.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
        struct sg_page_iter sg_iter;
        struct page *page;

        if (umem->nmap > 0)
                ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
                                DMA_BIDIRECTIONAL);

        for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
                page = sg_page_iter_page(&sg_iter);
                unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
        }

        sg_free_table(&umem->sg_head);
}

/**
 * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
 *
 * @umem: umem struct
 * @pgsz_bitmap: bitmap of HW supported page sizes
 * @virt: IOVA
 *
 * This helper is intended for HW that supports multiple page
 * sizes but can do only a single page size in an MR.
 *
 * Returns 0 if the umem cannot be mapped using the page sizes the
 * driver supports. Drivers that always support PAGE_SIZE or smaller
 * will never see a 0 result.
 */
unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
                                     unsigned long pgsz_bitmap,
                                     unsigned long virt)
{
        struct scatterlist *sg;
        unsigned long va, pgoff;
        dma_addr_t mask;
        int i;

        /* rdma_for_each_block() has a bug if the page size is smaller than the
         * page size used to build the umem. For now prevent smaller page sizes
         * from being returned.
         */
        pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT);

        /* At minimum, drivers must support PAGE_SIZE or smaller */
        if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
                return 0;

        umem->iova = va = virt;
        /* The best result is the smallest page size that results in the minimum
         * number of required pages. Compute the largest page size that could
         * work based on VA address bits that don't change.
         */
        mask = pgsz_bitmap &
               GENMASK(BITS_PER_LONG - 1,
                       bits_per((umem->length - 1 + virt) ^ virt));
        /* offset into first SGL */
        pgoff = umem->address & ~PAGE_MASK;

        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
                /* Walk SGL and reduce max page size if VA/PA bits differ
                 * for any address.
                 */
                mask |= (sg_dma_address(sg) + pgoff) ^ va;
                va += sg_dma_len(sg) - pgoff;
                /* Except for the last entry, the ending iova alignment sets
                 * the maximum possible page size as the low bits of the iova
                 * must be zero when starting the next chunk.
                 */
                if (i != (umem->nmap - 1))
                        mask |= va;
                pgoff = 0;
        }

        /* The mask accumulates 1's in each position where the VA and physical
         * address differ, thus the number of trailing zeros determines the
         * largest page size that can pass the VA through to the physical
         * address.
         */
        if (mask)
                pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
        return rounddown_pow_of_two(pgsz_bitmap);
}
EXPORT_SYMBOL(ib_umem_find_best_pgsz);

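/*
 * Illustrative sketch (not part of this file): how a driver might consume
 * ib_umem_find_best_pgsz(). The helper name example_build_page_list(), the
 * page-size bitmap and the flat "pas" array are hypothetical; the
 * rdma_for_each_block()/rdma_block_iter_dma_address() iterators from
 * <rdma/ib_verbs.h> are what drivers typically pair with this helper.
 */
static int example_build_page_list(struct ib_umem *umem, u64 iova,
                                   dma_addr_t *pas, unsigned long *out_pgsz)
{
        /* Hypothetical bitmap of HW-supported page sizes: 4K, 64K, 2M. */
        unsigned long pgsz_bitmap = SZ_4K | SZ_64K | SZ_2M;
        struct ib_block_iter biter;
        unsigned long pgsz;

        pgsz = ib_umem_find_best_pgsz(umem, pgsz_bitmap, iova);
        if (!pgsz)
                return -EINVAL; /* umem cannot be expressed in these sizes */

        /* Emit one aligned DMA address per HW page of size pgsz. */
        rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, pgsz)
                *pas++ = rdma_block_iter_dma_address(&biter);

        *out_pgsz = pgsz;
        return 0;
}
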
/**
 * ib_umem_get - Pin and DMA map userspace memory.
 *
 * @device: IB device to connect UMEM
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 */
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
                            size_t size, int access)
{
        struct ib_umem *umem;
        struct page **page_list;
        unsigned long lock_limit;
        unsigned long new_pinned;
        unsigned long cur_base;
        unsigned long dma_attr = 0;
        struct mm_struct *mm;
        unsigned long npages;
        int ret;
        struct scatterlist *sg = NULL;
        unsigned int gup_flags = FOLL_WRITE;

        /*
         * If the combination of the addr and size requested for this memory
         * region causes an integer overflow, return error.
         */
        if (((addr + size) < addr) ||
            PAGE_ALIGN(addr + size) < (addr + size))
                return ERR_PTR(-EINVAL);

        if (!can_do_mlock())
                return ERR_PTR(-EPERM);

        if (access & IB_ACCESS_ON_DEMAND)
                return ERR_PTR(-EOPNOTSUPP);

        umem = kzalloc(sizeof(*umem), GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);
        umem->ibdev      = device;
        umem->length     = size;
        umem->address    = addr;
        /*
         * Drivers should call ib_umem_find_best_pgsz() to set the iova
         * correctly.
         */
        umem->iova = addr;
        umem->writable   = ib_access_writable(access);
        umem->owning_mm = mm = current->mm;
        mmgrab(mm);

        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
                ret = -ENOMEM;
                goto umem_kfree;
        }

        npages = ib_umem_num_pages(umem);
        if (npages == 0 || npages > UINT_MAX) {
                ret = -EINVAL;
                goto out;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
        if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
                atomic64_sub(npages, &mm->pinned_vm);
                ret = -ENOMEM;
                goto out;
        }

        cur_base = addr & PAGE_MASK;

        if (!umem->writable)
                gup_flags |= FOLL_FORCE;

        while (npages) {
                cond_resched();
                ret = pin_user_pages_fast(cur_base,
                                          min_t(unsigned long, npages,
                                                PAGE_SIZE /
                                                sizeof(struct page *)),
                                          gup_flags | FOLL_LONGTERM, page_list);
                if (ret < 0)
                        goto umem_release;

                cur_base += ret * PAGE_SIZE;
                npages -= ret;
                sg = __sg_alloc_table_from_pages(
                        &umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
                        dma_get_max_seg_size(device->dma_device), sg, npages,
                        GFP_KERNEL);
                umem->sg_nents = umem->sg_head.nents;
                if (IS_ERR(sg)) {
                        unpin_user_pages_dirty_lock(page_list, ret, 0);
                        ret = PTR_ERR(sg);
                        goto umem_release;
                }
        }

        if (access & IB_ACCESS_RELAXED_ORDERING)
                dma_attr |= DMA_ATTR_WEAK_ORDERING;

        umem->nmap =
                ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
                                    DMA_BIDIRECTIONAL, dma_attr);

        if (!umem->nmap) {
                ret = -ENOMEM;
                goto umem_release;
        }

        ret = 0;
        goto out;

umem_release:
        __ib_umem_release(device, umem, 0);
        atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
        free_page((unsigned long) page_list);
umem_kfree:
        if (ret) {
                mmdrop(umem->owning_mm);
                kfree(umem);
        }
        return ret ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
        if (!umem)
                return;
        if (umem->is_odp)
                return ib_umem_odp_release(to_ib_umem_odp(umem));

        __ib_umem_release(umem->ibdev, umem, 1);

        atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
        mmdrop(umem->owning_mm);
        kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

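/*
 * Illustrative sketch (not part of this file): the pin/unpin lifecycle a
 * driver typically follows around ib_umem_get()/ib_umem_release(). The
 * example_pin_user_region()/example_unpin_user_region() names are
 * hypothetical; only the ib_umem_* calls are real.
 */
static struct ib_umem *example_pin_user_region(struct ib_pd *pd, u64 start,
                                               u64 length, int access)
{
        struct ib_umem *umem;

        /* Pins the pages and DMA maps them; returns an ERR_PTR() on failure. */
        umem = ib_umem_get(pd->device, start, length, access);
        if (IS_ERR(umem))
                return umem;

        /* ... program HW translation entries from umem->sg_head.sgl here ... */
        return umem;
}

static void example_unpin_user_region(struct ib_umem *umem)
{
        /* Unmaps, unpins (dirtying writable pages) and frees the umem. */
        ib_umem_release(umem);
}
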
/*
 * Copy from the given ib_umem's pages to the given buffer.
 *
 * umem - the umem to copy from
 * offset - offset to start copying from
 * dst - destination buffer
 * length - buffer length
 *
 * Returns 0 on success, or an error code.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
                      size_t length)
{
        size_t end = offset + length;
        int ret;

        if (offset > umem->length || length > umem->length - offset) {
                pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
                       offset, umem->length, end);
                return -EINVAL;
        }

        ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
                                 offset + ib_umem_offset(umem));

        if (ret < 0)
                return ret;
        else if (ret != length)
                return -EINVAL;
        else
                return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);

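/*
 * Illustrative sketch (not part of this file): reading the first bytes of a
 * pinned region into a kernel buffer with ib_umem_copy_from(). The buffer
 * size and the example_read_header() name are hypothetical.
 */
static int example_read_header(struct ib_umem *umem)
{
        u8 header[64];

        /* Copies 64 bytes starting at offset 0 of the user region. */
        return ib_umem_copy_from(header, umem, 0, sizeof(header));
}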