linux/drivers/infiniband/sw/rdmavt/mr.c
   1/*
   2 * Copyright(c) 2016 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47
  48#include <linux/slab.h>
  49#include <linux/vmalloc.h>
  50#include <rdma/ib_umem.h>
  51#include <rdma/rdma_vt.h>
  52#include "vt.h"
  53#include "mr.h"
  54#include "trace.h"
  55
  56/**
  57 * rvt_driver_mr_init - Init MR resources per driver
  58 * @rdi: rvt dev struct
  59 *
   60 * Do any initialization needed when a driver registers with rdmavt.
  61 *
  62 * Return: 0 on success or errno on failure
  63 */
  64int rvt_driver_mr_init(struct rvt_dev_info *rdi)
  65{
  66        unsigned int lkey_table_size = rdi->dparms.lkey_table_size;
  67        unsigned lk_tab_size;
  68        int i;
  69
  70        /*
   71         * The top lkey_table_size bits are used to index the
  72         * table.  The lower 8 bits can be owned by the user (copied from
  73         * the LKEY).  The remaining bits act as a generation number or tag.
  74         */
  75        if (!lkey_table_size)
  76                return -EINVAL;
  77
  78        spin_lock_init(&rdi->lkey_table.lock);
  79
  80        /* ensure generation is at least 4 bits */
  81        if (lkey_table_size > RVT_MAX_LKEY_TABLE_BITS) {
  82                rvt_pr_warn(rdi, "lkey bits %u too large, reduced to %u\n",
  83                            lkey_table_size, RVT_MAX_LKEY_TABLE_BITS);
  84                rdi->dparms.lkey_table_size = RVT_MAX_LKEY_TABLE_BITS;
  85                lkey_table_size = rdi->dparms.lkey_table_size;
  86        }
  87        rdi->lkey_table.max = 1 << lkey_table_size;
  88        rdi->lkey_table.shift = 32 - lkey_table_size;
  89        lk_tab_size = rdi->lkey_table.max * sizeof(*rdi->lkey_table.table);
  90        rdi->lkey_table.table = (struct rvt_mregion __rcu **)
  91                               vmalloc_node(lk_tab_size, rdi->dparms.node);
  92        if (!rdi->lkey_table.table)
  93                return -ENOMEM;
  94
  95        RCU_INIT_POINTER(rdi->dma_mr, NULL);
  96        for (i = 0; i < rdi->lkey_table.max; i++)
  97                RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL);
  98
  99        rdi->dparms.props.max_mr = rdi->lkey_table.max;
 100        rdi->dparms.props.max_fmr = rdi->lkey_table.max;
 101        return 0;
 102}
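
/*
 * A minimal sketch (helper name and sample value are hypothetical, and it
 * is not used by the driver) of the sizing/indexing arithmetic set up
 * above: a table of lkey_table_size bits holds 1 << lkey_table_size
 * entries, and a key selects its slot with the complementary shift.
 */
static inline u32 example_lkey_to_table_index(u32 lkey, u32 lkey_table_size)
{
	/* e.g. with a 16-bit table, the top 16 bits of the key are the index */
	return lkey >> (32 - lkey_table_size);
}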
 103
 104/**
  105 * rvt_mr_exit - clean up MR
  106 * @rdi: rvt dev structure
  107 *
  108 * Called when drivers have unregistered or perhaps failed to register with us.
 109 */
 110void rvt_mr_exit(struct rvt_dev_info *rdi)
 111{
 112        if (rdi->dma_mr)
 113                rvt_pr_err(rdi, "DMA MR not null!\n");
 114
 115        vfree(rdi->lkey_table.table);
 116}
 117
 118static void rvt_deinit_mregion(struct rvt_mregion *mr)
 119{
 120        int i = mr->mapsz;
 121
 122        mr->mapsz = 0;
 123        while (i)
 124                kfree(mr->map[--i]);
 125        percpu_ref_exit(&mr->refcount);
 126}
 127
 128static void __rvt_mregion_complete(struct percpu_ref *ref)
 129{
 130        struct rvt_mregion *mr = container_of(ref, struct rvt_mregion,
 131                                              refcount);
 132
 133        complete(&mr->comp);
 134}
 135
 136static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
 137                            int count, unsigned int percpu_flags)
 138{
 139        int m, i = 0;
 140        struct rvt_dev_info *dev = ib_to_rvt(pd->device);
 141
 142        mr->mapsz = 0;
 143        m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
 144        for (; i < m; i++) {
 145                mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
 146                                          dev->dparms.node);
 147                if (!mr->map[i])
 148                        goto bail;
 149                mr->mapsz++;
 150        }
 151        init_completion(&mr->comp);
  152        /* the initial reference counts the pointer returned to the caller */
 153        if (percpu_ref_init(&mr->refcount, &__rvt_mregion_complete,
 154                            percpu_flags, GFP_KERNEL))
 155                goto bail;
 156
 157        atomic_set(&mr->lkey_invalid, 0);
 158        mr->pd = pd;
 159        mr->max_segs = count;
 160        return 0;
 161bail:
 162        rvt_deinit_mregion(mr);
 163        return -ENOMEM;
 164}
 165
 166/**
 167 * rvt_alloc_lkey - allocate an lkey
 168 * @mr: memory region that this lkey protects
 169 * @dma_region: 0->normal key, 1->restricted DMA key
 170 *
 171 * Returns 0 if successful, otherwise returns -errno.
 172 *
 173 * Increments mr reference count as required.
 174 *
  175 * Sets the lkey field of @mr for non-DMA regions.
 176 *
 177 */
 178static int rvt_alloc_lkey(struct rvt_mregion *mr, int dma_region)
 179{
 180        unsigned long flags;
 181        u32 r;
 182        u32 n;
 183        int ret = 0;
 184        struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
 185        struct rvt_lkey_table *rkt = &dev->lkey_table;
 186
 187        rvt_get_mr(mr);
 188        spin_lock_irqsave(&rkt->lock, flags);
 189
 190        /* special case for dma_mr lkey == 0 */
 191        if (dma_region) {
 192                struct rvt_mregion *tmr;
 193
 194                tmr = rcu_access_pointer(dev->dma_mr);
 195                if (!tmr) {
 196                        mr->lkey_published = 1;
  197                        /* Ensure published is written first */
 198                        rcu_assign_pointer(dev->dma_mr, mr);
 199                        rvt_get_mr(mr);
 200                }
 201                goto success;
 202        }
 203
 204        /* Find the next available LKEY */
 205        r = rkt->next;
 206        n = r;
 207        for (;;) {
 208                if (!rcu_access_pointer(rkt->table[r]))
 209                        break;
 210                r = (r + 1) & (rkt->max - 1);
 211                if (r == n)
 212                        goto bail;
 213        }
 214        rkt->next = (r + 1) & (rkt->max - 1);
 215        /*
  216         * Make sure lkey is never zero, which is reserved to indicate an
 217         * unrestricted LKEY.
 218         */
 219        rkt->gen++;
 220        /*
 221         * bits are capped to ensure enough bits for generation number
 222         */
 223        mr->lkey = (r << (32 - dev->dparms.lkey_table_size)) |
 224                ((((1 << (24 - dev->dparms.lkey_table_size)) - 1) & rkt->gen)
 225                 << 8);
 226        if (mr->lkey == 0) {
 227                mr->lkey |= 1 << 8;
 228                rkt->gen++;
 229        }
 230        mr->lkey_published = 1;
  231        /* Ensure published is written first */
 232        rcu_assign_pointer(rkt->table[r], mr);
 233success:
 234        spin_unlock_irqrestore(&rkt->lock, flags);
 235out:
 236        return ret;
 237bail:
 238        rvt_put_mr(mr);
 239        spin_unlock_irqrestore(&rkt->lock, flags);
 240        ret = -ENOMEM;
 241        goto out;
 242}
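
/*
 * A sketch of the lkey layout produced above, shown for an lkey_table_size
 * of 16 bits (the helper and the diagram are illustrative only):
 *
 *   31            16 15            8 7             0
 *  +----------------+---------------+---------------+
 *  | table index r  |  generation   |  user-owned   |
 *  +----------------+---------------+---------------+
 *
 * The generation field is masked to 24 - lkey_table_size bits so that the
 * index, the generation and the low user byte never overlap.
 */
static inline u32 example_build_lkey(u32 r, u32 gen, u32 lkey_table_size)
{
	u32 gen_mask = (1 << (24 - lkey_table_size)) - 1;

	return (r << (32 - lkey_table_size)) | ((gen & gen_mask) << 8);
}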
 243
 244/**
 245 * rvt_free_lkey - free an lkey
 246 * @mr: mr to free from tables
 247 */
 248static void rvt_free_lkey(struct rvt_mregion *mr)
 249{
 250        unsigned long flags;
 251        u32 lkey = mr->lkey;
 252        u32 r;
 253        struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
 254        struct rvt_lkey_table *rkt = &dev->lkey_table;
 255        int freed = 0;
 256
 257        spin_lock_irqsave(&rkt->lock, flags);
 258        if (!lkey) {
 259                if (mr->lkey_published) {
 260                        mr->lkey_published = 0;
  261                        /* ensure published is written before the pointer */
 262                        rcu_assign_pointer(dev->dma_mr, NULL);
 263                        rvt_put_mr(mr);
 264                }
 265        } else {
 266                if (!mr->lkey_published)
 267                        goto out;
 268                r = lkey >> (32 - dev->dparms.lkey_table_size);
 269                mr->lkey_published = 0;
  270                /* ensure published is written before the pointer */
 271                rcu_assign_pointer(rkt->table[r], NULL);
 272        }
 273        freed++;
 274out:
 275        spin_unlock_irqrestore(&rkt->lock, flags);
 276        if (freed)
 277                percpu_ref_kill(&mr->refcount);
 278}
 279
 280static struct rvt_mr *__rvt_alloc_mr(int count, struct ib_pd *pd)
 281{
 282        struct rvt_mr *mr;
 283        int rval = -ENOMEM;
 284        int m;
 285
 286        /* Allocate struct plus pointers to first level page tables. */
 287        m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
 288        mr = kzalloc(struct_size(mr, mr.map, m), GFP_KERNEL);
 289        if (!mr)
 290                goto bail;
 291
 292        rval = rvt_init_mregion(&mr->mr, pd, count, 0);
 293        if (rval)
 294                goto bail;
 295        /*
  296         * The IB core will initialize mr->ibmr except for
  297         * lkey and rkey.
 298         */
 299        rval = rvt_alloc_lkey(&mr->mr, 0);
 300        if (rval)
 301                goto bail_mregion;
 302        mr->ibmr.lkey = mr->mr.lkey;
 303        mr->ibmr.rkey = mr->mr.lkey;
 304done:
 305        return mr;
 306
 307bail_mregion:
 308        rvt_deinit_mregion(&mr->mr);
 309bail:
 310        kfree(mr);
 311        mr = ERR_PTR(rval);
 312        goto done;
 313}
 314
 315static void __rvt_free_mr(struct rvt_mr *mr)
 316{
 317        rvt_free_lkey(&mr->mr);
 318        rvt_deinit_mregion(&mr->mr);
 319        kfree(mr);
 320}
 321
 322/**
 323 * rvt_get_dma_mr - get a DMA memory region
 324 * @pd: protection domain for this memory region
 325 * @acc: access flags
 326 *
 327 * Return: the memory region on success, otherwise returns an errno.
 328 * Note that all DMA addresses should be created via the functions in
 329 * struct dma_virt_ops.
 330 */
 331struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc)
 332{
 333        struct rvt_mr *mr;
 334        struct ib_mr *ret;
 335        int rval;
 336
 337        if (ibpd_to_rvtpd(pd)->user)
 338                return ERR_PTR(-EPERM);
 339
 340        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 341        if (!mr) {
 342                ret = ERR_PTR(-ENOMEM);
 343                goto bail;
 344        }
 345
 346        rval = rvt_init_mregion(&mr->mr, pd, 0, 0);
 347        if (rval) {
 348                ret = ERR_PTR(rval);
 349                goto bail;
 350        }
 351
 352        rval = rvt_alloc_lkey(&mr->mr, 1);
 353        if (rval) {
 354                ret = ERR_PTR(rval);
 355                goto bail_mregion;
 356        }
 357
 358        mr->mr.access_flags = acc;
 359        ret = &mr->ibmr;
 360done:
 361        return ret;
 362
 363bail_mregion:
 364        rvt_deinit_mregion(&mr->mr);
 365bail:
 366        kfree(mr);
 367        goto done;
 368}
 369
 370/**
 371 * rvt_reg_user_mr - register a userspace memory region
 372 * @pd: protection domain for this memory region
 373 * @start: starting userspace address
 374 * @length: length of region to register
 * @virt_addr: starting virtual address (iova) for the mapping
  375 * @mr_access_flags: access flags for this memory region
 376 * @udata: unused by the driver
 377 *
 378 * Return: the memory region on success, otherwise returns an errno.
 379 */
 380struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 381                              u64 virt_addr, int mr_access_flags,
 382                              struct ib_udata *udata)
 383{
 384        struct rvt_mr *mr;
 385        struct ib_umem *umem;
 386        struct sg_page_iter sg_iter;
 387        int n, m;
 388        struct ib_mr *ret;
 389
 390        if (length == 0)
 391                return ERR_PTR(-EINVAL);
 392
 393        umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
 394        if (IS_ERR(umem))
 395                return (void *)umem;
 396
 397        n = ib_umem_num_pages(umem);
 398
 399        mr = __rvt_alloc_mr(n, pd);
 400        if (IS_ERR(mr)) {
 401                ret = (struct ib_mr *)mr;
 402                goto bail_umem;
 403        }
 404
 405        mr->mr.user_base = start;
 406        mr->mr.iova = virt_addr;
 407        mr->mr.length = length;
 408        mr->mr.offset = ib_umem_offset(umem);
 409        mr->mr.access_flags = mr_access_flags;
 410        mr->umem = umem;
 411
 412        mr->mr.page_shift = PAGE_SHIFT;
 413        m = 0;
 414        n = 0;
  415        for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
 416                void *vaddr;
 417
 418                vaddr = page_address(sg_page_iter_page(&sg_iter));
 419                if (!vaddr) {
 420                        ret = ERR_PTR(-EINVAL);
 421                        goto bail_inval;
 422                }
 423                mr->mr.map[m]->segs[n].vaddr = vaddr;
 424                mr->mr.map[m]->segs[n].length = PAGE_SIZE;
 425                trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr, PAGE_SIZE);
 426                if (++n == RVT_SEGSZ) {
 427                        m++;
 428                        n = 0;
 429                }
 430        }
 431        return &mr->ibmr;
 432
 433bail_inval:
 434        __rvt_free_mr(mr);
 435
 436bail_umem:
 437        ib_umem_release(umem);
 438
 439        return ret;
 440}
 441
 442/**
 443 * rvt_dereg_clean_qp_cb - callback from iterator
  444 * @qp: the qp
  445 * @v: the mregion (as u64)
  446 *
  447 * This routine fields the callback for all QPs; for QPs in
  448 * the same PD as the MR it calls rvt_qp_mr_clean() to
  449 * potentially clean up references.
 450 */
 451static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v)
 452{
 453        struct rvt_mregion *mr = (struct rvt_mregion *)v;
 454
 455        /* skip PDs that are not ours */
 456        if (mr->pd != qp->ibqp.pd)
 457                return;
 458        rvt_qp_mr_clean(qp, mr->lkey);
 459}
 460
 461/**
 462 * rvt_dereg_clean_qps - find QPs for reference cleanup
  463 * @mr: the MR that is being deregistered
 464 *
 465 * This routine iterates RC QPs looking for references
 466 * to the lkey noted in mr.
 467 */
 468static void rvt_dereg_clean_qps(struct rvt_mregion *mr)
 469{
 470        struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
 471
 472        rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb);
 473}
 474
 475/**
 476 * rvt_check_refs - check references
  477 * @mr: the mregion
  478 * @t: the caller identification
  479 *
  480 * This routine checks whether an MR still holds references
  481 * while it is being de-registered.
  482 *
  483 * If the count is non-zero, the code calls a clean routine then
  484 * waits up to a timeout for the count to reach zero.
 485 */
 486static int rvt_check_refs(struct rvt_mregion *mr, const char *t)
 487{
 488        unsigned long timeout;
 489        struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
 490
 491        if (mr->lkey) {
 492                /* avoid dma mr */
 493                rvt_dereg_clean_qps(mr);
 494                /* @mr was indexed on rcu protected @lkey_table */
 495                synchronize_rcu();
 496        }
 497
 498        timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ);
 499        if (!timeout) {
 500                rvt_pr_err(rdi,
 501                           "%s timeout mr %p pd %p lkey %x refcount %ld\n",
 502                           t, mr, mr->pd, mr->lkey,
 503                           atomic_long_read(&mr->refcount.count));
 504                rvt_get_mr(mr);
 505                return -EBUSY;
 506        }
 507        return 0;
 508}
 509
 510/**
  511 * rvt_mr_has_lkey - does the MR match this lkey
  512 * @mr: the mregion
  513 * @lkey: the lkey to check
 514 */
 515bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey)
 516{
 517        return mr && lkey == mr->lkey;
 518}
 519
 520/**
  521 * rvt_ss_has_lkey - is an MR with this lkey referenced by the SGE state
  522 * @ss: the sge state
  523 * @lkey: the lkey to look for
  524 *
  525 * This code tests whether any SGE in the indicated
  526 * sge state references an MR with the given lkey.
 527 */
 528bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey)
 529{
 530        int i;
 531        bool rval = false;
 532
 533        if (!ss->num_sge)
 534                return rval;
 535        /* first one */
 536        rval = rvt_mr_has_lkey(ss->sge.mr, lkey);
 537        /* any others */
 538        for (i = 0; !rval && i < ss->num_sge - 1; i++)
 539                rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey);
 540        return rval;
 541}
 542
 543/**
 544 * rvt_dereg_mr - unregister and free a memory region
 545 * @ibmr: the memory region to free
 546 *
 547 *
 548 * Note that this is called to free MRs created by rvt_get_dma_mr()
 549 * or rvt_reg_user_mr().
 550 *
 551 * Returns 0 on success.
 552 */
 553int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 554{
 555        struct rvt_mr *mr = to_imr(ibmr);
 556        int ret;
 557
 558        rvt_free_lkey(&mr->mr);
 559
 560        rvt_put_mr(&mr->mr); /* will set completion if last */
 561        ret = rvt_check_refs(&mr->mr, __func__);
 562        if (ret)
 563                goto out;
 564        rvt_deinit_mregion(&mr->mr);
 565        ib_umem_release(mr->umem);
 566        kfree(mr);
 567out:
 568        return ret;
 569}
 570
 571/**
  572 * rvt_alloc_mr - Allocate a memory region usable with the REG_MR work request
  573 * @pd: protection domain for this memory region
  574 * @mr_type: mem region type
  575 * @max_num_sg: Max number of segments allowed
 * @udata: unused by the driver
 576 *
 577 * Return: the memory region on success, otherwise return an errno.
 578 */
 579struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 580                           u32 max_num_sg, struct ib_udata *udata)
 581{
 582        struct rvt_mr *mr;
 583
 584        if (mr_type != IB_MR_TYPE_MEM_REG)
 585                return ERR_PTR(-EINVAL);
 586
 587        mr = __rvt_alloc_mr(max_num_sg, pd);
 588        if (IS_ERR(mr))
 589                return (struct ib_mr *)mr;
 590
 591        return &mr->ibmr;
 592}
 593
 594/**
 595 * rvt_set_page - page assignment function called by ib_sg_to_pages
 596 * @ibmr: memory region
 597 * @addr: dma address of mapped page
 598 *
 599 * Return: 0 on success
 600 */
 601static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
 602{
 603        struct rvt_mr *mr = to_imr(ibmr);
 604        u32 ps = 1 << mr->mr.page_shift;
 605        u32 mapped_segs = mr->mr.length >> mr->mr.page_shift;
 606        int m, n;
 607
 608        if (unlikely(mapped_segs == mr->mr.max_segs))
 609                return -ENOMEM;
 610
 611        m = mapped_segs / RVT_SEGSZ;
 612        n = mapped_segs % RVT_SEGSZ;
 613        mr->mr.map[m]->segs[n].vaddr = (void *)addr;
 614        mr->mr.map[m]->segs[n].length = ps;
 615        mr->mr.length += ps;
 616        trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
 617
 618        return 0;
 619}
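
/*
 * A small sketch (hypothetical helper, not called anywhere) of the
 * two-level segment addressing rvt_set_page() relies on: mapped segment i
 * lives in first-level chunk i / RVT_SEGSZ at slot i % RVT_SEGSZ, and each
 * call above publishes one more page-sized segment.
 */
static inline void example_seg_index_to_map_slot(u32 i, int *m, int *n)
{
	*m = i / RVT_SEGSZ;	/* index into mr->map[] */
	*n = i % RVT_SEGSZ;	/* index into map[m]->segs[] */
}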
 620
 621/**
  622 * rvt_map_mr_sg - map an sg list to the memory region
 623 * @ibmr: memory region
 624 * @sg: dma mapped scatterlist
 625 * @sg_nents: number of entries in sg
 626 * @sg_offset: offset in bytes into sg
 627 *
 628 * Overwrite rvt_mr length with mr length calculated by ib_sg_to_pages.
 629 *
 630 * Return: number of sg elements mapped to the memory region
 631 */
 632int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 633                  int sg_nents, unsigned int *sg_offset)
 634{
 635        struct rvt_mr *mr = to_imr(ibmr);
 636        int ret;
 637
 638        mr->mr.length = 0;
 639        mr->mr.page_shift = PAGE_SHIFT;
 640        ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rvt_set_page);
 641        mr->mr.user_base = ibmr->iova;
 642        mr->mr.iova = ibmr->iova;
 643        mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr;
 644        mr->mr.length = (size_t)ibmr->length;
 645        trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset);
 646        return ret;
 647}
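
/*
 * A hedged caller-side sketch (hypothetical function, not part of rdmavt):
 * a kernel ULP hands a DMA-mapped scatterlist to the core helper
 * ib_map_mr_sg(), which dispatches to rvt_map_mr_sg() above and invokes
 * rvt_set_page() once per page-sized chunk of the list.
 */
static int example_map_mr(struct ib_mr *ibmr, struct scatterlist *sg,
			  int sg_nents)
{
	/* a return value smaller than sg_nents means only part of the list fit */
	return ib_map_mr_sg(ibmr, sg, sg_nents, NULL, PAGE_SIZE);
}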
 648
 649/**
 650 * rvt_fast_reg_mr - fast register physical MR
 651 * @qp: the queue pair where the work request comes from
 652 * @ibmr: the memory region to be registered
 653 * @key: updated key for this memory region
 654 * @access: access flags for this memory region
 655 *
 656 * Returns 0 on success.
 657 */
 658int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
 659                    int access)
 660{
 661        struct rvt_mr *mr = to_imr(ibmr);
 662
 663        if (qp->ibqp.pd != mr->mr.pd)
 664                return -EACCES;
 665
 666        /* not applicable to dma MR or user MR */
 667        if (!mr->mr.lkey || mr->umem)
 668                return -EINVAL;
 669
 670        if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00))
 671                return -EINVAL;
 672
 673        ibmr->lkey = key;
 674        ibmr->rkey = key;
 675        mr->mr.lkey = key;
 676        mr->mr.access_flags = access;
 677        mr->mr.iova = ibmr->iova;
 678        atomic_set(&mr->mr.lkey_invalid, 0);
 679
 680        return 0;
 681}
 682EXPORT_SYMBOL(rvt_fast_reg_mr);
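
/*
 * A sketch of how a consumer typically derives the new key it passes to
 * rvt_fast_reg_mr(): keep the upper 24 bits of the existing lkey and swap
 * in a fresh low byte, which is exactly the relationship the check above
 * enforces.  Helper name and tag are hypothetical.
 */
static inline u32 example_next_fast_reg_key(u32 cur_lkey, u8 tag)
{
	return (cur_lkey & 0xFFFFFF00) | tag;
}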
 683
 684/**
 685 * rvt_invalidate_rkey - invalidate an MR rkey
 686 * @qp: queue pair associated with the invalidate op
 687 * @rkey: rkey to invalidate
 688 *
 689 * Returns 0 on success.
 690 */
 691int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey)
 692{
 693        struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
 694        struct rvt_lkey_table *rkt = &dev->lkey_table;
 695        struct rvt_mregion *mr;
 696
 697        if (rkey == 0)
 698                return -EINVAL;
 699
 700        rcu_read_lock();
 701        mr = rcu_dereference(
 702                rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
 703        if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
 704                goto bail;
 705
 706        atomic_set(&mr->lkey_invalid, 1);
 707        rcu_read_unlock();
 708        return 0;
 709
 710bail:
 711        rcu_read_unlock();
 712        return -EINVAL;
 713}
 714EXPORT_SYMBOL(rvt_invalidate_rkey);
 715
 716/**
 717 * rvt_alloc_fmr - allocate a fast memory region
 718 * @pd: the protection domain for this memory region
 719 * @mr_access_flags: access flags for this memory region
 720 * @fmr_attr: fast memory region attributes
 721 *
 722 * Return: the memory region on success, otherwise returns an errno.
 723 */
 724struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
 725                             struct ib_fmr_attr *fmr_attr)
 726{
 727        struct rvt_fmr *fmr;
 728        int m;
 729        struct ib_fmr *ret;
 730        int rval = -ENOMEM;
 731
 732        /* Allocate struct plus pointers to first level page tables. */
 733        m = (fmr_attr->max_pages + RVT_SEGSZ - 1) / RVT_SEGSZ;
 734        fmr = kzalloc(struct_size(fmr, mr.map, m), GFP_KERNEL);
 735        if (!fmr)
 736                goto bail;
 737
 738        rval = rvt_init_mregion(&fmr->mr, pd, fmr_attr->max_pages,
 739                                PERCPU_REF_INIT_ATOMIC);
 740        if (rval)
 741                goto bail;
 742
 743        /*
 744         * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
 745         * rkey.
 746         */
 747        rval = rvt_alloc_lkey(&fmr->mr, 0);
 748        if (rval)
 749                goto bail_mregion;
 750        fmr->ibfmr.rkey = fmr->mr.lkey;
 751        fmr->ibfmr.lkey = fmr->mr.lkey;
 752        /*
 753         * Resources are allocated but no valid mapping (RKEY can't be
 754         * used).
 755         */
 756        fmr->mr.access_flags = mr_access_flags;
 757        fmr->mr.max_segs = fmr_attr->max_pages;
 758        fmr->mr.page_shift = fmr_attr->page_shift;
 759
 760        ret = &fmr->ibfmr;
 761done:
 762        return ret;
 763
 764bail_mregion:
 765        rvt_deinit_mregion(&fmr->mr);
 766bail:
 767        kfree(fmr);
 768        ret = ERR_PTR(rval);
 769        goto done;
 770}
 771
 772/**
 773 * rvt_map_phys_fmr - set up a fast memory region
 774 * @ibfmr: the fast memory region to set up
 775 * @page_list: the list of pages to associate with the fast memory region
 776 * @list_len: the number of pages to associate with the fast memory region
 777 * @iova: the virtual address of the start of the fast memory region
 778 *
 779 * This may be called from interrupt context.
 780 *
 781 * Return: 0 on success
 782 */
 783
 784int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
 785                     int list_len, u64 iova)
 786{
 787        struct rvt_fmr *fmr = to_ifmr(ibfmr);
 788        struct rvt_lkey_table *rkt;
 789        unsigned long flags;
 790        int m, n;
 791        unsigned long i;
 792        u32 ps;
 793        struct rvt_dev_info *rdi = ib_to_rvt(ibfmr->device);
 794
 795        i = atomic_long_read(&fmr->mr.refcount.count);
 796        if (i > 2)
 797                return -EBUSY;
 798
 799        if (list_len > fmr->mr.max_segs)
 800                return -EINVAL;
 801
 802        rkt = &rdi->lkey_table;
 803        spin_lock_irqsave(&rkt->lock, flags);
 804        fmr->mr.user_base = iova;
 805        fmr->mr.iova = iova;
 806        ps = 1 << fmr->mr.page_shift;
 807        fmr->mr.length = list_len * ps;
 808        m = 0;
 809        n = 0;
 810        for (i = 0; i < list_len; i++) {
 811                fmr->mr.map[m]->segs[n].vaddr = (void *)page_list[i];
 812                fmr->mr.map[m]->segs[n].length = ps;
 813                trace_rvt_mr_fmr_seg(&fmr->mr, m, n, (void *)page_list[i], ps);
 814                if (++n == RVT_SEGSZ) {
 815                        m++;
 816                        n = 0;
 817                }
 818        }
 819        spin_unlock_irqrestore(&rkt->lock, flags);
 820        return 0;
 821}
 822
 823/**
 824 * rvt_unmap_fmr - unmap fast memory regions
 825 * @fmr_list: the list of fast memory regions to unmap
 826 *
 827 * Return: 0 on success.
 828 */
 829int rvt_unmap_fmr(struct list_head *fmr_list)
 830{
 831        struct rvt_fmr *fmr;
 832        struct rvt_lkey_table *rkt;
 833        unsigned long flags;
 834        struct rvt_dev_info *rdi;
 835
 836        list_for_each_entry(fmr, fmr_list, ibfmr.list) {
 837                rdi = ib_to_rvt(fmr->ibfmr.device);
 838                rkt = &rdi->lkey_table;
 839                spin_lock_irqsave(&rkt->lock, flags);
 840                fmr->mr.user_base = 0;
 841                fmr->mr.iova = 0;
 842                fmr->mr.length = 0;
 843                spin_unlock_irqrestore(&rkt->lock, flags);
 844        }
 845        return 0;
 846}
 847
 848/**
 849 * rvt_dealloc_fmr - deallocate a fast memory region
 850 * @ibfmr: the fast memory region to deallocate
 851 *
 852 * Return: 0 on success.
 853 */
 854int rvt_dealloc_fmr(struct ib_fmr *ibfmr)
 855{
 856        struct rvt_fmr *fmr = to_ifmr(ibfmr);
 857        int ret = 0;
 858
 859        rvt_free_lkey(&fmr->mr);
 860        rvt_put_mr(&fmr->mr); /* will set completion if last */
 861        ret = rvt_check_refs(&fmr->mr, __func__);
 862        if (ret)
 863                goto out;
 864        rvt_deinit_mregion(&fmr->mr);
 865        kfree(fmr);
 866out:
 867        return ret;
 868}
 869
 870/**
  871 * rvt_sge_adjacent - can the new SGE be folded into the last one
  872 * @last_sge: last outgoing SGE written
  873 * @sge: SGE to check
  874 *
  875 * If adjacent, last_sge is updated to add the new length.
  876 *
  877 * Return: true if @sge is adjacent to @last_sge
 878 */
 879static inline bool rvt_sge_adjacent(struct rvt_sge *last_sge,
 880                                    struct ib_sge *sge)
 881{
 882        if (last_sge && sge->lkey == last_sge->mr->lkey &&
 883            ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) {
 884                if (sge->lkey) {
 885                        if (unlikely((sge->addr - last_sge->mr->user_base +
 886                              sge->length > last_sge->mr->length)))
 887                                return false; /* overrun, caller will catch */
 888                } else {
 889                        last_sge->length += sge->length;
 890                }
 891                last_sge->sge_length += sge->length;
 892                trace_rvt_sge_adjacent(last_sge, sge);
 893                return true;
 894        }
 895        return false;
 896}
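
/*
 * A minimal restatement (hypothetical helper) of the two conditions
 * rvt_sge_adjacent() requires before folding a new ib_sge into the previous
 * internal SGE: the lkeys match and the new address begins exactly where
 * the previous entry ends.
 */
static inline bool example_sges_coalesce(u32 prev_lkey, u64 prev_end,
					 u32 next_lkey, u64 next_addr)
{
	return prev_lkey == next_lkey && prev_end == next_addr;
}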
 897
 898/**
 899 * rvt_lkey_ok - check IB SGE for validity and initialize
 900 * @rkt: table containing lkey to check SGE against
 901 * @pd: protection domain
 902 * @isge: outgoing internal SGE
 903 * @last_sge: last outgoing SGE written
 904 * @sge: SGE to check
 905 * @acc: access flags
 906 *
 907 * Check the IB SGE for validity and initialize our internal version
 908 * of it.
 909 *
 910 * Increments the reference count when a new sge is stored.
 911 *
  912 * Return: 0 if compressed, 1 if added, otherwise returns -errno.
 913 */
 914int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
 915                struct rvt_sge *isge, struct rvt_sge *last_sge,
 916                struct ib_sge *sge, int acc)
 917{
 918        struct rvt_mregion *mr;
 919        unsigned n, m;
 920        size_t off;
 921
 922        /*
 923         * We use LKEY == zero for kernel virtual addresses
 924         * (see rvt_get_dma_mr() and dma_virt_ops).
 925         */
 926        if (sge->lkey == 0) {
 927                struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device);
 928
 929                if (pd->user)
 930                        return -EINVAL;
 931                if (rvt_sge_adjacent(last_sge, sge))
 932                        return 0;
 933                rcu_read_lock();
 934                mr = rcu_dereference(dev->dma_mr);
 935                if (!mr)
 936                        goto bail;
 937                rvt_get_mr(mr);
 938                rcu_read_unlock();
 939
 940                isge->mr = mr;
 941                isge->vaddr = (void *)sge->addr;
 942                isge->length = sge->length;
 943                isge->sge_length = sge->length;
 944                isge->m = 0;
 945                isge->n = 0;
 946                goto ok;
 947        }
 948        if (rvt_sge_adjacent(last_sge, sge))
 949                return 0;
 950        rcu_read_lock();
 951        mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]);
 952        if (!mr)
 953                goto bail;
 954        rvt_get_mr(mr);
 955        if (!READ_ONCE(mr->lkey_published))
 956                goto bail_unref;
 957
 958        if (unlikely(atomic_read(&mr->lkey_invalid) ||
 959                     mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
 960                goto bail_unref;
 961
 962        off = sge->addr - mr->user_base;
 963        if (unlikely(sge->addr < mr->user_base ||
 964                     off + sge->length > mr->length ||
 965                     (mr->access_flags & acc) != acc))
 966                goto bail_unref;
 967        rcu_read_unlock();
 968
 969        off += mr->offset;
 970        if (mr->page_shift) {
 971                /*
 972                 * page sizes are uniform power of 2 so no loop is necessary
 973                 * entries_spanned_by_off is the number of times the loop below
 974                 * would have executed.
 975                */
 976                size_t entries_spanned_by_off;
 977
 978                entries_spanned_by_off = off >> mr->page_shift;
 979                off -= (entries_spanned_by_off << mr->page_shift);
 980                m = entries_spanned_by_off / RVT_SEGSZ;
 981                n = entries_spanned_by_off % RVT_SEGSZ;
 982        } else {
 983                m = 0;
 984                n = 0;
 985                while (off >= mr->map[m]->segs[n].length) {
 986                        off -= mr->map[m]->segs[n].length;
 987                        n++;
 988                        if (n >= RVT_SEGSZ) {
 989                                m++;
 990                                n = 0;
 991                        }
 992                }
 993        }
 994        isge->mr = mr;
 995        isge->vaddr = mr->map[m]->segs[n].vaddr + off;
 996        isge->length = mr->map[m]->segs[n].length - off;
 997        isge->sge_length = sge->length;
 998        isge->m = m;
 999        isge->n = n;
1000ok:
1001        trace_rvt_sge_new(isge, sge);
1002        return 1;
1003bail_unref:
1004        rvt_put_mr(mr);
1005bail:
1006        rcu_read_unlock();
1007        return -EINVAL;
1008}
1009EXPORT_SYMBOL(rvt_lkey_ok);
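
/*
 * A worked sketch (hypothetical helper and numbers) of the uniform-page
 * fast path used by rvt_lkey_ok() above and rvt_rkey_ok() below: with
 * page_shift == 12 (4 KiB pages), an offset of 0x5234 into the region
 * spans 5 whole pages, so the segment is map[5 / RVT_SEGSZ]->segs[5 %
 * RVT_SEGSZ] and 0x234 bytes remain within it.
 */
static inline void example_offset_to_segment(size_t off, u32 page_shift,
					     unsigned int *m, unsigned int *n,
					     size_t *seg_off)
{
	size_t entries = off >> page_shift;	/* whole pages spanned by off */

	*m = entries / RVT_SEGSZ;		/* first-level map chunk */
	*n = entries % RVT_SEGSZ;		/* slot within that chunk */
	*seg_off = off - (entries << page_shift); /* leftover within the page */
}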
1010
1011/**
1012 * rvt_rkey_ok - check the IB virtual address, length, and RKEY
1013 * @qp: qp for validation
1014 * @sge: SGE state
1015 * @len: length of data
1016 * @vaddr: virtual address to place data
1017 * @rkey: rkey to check
1018 * @acc: access flags
1019 *
1020 * Return: 1 if successful, otherwise 0.
1021 *
 1022 * Increments the reference count upon success.
1023 */
1024int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
1025                u32 len, u64 vaddr, u32 rkey, int acc)
1026{
1027        struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
1028        struct rvt_lkey_table *rkt = &dev->lkey_table;
1029        struct rvt_mregion *mr;
1030        unsigned n, m;
1031        size_t off;
1032
1033        /*
1034         * We use RKEY == zero for kernel virtual addresses
1035         * (see rvt_get_dma_mr() and dma_virt_ops).
1036         */
1037        rcu_read_lock();
1038        if (rkey == 0) {
1039                struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd);
1040                struct rvt_dev_info *rdi = ib_to_rvt(pd->ibpd.device);
1041
1042                if (pd->user)
1043                        goto bail;
1044                mr = rcu_dereference(rdi->dma_mr);
1045                if (!mr)
1046                        goto bail;
1047                rvt_get_mr(mr);
1048                rcu_read_unlock();
1049
1050                sge->mr = mr;
1051                sge->vaddr = (void *)vaddr;
1052                sge->length = len;
1053                sge->sge_length = len;
1054                sge->m = 0;
1055                sge->n = 0;
1056                goto ok;
1057        }
1058
1059        mr = rcu_dereference(rkt->table[rkey >> rkt->shift]);
1060        if (!mr)
1061                goto bail;
1062        rvt_get_mr(mr);
 1063        /* ensure mr read is before test */
1064        if (!READ_ONCE(mr->lkey_published))
1065                goto bail_unref;
1066        if (unlikely(atomic_read(&mr->lkey_invalid) ||
1067                     mr->lkey != rkey || qp->ibqp.pd != mr->pd))
1068                goto bail_unref;
1069
1070        off = vaddr - mr->iova;
1071        if (unlikely(vaddr < mr->iova || off + len > mr->length ||
1072                     (mr->access_flags & acc) == 0))
1073                goto bail_unref;
1074        rcu_read_unlock();
1075
1076        off += mr->offset;
1077        if (mr->page_shift) {
1078                /*
1079                 * page sizes are uniform power of 2 so no loop is necessary
1080                 * entries_spanned_by_off is the number of times the loop below
1081                 * would have executed.
1082                */
1083                size_t entries_spanned_by_off;
1084
1085                entries_spanned_by_off = off >> mr->page_shift;
1086                off -= (entries_spanned_by_off << mr->page_shift);
1087                m = entries_spanned_by_off / RVT_SEGSZ;
1088                n = entries_spanned_by_off % RVT_SEGSZ;
1089        } else {
1090                m = 0;
1091                n = 0;
1092                while (off >= mr->map[m]->segs[n].length) {
1093                        off -= mr->map[m]->segs[n].length;
1094                        n++;
1095                        if (n >= RVT_SEGSZ) {
1096                                m++;
1097                                n = 0;
1098                        }
1099                }
1100        }
1101        sge->mr = mr;
1102        sge->vaddr = mr->map[m]->segs[n].vaddr + off;
1103        sge->length = mr->map[m]->segs[n].length - off;
1104        sge->sge_length = len;
1105        sge->m = m;
1106        sge->n = n;
1107ok:
1108        return 1;
1109bail_unref:
1110        rvt_put_mr(mr);
1111bail:
1112        rcu_read_unlock();
1113        return 0;
1114}
1115EXPORT_SYMBOL(rvt_rkey_ok);
1116