linux/drivers/infiniband/sw/rdmavt/mr.c
   1/*
   2 * Copyright(c) 2016 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47
  48#include <linux/slab.h>
  49#include <linux/vmalloc.h>
  50#include <rdma/ib_umem.h>
  51#include <rdma/rdma_vt.h>
  52#include "vt.h"
  53#include "mr.h"
  54#include "trace.h"
  55
  56/**
  57 * rvt_driver_mr_init - Init MR resources per driver
  58 * @rdi: rvt dev struct
  59 *
   60 * Do any initialization needed when a driver registers with rdmavt.
  61 *
  62 * Return: 0 on success or errno on failure
  63 */
  64int rvt_driver_mr_init(struct rvt_dev_info *rdi)
  65{
  66        unsigned int lkey_table_size = rdi->dparms.lkey_table_size;
  67        unsigned lk_tab_size;
  68        int i;
  69
  70        /*
   71         * The top lkey_table_size bits are used to index the
  72         * table.  The lower 8 bits can be owned by the user (copied from
  73         * the LKEY).  The remaining bits act as a generation number or tag.
  74         */
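             /*
              * Illustrative layout (an example, assuming a driver registers
              * with lkey_table_size = 16):
              *
              *   bits 31..16  table index r (lkey_table.shift == 16)
              *   bits 15..8   generation tag, rkt->gen masked to 8 bits
              *   bits  7..0   user-owned portion of the LKEY
              *
              * See rvt_alloc_lkey() for how the lkey is actually composed.
              */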
  75        if (!lkey_table_size)
  76                return -EINVAL;
  77
  78        spin_lock_init(&rdi->lkey_table.lock);
  79
  80        /* ensure generation is at least 4 bits */
  81        if (lkey_table_size > RVT_MAX_LKEY_TABLE_BITS) {
  82                rvt_pr_warn(rdi, "lkey bits %u too large, reduced to %u\n",
  83                            lkey_table_size, RVT_MAX_LKEY_TABLE_BITS);
  84                rdi->dparms.lkey_table_size = RVT_MAX_LKEY_TABLE_BITS;
  85                lkey_table_size = rdi->dparms.lkey_table_size;
  86        }
  87        rdi->lkey_table.max = 1 << lkey_table_size;
  88        rdi->lkey_table.shift = 32 - lkey_table_size;
  89        lk_tab_size = rdi->lkey_table.max * sizeof(*rdi->lkey_table.table);
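             /*
              * Illustrative sizing, not a requirement: with lkey_table_size
              * of 16 this is 65536 pointers, i.e. 512 KiB of vmalloc'd
              * memory on a 64-bit kernel.
              */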
  90        rdi->lkey_table.table = (struct rvt_mregion __rcu **)
  91                               vmalloc_node(lk_tab_size, rdi->dparms.node);
  92        if (!rdi->lkey_table.table)
  93                return -ENOMEM;
  94
  95        RCU_INIT_POINTER(rdi->dma_mr, NULL);
  96        for (i = 0; i < rdi->lkey_table.max; i++)
  97                RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL);
  98
  99        return 0;
 100}
 101
 102/**
  103 * rvt_mr_exit - clean up MR
  104 * @rdi: rvt dev structure
  105 *
  106 * Called when drivers have unregistered or perhaps failed to register with us.
 107 */
 108void rvt_mr_exit(struct rvt_dev_info *rdi)
 109{
 110        if (rdi->dma_mr)
 111                rvt_pr_err(rdi, "DMA MR not null!\n");
 112
 113        vfree(rdi->lkey_table.table);
 114}
 115
 116static void rvt_deinit_mregion(struct rvt_mregion *mr)
 117{
 118        int i = mr->mapsz;
 119
 120        mr->mapsz = 0;
 121        while (i)
 122                kfree(mr->map[--i]);
 123        percpu_ref_exit(&mr->refcount);
 124}
 125
 126static void __rvt_mregion_complete(struct percpu_ref *ref)
 127{
 128        struct rvt_mregion *mr = container_of(ref, struct rvt_mregion,
 129                                              refcount);
 130
 131        complete(&mr->comp);
 132}
 133
 134static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
 135                            int count, unsigned int percpu_flags)
 136{
 137        int m, i = 0;
 138        struct rvt_dev_info *dev = ib_to_rvt(pd->device);
 139
 140        mr->mapsz = 0;
 141        m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
 142        for (; i < m; i++) {
 143                mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
 144                                          dev->dparms.node);
 145                if (!mr->map[i])
 146                        goto bail;
 147                mr->mapsz++;
 148        }
 149        init_completion(&mr->comp);
 150        /* count returning the ptr to user */
 151        if (percpu_ref_init(&mr->refcount, &__rvt_mregion_complete,
 152                            percpu_flags, GFP_KERNEL))
 153                goto bail;
 154
 155        atomic_set(&mr->lkey_invalid, 0);
 156        mr->pd = pd;
 157        mr->max_segs = count;
 158        return 0;
 159bail:
 160        rvt_deinit_mregion(mr);
 161        return -ENOMEM;
 162}
 163
 164/**
 165 * rvt_alloc_lkey - allocate an lkey
 166 * @mr: memory region that this lkey protects
 167 * @dma_region: 0->normal key, 1->restricted DMA key
 168 *
 169 * Returns 0 if successful, otherwise returns -errno.
 170 *
 171 * Increments mr reference count as required.
 172 *
  173 * Sets the lkey field of mr for non-dma regions.
 174 *
 175 */
 176static int rvt_alloc_lkey(struct rvt_mregion *mr, int dma_region)
 177{
 178        unsigned long flags;
 179        u32 r;
 180        u32 n;
 181        int ret = 0;
 182        struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
 183        struct rvt_lkey_table *rkt = &dev->lkey_table;
 184
 185        rvt_get_mr(mr);
 186        spin_lock_irqsave(&rkt->lock, flags);
 187
 188        /* special case for dma_mr lkey == 0 */
 189        if (dma_region) {
 190                struct rvt_mregion *tmr;
 191
 192                tmr = rcu_access_pointer(dev->dma_mr);
 193                if (!tmr) {
 194                        mr->lkey_published = 1;
  195                        /* Ensure published is written first */
 196                        rcu_assign_pointer(dev->dma_mr, mr);
 197                        rvt_get_mr(mr);
 198                }
 199                goto success;
 200        }
 201
 202        /* Find the next available LKEY */
 203        r = rkt->next;
 204        n = r;
 205        for (;;) {
 206                if (!rcu_access_pointer(rkt->table[r]))
 207                        break;
 208                r = (r + 1) & (rkt->max - 1);
 209                if (r == n)
 210                        goto bail;
 211        }
 212        rkt->next = (r + 1) & (rkt->max - 1);
 213        /*
 214         * Make sure lkey is never zero which is reserved to indicate an
 215         * unrestricted LKEY.
 216         */
 217        rkt->gen++;
 218        /*
 219         * bits are capped to ensure enough bits for generation number
 220         */
 221        mr->lkey = (r << (32 - dev->dparms.lkey_table_size)) |
 222                ((((1 << (24 - dev->dparms.lkey_table_size)) - 1) & rkt->gen)
 223                 << 8);
 224        if (mr->lkey == 0) {
 225                mr->lkey |= 1 << 8;
 226                rkt->gen++;
 227        }
 228        mr->lkey_published = 1;
  229        /* Ensure published is written first */
 230        rcu_assign_pointer(rkt->table[r], mr);
 231success:
 232        spin_unlock_irqrestore(&rkt->lock, flags);
 233out:
 234        return ret;
 235bail:
 236        rvt_put_mr(mr);
 237        spin_unlock_irqrestore(&rkt->lock, flags);
 238        ret = -ENOMEM;
 239        goto out;
 240}
 241
 242/**
 243 * rvt_free_lkey - free an lkey
 244 * @mr: mr to free from tables
 245 */
 246static void rvt_free_lkey(struct rvt_mregion *mr)
 247{
 248        unsigned long flags;
 249        u32 lkey = mr->lkey;
 250        u32 r;
 251        struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
 252        struct rvt_lkey_table *rkt = &dev->lkey_table;
 253        int freed = 0;
 254
 255        spin_lock_irqsave(&rkt->lock, flags);
 256        if (!lkey) {
 257                if (mr->lkey_published) {
 258                        mr->lkey_published = 0;
  259                        /* ensure published is written before pointer */
 260                        rcu_assign_pointer(dev->dma_mr, NULL);
 261                        rvt_put_mr(mr);
 262                }
 263        } else {
 264                if (!mr->lkey_published)
 265                        goto out;
 266                r = lkey >> (32 - dev->dparms.lkey_table_size);
 267                mr->lkey_published = 0;
  268                /* ensure published is written before pointer */
 269                rcu_assign_pointer(rkt->table[r], NULL);
 270        }
 271        freed++;
 272out:
 273        spin_unlock_irqrestore(&rkt->lock, flags);
 274        if (freed)
 275                percpu_ref_kill(&mr->refcount);
 276}
 277
 278static struct rvt_mr *__rvt_alloc_mr(int count, struct ib_pd *pd)
 279{
 280        struct rvt_mr *mr;
 281        int rval = -ENOMEM;
 282        int m;
 283
 284        /* Allocate struct plus pointers to first level page tables. */
 285        m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
 286        mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL);
 287        if (!mr)
 288                goto bail;
 289
 290        rval = rvt_init_mregion(&mr->mr, pd, count, 0);
 291        if (rval)
 292                goto bail;
 293        /*
 294         * ib_reg_phys_mr() will initialize mr->ibmr except for
 295         * lkey and rkey.
 296         */
 297        rval = rvt_alloc_lkey(&mr->mr, 0);
 298        if (rval)
 299                goto bail_mregion;
 300        mr->ibmr.lkey = mr->mr.lkey;
 301        mr->ibmr.rkey = mr->mr.lkey;
 302done:
 303        return mr;
 304
 305bail_mregion:
 306        rvt_deinit_mregion(&mr->mr);
 307bail:
 308        kfree(mr);
 309        mr = ERR_PTR(rval);
 310        goto done;
 311}
 312
 313static void __rvt_free_mr(struct rvt_mr *mr)
 314{
 315        rvt_free_lkey(&mr->mr);
 316        rvt_deinit_mregion(&mr->mr);
 317        kfree(mr);
 318}
 319
 320/**
 321 * rvt_get_dma_mr - get a DMA memory region
 322 * @pd: protection domain for this memory region
 323 * @acc: access flags
 324 *
 325 * Return: the memory region on success, otherwise returns an errno.
 326 * Note that all DMA addresses should be created via the functions in
 327 * struct dma_virt_ops.
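      *
      * The returned region is published as the per-device dma_mr pointer;
      * rvt_lkey_ok() and rvt_rkey_ok() fall back to it when an lkey or
      * rkey of 0 is used.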
 328 */
 329struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc)
 330{
 331        struct rvt_mr *mr;
 332        struct ib_mr *ret;
 333        int rval;
 334
 335        if (ibpd_to_rvtpd(pd)->user)
 336                return ERR_PTR(-EPERM);
 337
 338        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 339        if (!mr) {
 340                ret = ERR_PTR(-ENOMEM);
 341                goto bail;
 342        }
 343
 344        rval = rvt_init_mregion(&mr->mr, pd, 0, 0);
 345        if (rval) {
 346                ret = ERR_PTR(rval);
 347                goto bail;
 348        }
 349
 350        rval = rvt_alloc_lkey(&mr->mr, 1);
 351        if (rval) {
 352                ret = ERR_PTR(rval);
 353                goto bail_mregion;
 354        }
 355
 356        mr->mr.access_flags = acc;
 357        ret = &mr->ibmr;
 358done:
 359        return ret;
 360
 361bail_mregion:
 362        rvt_deinit_mregion(&mr->mr);
 363bail:
 364        kfree(mr);
 365        goto done;
 366}
 367
 368/**
 369 * rvt_reg_user_mr - register a userspace memory region
 370 * @pd: protection domain for this memory region
 371 * @start: starting userspace address
 372 * @length: length of region to register
      * @virt_addr: virtual address to associate with this memory region
  373 * @mr_access_flags: access flags for this memory region
 374 * @udata: unused by the driver
 375 *
 376 * Return: the memory region on success, otherwise returns an errno.
 377 */
 378struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 379                              u64 virt_addr, int mr_access_flags,
 380                              struct ib_udata *udata)
 381{
 382        struct rvt_mr *mr;
 383        struct ib_umem *umem;
 384        struct scatterlist *sg;
 385        int n, m, entry;
 386        struct ib_mr *ret;
 387
 388        if (length == 0)
 389                return ERR_PTR(-EINVAL);
 390
 391        umem = ib_umem_get(pd->uobject->context, start, length,
 392                           mr_access_flags, 0);
 393        if (IS_ERR(umem))
 394                return (void *)umem;
 395
 396        n = umem->nmap;
 397
 398        mr = __rvt_alloc_mr(n, pd);
 399        if (IS_ERR(mr)) {
 400                ret = (struct ib_mr *)mr;
 401                goto bail_umem;
 402        }
 403
 404        mr->mr.user_base = start;
 405        mr->mr.iova = virt_addr;
 406        mr->mr.length = length;
 407        mr->mr.offset = ib_umem_offset(umem);
 408        mr->mr.access_flags = mr_access_flags;
 409        mr->umem = umem;
 410
 411        mr->mr.page_shift = umem->page_shift;
 412        m = 0;
 413        n = 0;
 414        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
 415                void *vaddr;
 416
 417                vaddr = page_address(sg_page(sg));
 418                if (!vaddr) {
 419                        ret = ERR_PTR(-EINVAL);
 420                        goto bail_inval;
 421                }
 422                mr->mr.map[m]->segs[n].vaddr = vaddr;
 423                mr->mr.map[m]->segs[n].length = BIT(umem->page_shift);
 424                trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr,
 425                                      BIT(umem->page_shift));
 426                n++;
 427                if (n == RVT_SEGSZ) {
 428                        m++;
 429                        n = 0;
 430                }
 431        }
 432        return &mr->ibmr;
 433
 434bail_inval:
 435        __rvt_free_mr(mr);
 436
 437bail_umem:
 438        ib_umem_release(umem);
 439
 440        return ret;
 441}
 442
 443/**
 444 * rvt_dereg_clean_qp_cb - callback from iterator
 445 * @qp - the qp
 446 * @v - the mregion (as u64)
 447 *
  448 * This routine is the callback invoked for every QP by the iterator;
  449 * for QPs in the same PD as the MR it calls rvt_qp_mr_clean() to
  450 * potentially clean up references.
 451 */
 452static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v)
 453{
 454        struct rvt_mregion *mr = (struct rvt_mregion *)v;
 455
 456        /* skip PDs that are not ours */
 457        if (mr->pd != qp->ibqp.pd)
 458                return;
 459        rvt_qp_mr_clean(qp, mr->lkey);
 460}
 461
 462/**
 463 * rvt_dereg_clean_qps - find QPs for reference cleanup
 464 * @mr - the MR that is being deregistered
 465 *
 466 * This routine iterates RC QPs looking for references
 467 * to the lkey noted in mr.
 468 */
 469static void rvt_dereg_clean_qps(struct rvt_mregion *mr)
 470{
 471        struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
 472
 473        rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb);
 474}
 475
 476/**
 477 * rvt_check_refs - check references
  478 * @mr - the mregion
  479 * @t - the caller identification
  480 *
  481 * This routine checks whether an MR still holds references
  482 * while it is being de-registered.
  483 *
  484 * If the count is non-zero, the code calls a cleanup routine and then
  485 * waits up to a timeout for the count to drop to zero.
 486 */
 487static int rvt_check_refs(struct rvt_mregion *mr, const char *t)
 488{
 489        unsigned long timeout;
 490        struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
 491
 492        if (percpu_ref_is_zero(&mr->refcount))
 493                return 0;
 494        /* avoid dma mr */
 495        if (mr->lkey)
 496                rvt_dereg_clean_qps(mr);
 497        timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ);
 498        if (!timeout) {
 499                rvt_pr_err(rdi,
 500                           "%s timeout mr %p pd %p lkey %x refcount %ld\n",
 501                           t, mr, mr->pd, mr->lkey,
 502                           atomic_long_read(&mr->refcount.count));
 503                rvt_get_mr(mr);
 504                return -EBUSY;
 505        }
 506        return 0;
 507}
 508
 509/**
  510 * rvt_mr_has_lkey - does this MR match the lkey
 511 * @mr - the mregion
 512 * @lkey - the lkey
 513 */
 514bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey)
 515{
 516        return mr && lkey == mr->lkey;
 517}
 518
 519/**
  520 * rvt_ss_has_lkey - is the lkey referenced by the sge state
  521 * @ss - the sge state
  522 * @lkey - the lkey to look for
  523 *
  524 * This code tests whether any MR in the indicated
  525 * sge state matches the lkey.
 526 */
 527bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey)
 528{
 529        int i;
 530        bool rval = false;
 531
 532        if (!ss->num_sge)
 533                return rval;
 534        /* first one */
 535        rval = rvt_mr_has_lkey(ss->sge.mr, lkey);
 536        /* any others */
 537        for (i = 0; !rval && i < ss->num_sge - 1; i++)
 538                rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey);
 539        return rval;
 540}
 541
 542/**
 543 * rvt_dereg_mr - unregister and free a memory region
 544 * @ibmr: the memory region to free
 545 *
 546 *
 547 * Note that this is called to free MRs created by rvt_get_dma_mr()
 548 * or rvt_reg_user_mr().
 549 *
 550 * Returns 0 on success.
 551 */
 552int rvt_dereg_mr(struct ib_mr *ibmr)
 553{
 554        struct rvt_mr *mr = to_imr(ibmr);
 555        int ret;
 556
 557        rvt_free_lkey(&mr->mr);
 558
 559        rvt_put_mr(&mr->mr); /* will set completion if last */
 560        ret = rvt_check_refs(&mr->mr, __func__);
 561        if (ret)
 562                goto out;
 563        rvt_deinit_mregion(&mr->mr);
 564        if (mr->umem)
 565                ib_umem_release(mr->umem);
 566        kfree(mr);
 567out:
 568        return ret;
 569}
 570
 571/**
  572 * rvt_alloc_mr - Allocate a memory region usable with the IB_WR_REG_MR work request
 573 * @pd: protection domain for this memory region
 574 * @mr_type: mem region type
 575 * @max_num_sg: Max number of segments allowed
 576 *
 577 * Return: the memory region on success, otherwise return an errno.
 578 */
 579struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
 580                           enum ib_mr_type mr_type,
 581                           u32 max_num_sg)
 582{
 583        struct rvt_mr *mr;
 584
 585        if (mr_type != IB_MR_TYPE_MEM_REG)
 586                return ERR_PTR(-EINVAL);
 587
 588        mr = __rvt_alloc_mr(max_num_sg, pd);
 589        if (IS_ERR(mr))
 590                return (struct ib_mr *)mr;
 591
 592        return &mr->ibmr;
 593}
 594
 595/**
 596 * rvt_set_page - page assignment function called by ib_sg_to_pages
 597 * @ibmr: memory region
 598 * @addr: dma address of mapped page
 599 *
 600 * Return: 0 on success
 601 */
 602static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
 603{
 604        struct rvt_mr *mr = to_imr(ibmr);
 605        u32 ps = 1 << mr->mr.page_shift;
 606        u32 mapped_segs = mr->mr.length >> mr->mr.page_shift;
 607        int m, n;
 608
 609        if (unlikely(mapped_segs == mr->mr.max_segs))
 610                return -ENOMEM;
 611
 612        if (mr->mr.length == 0) {
 613                mr->mr.user_base = addr;
 614                mr->mr.iova = addr;
 615        }
 616
 617        m = mapped_segs / RVT_SEGSZ;
 618        n = mapped_segs % RVT_SEGSZ;
 619        mr->mr.map[m]->segs[n].vaddr = (void *)addr;
 620        mr->mr.map[m]->segs[n].length = ps;
 621        trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
 622        mr->mr.length += ps;
 623
 624        return 0;
 625}
 626
 627/**
  628 * rvt_map_mr_sg - map sg list and set it in the memory region
 629 * @ibmr: memory region
 630 * @sg: dma mapped scatterlist
 631 * @sg_nents: number of entries in sg
 632 * @sg_offset: offset in bytes into sg
 633 *
 634 * Return: number of sg elements mapped to the memory region
 635 */
 636int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 637                  int sg_nents, unsigned int *sg_offset)
 638{
 639        struct rvt_mr *mr = to_imr(ibmr);
 640
 641        mr->mr.length = 0;
 642        mr->mr.page_shift = PAGE_SHIFT;
 643        return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
 644                              rvt_set_page);
 645}
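
     /*
      * Rough usage sketch (illustrative, not part of the original code): a
      * kernel ULP typically allocates the MR with ib_alloc_mr(pd,
      * IB_MR_TYPE_MEM_REG, nents), maps a DMA-mapped scatterlist with
      * ib_map_mr_sg() (which lands here and in rvt_set_page()), and then
      * posts an IB_WR_REG_MR work request; the driver's send path calls
      * rvt_fast_reg_mr() below to publish the updated key and access flags.
      */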
 646
 647/**
 648 * rvt_fast_reg_mr - fast register physical MR
 649 * @qp: the queue pair where the work request comes from
 650 * @ibmr: the memory region to be registered
 651 * @key: updated key for this memory region
 652 * @access: access flags for this memory region
 653 *
 654 * Returns 0 on success.
 655 */
 656int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
 657                    int access)
 658{
 659        struct rvt_mr *mr = to_imr(ibmr);
 660
 661        if (qp->ibqp.pd != mr->mr.pd)
 662                return -EACCES;
 663
 664        /* not applicable to dma MR or user MR */
 665        if (!mr->mr.lkey || mr->umem)
 666                return -EINVAL;
 667
 668        if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00))
 669                return -EINVAL;
 670
 671        ibmr->lkey = key;
 672        ibmr->rkey = key;
 673        mr->mr.lkey = key;
 674        mr->mr.access_flags = access;
 675        atomic_set(&mr->mr.lkey_invalid, 0);
 676
 677        return 0;
 678}
 679EXPORT_SYMBOL(rvt_fast_reg_mr);
 680
 681/**
 682 * rvt_invalidate_rkey - invalidate an MR rkey
 683 * @qp: queue pair associated with the invalidate op
 684 * @rkey: rkey to invalidate
 685 *
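      * Typically invoked when the driver processes an IB_WR_LOCAL_INV work
      * request or an incoming Send with Invalidate (illustrative note, not
      * part of the original comment).
      *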
 686 * Returns 0 on success.
 687 */
 688int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey)
 689{
 690        struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
 691        struct rvt_lkey_table *rkt = &dev->lkey_table;
 692        struct rvt_mregion *mr;
 693
 694        if (rkey == 0)
 695                return -EINVAL;
 696
 697        rcu_read_lock();
 698        mr = rcu_dereference(
 699                rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
 700        if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
 701                goto bail;
 702
 703        atomic_set(&mr->lkey_invalid, 1);
 704        rcu_read_unlock();
 705        return 0;
 706
 707bail:
 708        rcu_read_unlock();
 709        return -EINVAL;
 710}
 711EXPORT_SYMBOL(rvt_invalidate_rkey);
 712
 713/**
 714 * rvt_alloc_fmr - allocate a fast memory region
 715 * @pd: the protection domain for this memory region
 716 * @mr_access_flags: access flags for this memory region
 717 * @fmr_attr: fast memory region attributes
 718 *
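      * Note (illustrative, not part of the original comment): this pairs
      * with rvt_map_phys_fmr(), rvt_unmap_fmr() and rvt_dealloc_fmr()
      * below to form the legacy FMR flow.
      *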
 719 * Return: the memory region on success, otherwise returns an errno.
 720 */
 721struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
 722                             struct ib_fmr_attr *fmr_attr)
 723{
 724        struct rvt_fmr *fmr;
 725        int m;
 726        struct ib_fmr *ret;
 727        int rval = -ENOMEM;
 728
 729        /* Allocate struct plus pointers to first level page tables. */
 730        m = (fmr_attr->max_pages + RVT_SEGSZ - 1) / RVT_SEGSZ;
 731        fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL);
 732        if (!fmr)
 733                goto bail;
 734
 735        rval = rvt_init_mregion(&fmr->mr, pd, fmr_attr->max_pages,
 736                                PERCPU_REF_INIT_ATOMIC);
 737        if (rval)
 738                goto bail;
 739
 740        /*
 741         * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
 742         * rkey.
 743         */
 744        rval = rvt_alloc_lkey(&fmr->mr, 0);
 745        if (rval)
 746                goto bail_mregion;
 747        fmr->ibfmr.rkey = fmr->mr.lkey;
 748        fmr->ibfmr.lkey = fmr->mr.lkey;
 749        /*
 750         * Resources are allocated but no valid mapping (RKEY can't be
 751         * used).
 752         */
 753        fmr->mr.access_flags = mr_access_flags;
 754        fmr->mr.max_segs = fmr_attr->max_pages;
 755        fmr->mr.page_shift = fmr_attr->page_shift;
 756
 757        ret = &fmr->ibfmr;
 758done:
 759        return ret;
 760
 761bail_mregion:
 762        rvt_deinit_mregion(&fmr->mr);
 763bail:
 764        kfree(fmr);
 765        ret = ERR_PTR(rval);
 766        goto done;
 767}
 768
 769/**
 770 * rvt_map_phys_fmr - set up a fast memory region
  771 * @ibfmr: the fast memory region to set up
 772 * @page_list: the list of pages to associate with the fast memory region
 773 * @list_len: the number of pages to associate with the fast memory region
 774 * @iova: the virtual address of the start of the fast memory region
 775 *
 776 * This may be called from interrupt context.
 777 *
 778 * Return: 0 on success
 779 */
 780
 781int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
 782                     int list_len, u64 iova)
 783{
 784        struct rvt_fmr *fmr = to_ifmr(ibfmr);
 785        struct rvt_lkey_table *rkt;
 786        unsigned long flags;
 787        int m, n;
 788        unsigned long i;
 789        u32 ps;
 790        struct rvt_dev_info *rdi = ib_to_rvt(ibfmr->device);
 791
 792        i = atomic_long_read(&fmr->mr.refcount.count);
 793        if (i > 2)
 794                return -EBUSY;
 795
 796        if (list_len > fmr->mr.max_segs)
 797                return -EINVAL;
 798
 799        rkt = &rdi->lkey_table;
 800        spin_lock_irqsave(&rkt->lock, flags);
 801        fmr->mr.user_base = iova;
 802        fmr->mr.iova = iova;
 803        ps = 1 << fmr->mr.page_shift;
 804        fmr->mr.length = list_len * ps;
 805        m = 0;
 806        n = 0;
 807        for (i = 0; i < list_len; i++) {
 808                fmr->mr.map[m]->segs[n].vaddr = (void *)page_list[i];
 809                fmr->mr.map[m]->segs[n].length = ps;
 810                trace_rvt_mr_fmr_seg(&fmr->mr, m, n, (void *)page_list[i], ps);
 811                if (++n == RVT_SEGSZ) {
 812                        m++;
 813                        n = 0;
 814                }
 815        }
 816        spin_unlock_irqrestore(&rkt->lock, flags);
 817        return 0;
 818}
 819
 820/**
 821 * rvt_unmap_fmr - unmap fast memory regions
 822 * @fmr_list: the list of fast memory regions to unmap
 823 *
 824 * Return: 0 on success.
 825 */
 826int rvt_unmap_fmr(struct list_head *fmr_list)
 827{
 828        struct rvt_fmr *fmr;
 829        struct rvt_lkey_table *rkt;
 830        unsigned long flags;
 831        struct rvt_dev_info *rdi;
 832
 833        list_for_each_entry(fmr, fmr_list, ibfmr.list) {
 834                rdi = ib_to_rvt(fmr->ibfmr.device);
 835                rkt = &rdi->lkey_table;
 836                spin_lock_irqsave(&rkt->lock, flags);
 837                fmr->mr.user_base = 0;
 838                fmr->mr.iova = 0;
 839                fmr->mr.length = 0;
 840                spin_unlock_irqrestore(&rkt->lock, flags);
 841        }
 842        return 0;
 843}
 844
 845/**
 846 * rvt_dealloc_fmr - deallocate a fast memory region
 847 * @ibfmr: the fast memory region to deallocate
 848 *
 849 * Return: 0 on success.
 850 */
 851int rvt_dealloc_fmr(struct ib_fmr *ibfmr)
 852{
 853        struct rvt_fmr *fmr = to_ifmr(ibfmr);
 854        int ret = 0;
 855
 856        rvt_free_lkey(&fmr->mr);
 857        rvt_put_mr(&fmr->mr); /* will set completion if last */
 858        ret = rvt_check_refs(&fmr->mr, __func__);
 859        if (ret)
 860                goto out;
 861        rvt_deinit_mregion(&fmr->mr);
 862        kfree(fmr);
 863out:
 864        return ret;
 865}
 866
 867/**
 868 * rvt_sge_adjacent - is isge compressible
 869 * @last_sge: last outgoing SGE written
 870 * @sge: SGE to check
 871 *
  872 * If adjacent, last_sge is updated to add the length.
 873 *
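      * For example (illustrative): if the previous SGE ends at address
      * 0x2000 and the next ib_sge with the same lkey starts at 0x2000 with
      * length 0x1000, the new entry is folded into the previous rvt_sge and
      * its sge_length grows by 0x1000.
      *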
 874 * Return: true if isge is adjacent to last sge
 875 */
 876static inline bool rvt_sge_adjacent(struct rvt_sge *last_sge,
 877                                    struct ib_sge *sge)
 878{
 879        if (last_sge && sge->lkey == last_sge->mr->lkey &&
 880            ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) {
 881                if (sge->lkey) {
 882                        if (unlikely((sge->addr - last_sge->mr->user_base +
 883                              sge->length > last_sge->mr->length)))
 884                                return false; /* overrun, caller will catch */
 885                } else {
 886                        last_sge->length += sge->length;
 887                }
 888                last_sge->sge_length += sge->length;
 889                trace_rvt_sge_adjacent(last_sge, sge);
 890                return true;
 891        }
 892        return false;
 893}
 894
 895/**
 896 * rvt_lkey_ok - check IB SGE for validity and initialize
 897 * @rkt: table containing lkey to check SGE against
 898 * @pd: protection domain
 899 * @isge: outgoing internal SGE
 900 * @last_sge: last outgoing SGE written
 901 * @sge: SGE to check
 902 * @acc: access flags
 903 *
 904 * Check the IB SGE for validity and initialize our internal version
 905 * of it.
 906 *
 907 * Increments the reference count when a new sge is stored.
 908 *
  909 * Return: 0 if compressed, 1 if added, otherwise returns -errno.
 910 */
 911int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
 912                struct rvt_sge *isge, struct rvt_sge *last_sge,
 913                struct ib_sge *sge, int acc)
 914{
 915        struct rvt_mregion *mr;
 916        unsigned n, m;
 917        size_t off;
 918
 919        /*
 920         * We use LKEY == zero for kernel virtual addresses
 921         * (see rvt_get_dma_mr() and dma_virt_ops).
 922         */
 923        if (sge->lkey == 0) {
 924                struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device);
 925
 926                if (pd->user)
 927                        return -EINVAL;
 928                if (rvt_sge_adjacent(last_sge, sge))
 929                        return 0;
 930                rcu_read_lock();
 931                mr = rcu_dereference(dev->dma_mr);
 932                if (!mr)
 933                        goto bail;
 934                rvt_get_mr(mr);
 935                rcu_read_unlock();
 936
 937                isge->mr = mr;
 938                isge->vaddr = (void *)sge->addr;
 939                isge->length = sge->length;
 940                isge->sge_length = sge->length;
 941                isge->m = 0;
 942                isge->n = 0;
 943                goto ok;
 944        }
 945        if (rvt_sge_adjacent(last_sge, sge))
 946                return 0;
 947        rcu_read_lock();
 948        mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]);
 949        if (!mr)
 950                goto bail;
 951        rvt_get_mr(mr);
 952        if (!READ_ONCE(mr->lkey_published))
 953                goto bail_unref;
 954
 955        if (unlikely(atomic_read(&mr->lkey_invalid) ||
 956                     mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
 957                goto bail_unref;
 958
 959        off = sge->addr - mr->user_base;
 960        if (unlikely(sge->addr < mr->user_base ||
 961                     off + sge->length > mr->length ||
 962                     (mr->access_flags & acc) != acc))
 963                goto bail_unref;
 964        rcu_read_unlock();
 965
 966        off += mr->offset;
 967        if (mr->page_shift) {
  968                /*
  969                 * Page sizes are a uniform power of 2, so no loop is necessary;
  970                 * entries_spanned_by_off is the number of times the loop below
  971                 * would have executed.
  972                 */
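                     /*
                      * Worked example (illustrative, assuming 4 KiB pages so
                      * page_shift == 12): off == 0x5000 spans 5 entries, so
                      * entries_spanned_by_off == 5, off becomes 0, and then
                      * m = 5 / RVT_SEGSZ, n = 5 % RVT_SEGSZ.
                      */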
 973                size_t entries_spanned_by_off;
 974
 975                entries_spanned_by_off = off >> mr->page_shift;
 976                off -= (entries_spanned_by_off << mr->page_shift);
 977                m = entries_spanned_by_off / RVT_SEGSZ;
 978                n = entries_spanned_by_off % RVT_SEGSZ;
 979        } else {
 980                m = 0;
 981                n = 0;
 982                while (off >= mr->map[m]->segs[n].length) {
 983                        off -= mr->map[m]->segs[n].length;
 984                        n++;
 985                        if (n >= RVT_SEGSZ) {
 986                                m++;
 987                                n = 0;
 988                        }
 989                }
 990        }
 991        isge->mr = mr;
 992        isge->vaddr = mr->map[m]->segs[n].vaddr + off;
 993        isge->length = mr->map[m]->segs[n].length - off;
 994        isge->sge_length = sge->length;
 995        isge->m = m;
 996        isge->n = n;
 997ok:
 998        trace_rvt_sge_new(isge, sge);
 999        return 1;
1000bail_unref:
1001        rvt_put_mr(mr);
1002bail:
1003        rcu_read_unlock();
1004        return -EINVAL;
1005}
1006EXPORT_SYMBOL(rvt_lkey_ok);
1007
1008/**
1009 * rvt_rkey_ok - check the IB virtual address, length, and RKEY
1010 * @qp: qp for validation
1011 * @sge: SGE state
1012 * @len: length of data
1013 * @vaddr: virtual address to place data
1014 * @rkey: rkey to check
1015 * @acc: access flags
1016 *
1017 * Return: 1 if successful, otherwise 0.
1018 *
1019 * increments the reference count upon success
1020 */
1021int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
1022                u32 len, u64 vaddr, u32 rkey, int acc)
1023{
1024        struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
1025        struct rvt_lkey_table *rkt = &dev->lkey_table;
1026        struct rvt_mregion *mr;
1027        unsigned n, m;
1028        size_t off;
1029
1030        /*
1031         * We use RKEY == zero for kernel virtual addresses
1032         * (see rvt_get_dma_mr() and dma_virt_ops).
1033         */
1034        rcu_read_lock();
1035        if (rkey == 0) {
1036                struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd);
1037                struct rvt_dev_info *rdi = ib_to_rvt(pd->ibpd.device);
1038
1039                if (pd->user)
1040                        goto bail;
1041                mr = rcu_dereference(rdi->dma_mr);
1042                if (!mr)
1043                        goto bail;
1044                rvt_get_mr(mr);
1045                rcu_read_unlock();
1046
1047                sge->mr = mr;
1048                sge->vaddr = (void *)vaddr;
1049                sge->length = len;
1050                sge->sge_length = len;
1051                sge->m = 0;
1052                sge->n = 0;
1053                goto ok;
1054        }
1055
1056        mr = rcu_dereference(rkt->table[rkey >> rkt->shift]);
1057        if (!mr)
1058                goto bail;
1059        rvt_get_mr(mr);
 1060        /* ensure mr read is before test */
1061        if (!READ_ONCE(mr->lkey_published))
1062                goto bail_unref;
1063        if (unlikely(atomic_read(&mr->lkey_invalid) ||
1064                     mr->lkey != rkey || qp->ibqp.pd != mr->pd))
1065                goto bail_unref;
1066
1067        off = vaddr - mr->iova;
1068        if (unlikely(vaddr < mr->iova || off + len > mr->length ||
1069                     (mr->access_flags & acc) == 0))
1070                goto bail_unref;
1071        rcu_read_unlock();
1072
1073        off += mr->offset;
1074        if (mr->page_shift) {
 1075                /*
 1076                 * Page sizes are a uniform power of 2, so no loop is necessary;
 1077                 * entries_spanned_by_off is the number of times the loop below
 1078                 * would have executed.
 1079                 */
1080                size_t entries_spanned_by_off;
1081
1082                entries_spanned_by_off = off >> mr->page_shift;
1083                off -= (entries_spanned_by_off << mr->page_shift);
1084                m = entries_spanned_by_off / RVT_SEGSZ;
1085                n = entries_spanned_by_off % RVT_SEGSZ;
1086        } else {
1087                m = 0;
1088                n = 0;
1089                while (off >= mr->map[m]->segs[n].length) {
1090                        off -= mr->map[m]->segs[n].length;
1091                        n++;
1092                        if (n >= RVT_SEGSZ) {
1093                                m++;
1094                                n = 0;
1095                        }
1096                }
1097        }
1098        sge->mr = mr;
1099        sge->vaddr = mr->map[m]->segs[n].vaddr + off;
1100        sge->length = mr->map[m]->segs[n].length - off;
1101        sge->sge_length = len;
1102        sge->m = m;
1103        sge->n = n;
1104ok:
1105        return 1;
1106bail_unref:
1107        rvt_put_mr(mr);
1108bail:
1109        rcu_read_unlock();
1110        return 0;
1111}
1112EXPORT_SYMBOL(rvt_rkey_ok);
1113