linux/drivers/infiniband/core/rw.c
/*
 * Copyright (c) 2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#include <linux/moduleparam.h>
#include <linux/slab.h>
#include <linux/pci-p2pdma.h>
#include <rdma/mr_pool.h>
#include <rdma/rw.h>

enum {
        RDMA_RW_SINGLE_WR,
        RDMA_RW_MULTI_WR,
        RDMA_RW_MR,
        RDMA_RW_SIG_MR,
};

static bool rdma_rw_force_mr;
module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");

/*
 * Check if the device might use memory registration.  This is currently only
 * true for iWarp devices. In the future we can hopefully fine tune this based
 * on HCA driver input.
 */
static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
{
        if (rdma_protocol_iwarp(dev, port_num))
                return true;
        if (unlikely(rdma_rw_force_mr))
                return true;
        return false;
}

/*
 * Check if the device will use memory registration for this RW operation.
 * We currently always use memory registrations for iWarp RDMA READs, and
 * have a debug option to force usage of MRs.
 *
 * XXX: In the future we can hopefully fine tune this based on HCA driver
 * input.
 */
static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
                enum dma_data_direction dir, int dma_nents)
{
        if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE)
                return true;
        if (unlikely(rdma_rw_force_mr))
                return true;
        return false;
}

static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
{
        /* arbitrary limit to avoid allocating gigantic resources */
        return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
}

/* Caller must have zero-initialized *reg. */
static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
                struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
                u32 sg_cnt, u32 offset)
{
        u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
        u32 nents = min(sg_cnt, pages_per_mr);
        int count = 0, ret;

        reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
        if (!reg->mr)
                return -EAGAIN;

        if (reg->mr->need_inval) {
                reg->inv_wr.opcode = IB_WR_LOCAL_INV;
                reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
                reg->inv_wr.next = &reg->reg_wr.wr;
                count++;
        } else {
                reg->inv_wr.next = NULL;
        }

        ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
        if (ret < 0 || ret < nents) {
                ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
                return -EINVAL;
        }

        reg->reg_wr.wr.opcode = IB_WR_REG_MR;
        reg->reg_wr.mr = reg->mr;
        reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
        if (rdma_protocol_iwarp(qp->device, port_num))
                reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
        count++;

        reg->sge.addr = reg->mr->iova;
        reg->sge.length = reg->mr->length;
        return count;
}

static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
        struct rdma_rw_reg_ctx *prev = NULL;
        u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
        int i, j, ret = 0, count = 0;

        ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
        ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
        if (!ctx->reg) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < ctx->nr_ops; i++) {
                struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
                u32 nents = min(sg_cnt, pages_per_mr);

                ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
                                offset);
                if (ret < 0)
                        goto out_free;
                count += ret;

                if (prev) {
                        if (reg->mr->need_inval)
                                prev->wr.wr.next = &reg->inv_wr;
                        else
                                prev->wr.wr.next = &reg->reg_wr.wr;
                }

                reg->reg_wr.wr.next = &reg->wr.wr;

                reg->wr.wr.sg_list = &reg->sge;
                reg->wr.wr.num_sge = 1;
                reg->wr.remote_addr = remote_addr;
                reg->wr.rkey = rkey;
                if (dir == DMA_TO_DEVICE) {
                        reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
                } else if (!rdma_cap_read_inv(qp->device, port_num)) {
                        reg->wr.wr.opcode = IB_WR_RDMA_READ;
                } else {
                        reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
                        reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
                }
                count++;

                remote_addr += reg->sge.length;
                sg_cnt -= nents;
                for (j = 0; j < nents; j++)
                        sg = sg_next(sg);
                prev = reg;
                offset = 0;
        }

        if (prev)
                prev->wr.wr.next = NULL;

        ctx->type = RDMA_RW_MR;
        return count;

out_free:
        while (--i >= 0)
                ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
        kfree(ctx->reg);
out:
        return ret;
}

static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                struct scatterlist *sg, u32 sg_cnt, u32 offset,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
        struct ib_device *dev = qp->pd->device;
        u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
                      qp->max_read_sge;
        struct ib_sge *sge;
        u32 total_len = 0, i, j;

        ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);

        ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
        if (!ctx->map.sges)
                goto out;

        ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
        if (!ctx->map.wrs)
                goto out_free_sges;

        for (i = 0; i < ctx->nr_ops; i++) {
                struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
                u32 nr_sge = min(sg_cnt, max_sge);

                if (dir == DMA_TO_DEVICE)
                        rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
                else
                        rdma_wr->wr.opcode = IB_WR_RDMA_READ;
                rdma_wr->remote_addr = remote_addr + total_len;
                rdma_wr->rkey = rkey;
                rdma_wr->wr.num_sge = nr_sge;
                rdma_wr->wr.sg_list = sge;

                for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
                        sge->addr = ib_sg_dma_address(dev, sg) + offset;
                        sge->length = ib_sg_dma_len(dev, sg) - offset;
                        sge->lkey = qp->pd->local_dma_lkey;

                        total_len += sge->length;
                        sge++;
                        sg_cnt--;
                        offset = 0;
                }

                rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
                        &ctx->map.wrs[i + 1].wr : NULL;
        }

        ctx->type = RDMA_RW_MULTI_WR;
        return ctx->nr_ops;

out_free_sges:
        kfree(ctx->map.sges);
out:
        return -ENOMEM;
}

static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
                enum dma_data_direction dir)
{
        struct ib_device *dev = qp->pd->device;
        struct ib_rdma_wr *rdma_wr = &ctx->single.wr;

        ctx->nr_ops = 1;

        ctx->single.sge.lkey = qp->pd->local_dma_lkey;
        ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
        ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;

        memset(rdma_wr, 0, sizeof(*rdma_wr));
        if (dir == DMA_TO_DEVICE)
                rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
        else
                rdma_wr->wr.opcode = IB_WR_RDMA_READ;
        rdma_wr->wr.sg_list = &ctx->single.sge;
        rdma_wr->wr.num_sge = 1;
        rdma_wr->remote_addr = remote_addr;
        rdma_wr->rkey = rkey;

        ctx->type = RDMA_RW_SINGLE_WR;
        return 1;
}

/**
 * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
 * @ctx:        context to initialize
 * @qp:         queue pair to operate on
 * @port_num:   port num to which the connection is bound
 * @sg:         scatterlist to READ/WRITE from/to
 * @sg_cnt:     number of entries in @sg
 * @sg_offset:  current byte offset into @sg
 * @remote_addr:remote address to read/write (relative to @rkey)
 * @rkey:       remote key to operate on
 * @dir:        %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
 *
 * Returns the number of WQEs that will be needed on the workqueue if
 * successful, or a negative error code.
 */
int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
                struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
        struct ib_device *dev = qp->pd->device;
        int ret;

        if (is_pci_p2pdma_page(sg_page(sg)))
                ret = pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
        else
                ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);

        if (!ret)
                return -ENOMEM;
        sg_cnt = ret;

        /*
         * Skip to the S/G entry that sg_offset falls into:
         */
        for (;;) {
                u32 len = ib_sg_dma_len(dev, sg);

                if (sg_offset < len)
                        break;

                sg = sg_next(sg);
                sg_offset -= len;
                sg_cnt--;
        }

        ret = -EIO;
        if (WARN_ON_ONCE(sg_cnt == 0))
                goto out_unmap_sg;

        if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
                ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
                                sg_offset, remote_addr, rkey, dir);
        } else if (sg_cnt > 1) {
                ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
                                remote_addr, rkey, dir);
        } else {
                ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
                                remote_addr, rkey, dir);
        }

        if (ret < 0)
                goto out_unmap_sg;
        return ret;

out_unmap_sg:
        ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
        return ret;
}
EXPORT_SYMBOL(rdma_rw_ctx_init);
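
/*
 * Illustrative sketch, not part of this file: a minimal rdma_rw_ctx life
 * cycle as a ULP might drive it for one RDMA READ from a remote buffer into
 * a local scatterlist.  The connected QP, the mapped scatterlist and the
 * struct my_request that carries the context are hypothetical and owned by
 * the caller; only the rdma_rw_* and ib_cqe interfaces below are real.
 *
 *	static void my_rdma_done(struct ib_cq *cq, struct ib_wc *wc)
 *	{
 *		struct my_request *req =
 *			container_of(wc->wr_cqe, struct my_request, cqe);
 *
 *		rdma_rw_ctx_destroy(&req->rw_ctx, req->qp, req->port,
 *				req->sg, req->sg_cnt, DMA_FROM_DEVICE);
 *	}
 *
 *	static int my_read_remote(struct my_request *req, struct ib_qp *qp)
 *	{
 *		int ret;
 *
 *		req->cqe.done = my_rdma_done;
 *		ret = rdma_rw_ctx_init(&req->rw_ctx, qp, req->port, req->sg,
 *				req->sg_cnt, 0, req->remote_addr, req->rkey,
 *				DMA_FROM_DEVICE);
 *		if (ret < 0)
 *			return ret;
 *		ret = rdma_rw_ctx_post(&req->rw_ctx, qp, req->port, &req->cqe,
 *				NULL);
 *		if (ret)
 *			rdma_rw_ctx_destroy(&req->rw_ctx, qp, req->port,
 *					req->sg, req->sg_cnt, DMA_FROM_DEVICE);
 *		return ret;
 *	}
 */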

/**
 * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
 * @ctx:        context to initialize
 * @qp:         queue pair to operate on
 * @port_num:   port num to which the connection is bound
 * @sg:         scatterlist to READ/WRITE from/to
 * @sg_cnt:     number of entries in @sg
 * @prot_sg:    scatterlist to READ/WRITE protection information from/to
 * @prot_sg_cnt: number of entries in @prot_sg
 * @sig_attrs:  signature offloading algorithms
 * @remote_addr:remote address to read/write (relative to @rkey)
 * @rkey:       remote key to operate on
 * @dir:        %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
 *
 * Returns the number of WQEs that will be needed on the workqueue if
 * successful, or a negative error code.
 */
int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u8 port_num, struct scatterlist *sg, u32 sg_cnt,
                struct scatterlist *prot_sg, u32 prot_sg_cnt,
                struct ib_sig_attrs *sig_attrs,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
        struct ib_device *dev = qp->pd->device;
        u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
        struct ib_rdma_wr *rdma_wr;
        struct ib_send_wr *prev_wr = NULL;
        int count = 0, ret;

        if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
                pr_err("SG count too large\n");
                return -EINVAL;
        }

        ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
        if (!ret)
                return -ENOMEM;
        sg_cnt = ret;

        ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
        if (!ret) {
                ret = -ENOMEM;
                goto out_unmap_sg;
        }
        prot_sg_cnt = ret;

        ctx->type = RDMA_RW_SIG_MR;
        ctx->nr_ops = 1;
        ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
        if (!ctx->sig) {
                ret = -ENOMEM;
                goto out_unmap_prot_sg;
        }

        ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
        if (ret < 0)
                goto out_free_ctx;
        count += ret;
        prev_wr = &ctx->sig->data.reg_wr.wr;

        ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
                                  prot_sg, prot_sg_cnt, 0);
        if (ret < 0)
                goto out_destroy_data_mr;
        count += ret;

        if (ctx->sig->prot.inv_wr.next)
                prev_wr->next = &ctx->sig->prot.inv_wr;
        else
                prev_wr->next = &ctx->sig->prot.reg_wr.wr;
        prev_wr = &ctx->sig->prot.reg_wr.wr;

        ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
        if (!ctx->sig->sig_mr) {
                ret = -EAGAIN;
                goto out_destroy_prot_mr;
        }

        if (ctx->sig->sig_mr->need_inval) {
                memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));

                ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
                ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;

                prev_wr->next = &ctx->sig->sig_inv_wr;
                prev_wr = &ctx->sig->sig_inv_wr;
        }

        ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
        ctx->sig->sig_wr.wr.wr_cqe = NULL;
        ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
        ctx->sig->sig_wr.wr.num_sge = 1;
        ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
        ctx->sig->sig_wr.sig_attrs = sig_attrs;
        ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
        if (prot_sg_cnt)
                ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
        prev_wr->next = &ctx->sig->sig_wr.wr;
        prev_wr = &ctx->sig->sig_wr.wr;
        count++;

        ctx->sig->sig_sge.addr = 0;
        ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
        if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
                ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;

        rdma_wr = &ctx->sig->data.wr;
        rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
        rdma_wr->wr.num_sge = 1;
        rdma_wr->remote_addr = remote_addr;
        rdma_wr->rkey = rkey;
        if (dir == DMA_TO_DEVICE)
                rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
        else
                rdma_wr->wr.opcode = IB_WR_RDMA_READ;
        prev_wr->next = &rdma_wr->wr;
        prev_wr = &rdma_wr->wr;
        count++;

        return count;

out_destroy_prot_mr:
        if (prot_sg_cnt)
                ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
out_destroy_data_mr:
        ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
out_free_ctx:
        kfree(ctx->sig);
out_unmap_prot_sg:
        ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
out_unmap_sg:
        ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
        return ret;
}
EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
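
/*
 * Illustrative sketch, not part of this file: once a transfer set up with
 * rdma_rw_ctx_signature_init() has completed, the ULP would normally query
 * the signature MR before finishing the I/O so that a detected integrity
 * error can be reported rather than silently completing.  "sig_mr" stands
 * for the signature MR used for the transfer and is a hypothetical handle
 * owned by the caller:
 *
 *	struct ib_mr_status mr_status;
 *	int ret;
 *
 *	ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
 *	if (!ret && (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS))
 *		pr_err("PI error: type %d, offset %llu, expected 0x%x, actual 0x%x\n",
 *			mr_status.sig_err.err_type,
 *			mr_status.sig_err.sig_err_offset,
 *			mr_status.sig_err.expected,
 *			mr_status.sig_err.actual);
 */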

/*
 * Now that we are going to post the WRs we can update the lkey and need_inval
 * state on the MRs.  If we were doing this at init time, we would get double
 * or missing invalidations if a context was initialized but not actually
 * posted.
 */
static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
{
        reg->mr->need_inval = need_inval;
        ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
        reg->reg_wr.key = reg->mr->lkey;
        reg->sge.lkey = reg->mr->lkey;
}

/**
 * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
 * @ctx:        context to operate on
 * @qp:         queue pair to operate on
 * @port_num:   port num to which the connection is bound
 * @cqe:        completion queue entry for the last WR
 * @chain_wr:   WR to append to the posted chain
 *
 * Return the WR chain for the set of RDMA READ/WRITE operations described by
 * @ctx, as well as any memory registration operations needed.  If @chain_wr
 * is non-NULL the WR it points to will be appended to the chain of WRs posted.
 * If @chain_wr is not set @cqe must be set so that the caller gets a
 * completion notification.
 */
struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
{
        struct ib_send_wr *first_wr, *last_wr;
        int i;

        switch (ctx->type) {
        case RDMA_RW_SIG_MR:
                rdma_rw_update_lkey(&ctx->sig->data, true);
                if (ctx->sig->prot.mr)
                        rdma_rw_update_lkey(&ctx->sig->prot, true);

                ctx->sig->sig_mr->need_inval = true;
                ib_update_fast_reg_key(ctx->sig->sig_mr,
                        ib_inc_rkey(ctx->sig->sig_mr->lkey));
                ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;

                if (ctx->sig->data.inv_wr.next)
                        first_wr = &ctx->sig->data.inv_wr;
                else
                        first_wr = &ctx->sig->data.reg_wr.wr;
                last_wr = &ctx->sig->data.wr.wr;
                break;
        case RDMA_RW_MR:
                for (i = 0; i < ctx->nr_ops; i++) {
                        rdma_rw_update_lkey(&ctx->reg[i],
                                ctx->reg[i].wr.wr.opcode !=
                                        IB_WR_RDMA_READ_WITH_INV);
                }

                if (ctx->reg[0].inv_wr.next)
                        first_wr = &ctx->reg[0].inv_wr;
                else
                        first_wr = &ctx->reg[0].reg_wr.wr;
                last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
                break;
        case RDMA_RW_MULTI_WR:
                first_wr = &ctx->map.wrs[0].wr;
                last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
                break;
        case RDMA_RW_SINGLE_WR:
                first_wr = &ctx->single.wr.wr;
                last_wr = &ctx->single.wr.wr;
                break;
        default:
                BUG();
        }

        if (chain_wr) {
                last_wr->next = chain_wr;
        } else {
                last_wr->wr_cqe = cqe;
                last_wr->send_flags |= IB_SEND_SIGNALED;
        }

        return first_wr;
}
EXPORT_SYMBOL(rdma_rw_ctx_wrs);
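
/*
 * Illustrative sketch, not part of this file: using @chain_wr to piggy-back
 * a response SEND on the same ib_post_send() call as the RDMA WRITE of the
 * data, a pattern commonly used by target-side drivers.  "send_wr" and the
 * surrounding request state are hypothetical and owned by the caller:
 *
 *	struct ib_send_wr *first_wr;
 *	int ret;
 *
 *	first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, NULL, &send_wr);
 *	ret = ib_post_send(qp, first_wr, NULL);
 *
 * Because @chain_wr is set, no completion is requested for the RDMA WRs
 * themselves.  The send queue executes WRs in order, so the remote peer will
 * not observe the SEND before the RDMA WRITE data has been placed, and the
 * completion of the chained SEND is enough for the caller to retire the
 * whole transfer.
 */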

/**
 * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
 * @ctx:        context to operate on
 * @qp:         queue pair to operate on
 * @port_num:   port num to which the connection is bound
 * @cqe:        completion queue entry for the last WR
 * @chain_wr:   WR to append to the posted chain
 *
 * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
 * any memory registration operations needed.  If @chain_wr is non-NULL the
 * WR it points to will be appended to the chain of WRs posted.  If @chain_wr
 * is not set @cqe must be set so that the caller gets a completion
 * notification.
 */
int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
                struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
{
        struct ib_send_wr *first_wr;

        first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
        return ib_post_send(qp, first_wr, NULL);
}
EXPORT_SYMBOL(rdma_rw_ctx_post);

/**
 * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
 * @ctx:        context to release
 * @qp:         queue pair to operate on
 * @port_num:   port num to which the connection is bound
 * @sg:         scatterlist that was used for the READ/WRITE
 * @sg_cnt:     number of entries in @sg
 * @dir:        %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
 */
void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
                struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
{
        int i;

        switch (ctx->type) {
        case RDMA_RW_MR:
                for (i = 0; i < ctx->nr_ops; i++)
                        ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
                kfree(ctx->reg);
                break;
        case RDMA_RW_MULTI_WR:
                kfree(ctx->map.wrs);
                kfree(ctx->map.sges);
                break;
        case RDMA_RW_SINGLE_WR:
                break;
        default:
                BUG();
                break;
        }

        /* P2PDMA contexts do not need to be unmapped */
        if (!is_pci_p2pdma_page(sg_page(sg)))
                ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy);

/**
 * rdma_rw_ctx_destroy_signature - release all resources allocated by
 *      rdma_rw_ctx_signature_init
 * @ctx:        context to release
 * @qp:         queue pair to operate on
 * @port_num:   port num to which the connection is bound
 * @sg:         scatterlist that was used for the READ/WRITE
 * @sg_cnt:     number of entries in @sg
 * @prot_sg:    scatterlist that was used for the READ/WRITE of the PI
 * @prot_sg_cnt: number of entries in @prot_sg
 * @dir:        %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
 */
void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u8 port_num, struct scatterlist *sg, u32 sg_cnt,
                struct scatterlist *prot_sg, u32 prot_sg_cnt,
                enum dma_data_direction dir)
{
        if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
                return;

        ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
        ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);

        if (ctx->sig->prot.mr) {
                ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
                ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
        }

        ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
        kfree(ctx->sig);
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);

/**
 * rdma_rw_mr_factor - return number of MRs required for a payload
 * @device:     device handling the connection
 * @port_num:   port num to which the connection is bound
 * @maxpages:   maximum payload pages per rdma_rw_ctx
 *
 * Returns the number of MRs the device requires to move @maxpages pages of
 * payload. The returned value is used during transport creation to
 * compute max_rdma_ctxs and the size of the transport's Send and
 * Send Completion Queues.
 */
unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num,
                               unsigned int maxpages)
{
        unsigned int mr_pages;

        if (rdma_rw_can_use_mr(device, port_num))
                mr_pages = rdma_rw_fr_page_list_len(device);
        else
                mr_pages = device->attrs.max_sge_rd;
        return DIV_ROUND_UP(maxpages, mr_pages);
}
EXPORT_SYMBOL(rdma_rw_mr_factor);
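
/*
 * Illustrative sketch, not part of this file: a transport that wants to move
 * up to 1 MiB of payload per I/O and support 128 concurrent I/Os might size
 * its QP roughly like this (the numbers and the qp_attr variable are
 * hypothetical):
 *
 *	unsigned int maxpages = SZ_1M >> PAGE_SHIFT;
 *	unsigned int factor = rdma_rw_mr_factor(device, port_num, maxpages);
 *
 *	qp_attr.cap.max_rdma_ctxs = 128 * factor;
 *
 * rdma_rw_init_qp() below then converts max_rdma_ctxs into additional send
 * queue depth, so the transport only needs to account for its own SENDs in
 * max_send_wr.
 */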

void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
{
        u32 factor;

        WARN_ON_ONCE(attr->port_num == 0);

        /*
         * Each context needs at least one RDMA READ or WRITE WR.
         *
         * For some hardware we might need more, eventually we should ask the
         * HCA driver for a multiplier here.
         */
        factor = 1;

        /*
         * If the device needs MRs to perform RDMA READ or WRITE operations,
         * we'll need two additional WRs per context for the registration and
         * the invalidation.
         */
        if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
                factor += 6;    /* (inv + reg) * (data + prot + sig) */
        else if (rdma_rw_can_use_mr(dev, attr->port_num))
                factor += 2;    /* inv + reg */

        attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;

        /*
         * But maybe we were just too high in the sky and the device doesn't
         * even support all we need, and we'll have to live with what we get..
         */
        attr->cap.max_send_wr =
                min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
}
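
/*
 * Worked example with assumed numbers: on an iWarp device, where
 * rdma_rw_can_use_mr() is true, factor ends up as 1 + 2 = 3 (READ/WRITE plus
 * REG_MR plus LOCAL_INV per context).  With max_rdma_ctxs = 64 that adds
 * 3 * 64 = 192 entries to max_send_wr, subject to the max_qp_wr clamp above.
 */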

int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
{
        struct ib_device *dev = qp->pd->device;
        u32 nr_mrs = 0, nr_sig_mrs = 0;
        int ret = 0;

        if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
                nr_sig_mrs = attr->cap.max_rdma_ctxs;
                nr_mrs = attr->cap.max_rdma_ctxs * 2;
        } else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
                nr_mrs = attr->cap.max_rdma_ctxs;
        }

        if (nr_mrs) {
                ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
                                IB_MR_TYPE_MEM_REG,
                                rdma_rw_fr_page_list_len(dev));
                if (ret) {
                        pr_err("%s: failed to allocate %d MRs\n",
                                __func__, nr_mrs);
                        return ret;
                }
        }

        if (nr_sig_mrs) {
                ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
                                IB_MR_TYPE_SIGNATURE, 2);
                if (ret) {
                        pr_err("%s: failed to allocate %d SIG MRs\n",
                                __func__, nr_sig_mrs);
                        goto out_free_rdma_mrs;
                }
        }

        return 0;

out_free_rdma_mrs:
        ib_mr_pool_destroy(qp, &qp->rdma_mrs);
        return ret;
}

void rdma_rw_cleanup_mrs(struct ib_qp *qp)
{
        ib_mr_pool_destroy(qp, &qp->sig_mrs);
        ib_mr_pool_destroy(qp, &qp->rdma_mrs);
}