linux/drivers/infiniband/sw/siw/siw_verbs.c
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2
   3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
   4/* Copyright (c) 2008-2019, IBM Corporation */
   5
   6#include <linux/errno.h>
   7#include <linux/types.h>
   8#include <linux/uaccess.h>
   9#include <linux/vmalloc.h>
  10#include <linux/xarray.h>
  11
  12#include <rdma/iw_cm.h>
  13#include <rdma/ib_verbs.h>
  14#include <rdma/ib_user_verbs.h>
  15#include <rdma/uverbs_ioctl.h>
  16
  17#include "siw.h"
  18#include "siw_verbs.h"
  19#include "siw_mem.h"
  20
  21static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
  22        [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
  23        [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
  24        [IB_QPS_RTR] = SIW_QP_STATE_RTR,
  25        [IB_QPS_RTS] = SIW_QP_STATE_RTS,
  26        [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
  27        [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
  28        [IB_QPS_ERR] = SIW_QP_STATE_ERROR
  29};
  30
  31static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
  32        [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
  33        [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
  34        [IB_QPS_ERR] = "ERR"
  35};
  36
  37void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
  38{
  39        struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry);
  40
  41        kfree(entry);
  42}
  43
  44int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
  45{
  46        struct siw_ucontext *uctx = to_siw_ctx(ctx);
  47        size_t size = vma->vm_end - vma->vm_start;
  48        struct rdma_user_mmap_entry *rdma_entry;
  49        struct siw_user_mmap_entry *entry;
  50        int rv = -EINVAL;
  51
  52        /*
  53         * Must be page aligned
  54         */
  55        if (vma->vm_start & (PAGE_SIZE - 1)) {
  56                pr_warn("siw: mmap not page aligned\n");
  57                return -EINVAL;
  58        }
  59        rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
  60        if (!rdma_entry) {
  61                siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
  62                        vma->vm_pgoff, size);
  63                return -EINVAL;
  64        }
  65        entry = to_siw_mmap_entry(rdma_entry);
  66
  67        rv = remap_vmalloc_range(vma, entry->address, 0);
  68        if (rv) {
  69                pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
  70                        size);
  71                goto out;
  72        }
  73out:
  74        rdma_user_mmap_entry_put(rdma_entry);
  75
  76        return rv;
  77}
  78
  79int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
  80{
  81        struct siw_device *sdev = to_siw_dev(base_ctx->device);
  82        struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
  83        struct siw_uresp_alloc_ctx uresp = {};
  84        int rv;
  85
  86        if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
  87                rv = -ENOMEM;
  88                goto err_out;
  89        }
  90        ctx->sdev = sdev;
  91
  92        uresp.dev_id = sdev->vendor_part_id;
  93
  94        if (udata->outlen < sizeof(uresp)) {
  95                rv = -EINVAL;
  96                goto err_out;
  97        }
  98        rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
  99        if (rv)
 100                goto err_out;
 101
 102        siw_dbg(base_ctx->device, "success. now %d context(s)\n",
 103                atomic_read(&sdev->num_ctx));
 104
 105        return 0;
 106
 107err_out:
 108        atomic_dec(&sdev->num_ctx);
 109        siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
 110                atomic_read(&sdev->num_ctx));
 111
 112        return rv;
 113}
 114
 115void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
 116{
 117        struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
 118
 119        atomic_dec(&uctx->sdev->num_ctx);
 120}
 121
 122int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
 123                     struct ib_udata *udata)
 124{
 125        struct siw_device *sdev = to_siw_dev(base_dev);
 126
 127        if (udata->inlen || udata->outlen)
 128                return -EINVAL;
 129
 130        memset(attr, 0, sizeof(*attr));
 131
 132        /* Revisit atomic caps if RFC 7306 gets supported */
 133        attr->atomic_cap = 0;
 134        attr->device_cap_flags =
 135                IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
 136        attr->max_cq = sdev->attrs.max_cq;
 137        attr->max_cqe = sdev->attrs.max_cqe;
 138        attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
 139        attr->max_mr = sdev->attrs.max_mr;
 140        attr->max_mw = sdev->attrs.max_mw;
 141        attr->max_mr_size = ~0ull;
 142        attr->max_pd = sdev->attrs.max_pd;
 143        attr->max_qp = sdev->attrs.max_qp;
 144        attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
 145        attr->max_qp_rd_atom = sdev->attrs.max_ord;
 146        attr->max_qp_wr = sdev->attrs.max_qp_wr;
 147        attr->max_recv_sge = sdev->attrs.max_sge;
 148        attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
 149        attr->max_send_sge = sdev->attrs.max_sge;
 150        attr->max_sge_rd = sdev->attrs.max_sge_rd;
 151        attr->max_srq = sdev->attrs.max_srq;
 152        attr->max_srq_sge = sdev->attrs.max_srq_sge;
 153        attr->max_srq_wr = sdev->attrs.max_srq_wr;
 154        attr->page_size_cap = PAGE_SIZE;
 155        attr->vendor_id = SIW_VENDOR_ID;
 156        attr->vendor_part_id = sdev->vendor_part_id;
 157
 158        memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
 159
 160        return 0;
 161}
 162
 163int siw_query_port(struct ib_device *base_dev, u32 port,
 164                   struct ib_port_attr *attr)
 165{
 166        struct siw_device *sdev = to_siw_dev(base_dev);
 167        int rv;
 168
 169        memset(attr, 0, sizeof(*attr));
 170
 171        rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
 172                         &attr->active_width);
 173        attr->gid_tbl_len = 1;
 174        attr->max_msg_sz = -1;
 175        attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 176        attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 177        attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
 178                IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
 179        attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
 180        attr->state = sdev->state;
 181        /*
 182         * All zero
 183         *
 184         * attr->lid = 0;
 185         * attr->bad_pkey_cntr = 0;
 186         * attr->qkey_viol_cntr = 0;
 187         * attr->sm_lid = 0;
 188         * attr->lmc = 0;
 189         * attr->max_vl_num = 0;
 190         * attr->sm_sl = 0;
 191         * attr->subnet_timeout = 0;
  192         * attr->init_type_reply = 0;
 193         */
 194        return rv;
 195}
 196
 197int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
 198                           struct ib_port_immutable *port_immutable)
 199{
 200        struct ib_port_attr attr;
 201        int rv = siw_query_port(base_dev, port, &attr);
 202
 203        if (rv)
 204                return rv;
 205
 206        port_immutable->gid_tbl_len = attr.gid_tbl_len;
 207        port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
 208
 209        return 0;
 210}
 211
 212int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
 213                  union ib_gid *gid)
 214{
 215        struct siw_device *sdev = to_siw_dev(base_dev);
 216
 217        /* subnet_prefix == interface_id == 0; */
 218        memset(gid, 0, sizeof(*gid));
 219        memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
 220
 221        return 0;
 222}
 223
 224int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 225{
 226        struct siw_device *sdev = to_siw_dev(pd->device);
 227
 228        if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
 229                atomic_dec(&sdev->num_pd);
 230                return -ENOMEM;
 231        }
  232        siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));
 233
 234        return 0;
 235}
 236
 237int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 238{
 239        struct siw_device *sdev = to_siw_dev(pd->device);
 240
 241        siw_dbg_pd(pd, "free PD\n");
 242        atomic_dec(&sdev->num_pd);
 243        return 0;
 244}
 245
 246void siw_qp_get_ref(struct ib_qp *base_qp)
 247{
 248        siw_qp_get(to_siw_qp(base_qp));
 249}
 250
 251void siw_qp_put_ref(struct ib_qp *base_qp)
 252{
 253        siw_qp_put(to_siw_qp(base_qp));
 254}
 255
 256static struct rdma_user_mmap_entry *
 257siw_mmap_entry_insert(struct siw_ucontext *uctx,
 258                      void *address, size_t length,
 259                      u64 *offset)
 260{
 261        struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 262        int rv;
 263
 264        *offset = SIW_INVAL_UOBJ_KEY;
 265        if (!entry)
 266                return NULL;
 267
 268        entry->address = address;
 269
 270        rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
 271                                         &entry->rdma_entry,
 272                                         length);
 273        if (rv) {
 274                kfree(entry);
 275                return NULL;
 276        }
 277
 278        *offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
 279
 280        return &entry->rdma_entry;
 281}
 282
 283/*
 284 * siw_create_qp()
 285 *
 286 * Create QP of requested size on given device.
 287 *
  288 * @qp:         Queue pair
 289 * @attrs:      Initial QP attributes.
 290 * @udata:      used to provide QP ID, SQ and RQ size back to user.
 291 */
 292
 293int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
 294                  struct ib_udata *udata)
 295{
 296        struct ib_pd *pd = ibqp->pd;
 297        struct siw_qp *qp = to_siw_qp(ibqp);
 298        struct ib_device *base_dev = pd->device;
 299        struct siw_device *sdev = to_siw_dev(base_dev);
 300        struct siw_ucontext *uctx =
 301                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 302                                          base_ucontext);
 303        unsigned long flags;
 304        int num_sqe, num_rqe, rv = 0;
 305        size_t length;
 306
 307        siw_dbg(base_dev, "create new QP\n");
 308
 309        if (attrs->create_flags)
 310                return -EOPNOTSUPP;
 311
 312        if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
 313                siw_dbg(base_dev, "too many QP's\n");
 314                return -ENOMEM;
 315        }
 316        if (attrs->qp_type != IB_QPT_RC) {
 317                siw_dbg(base_dev, "only RC QP's supported\n");
 318                rv = -EOPNOTSUPP;
 319                goto err_atomic;
 320        }
 321        if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
 322            (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
 323            (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
 324            (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
 325                siw_dbg(base_dev, "QP size error\n");
 326                rv = -EINVAL;
 327                goto err_atomic;
 328        }
 329        if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
 330                siw_dbg(base_dev, "max inline send: %d > %d\n",
 331                        attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
 332                rv = -EINVAL;
 333                goto err_atomic;
 334        }
 335        /*
  336         * NOTE: we allow for zero-element SQ and RQ WQE SGLs,
 337         * but not for a QP unable to hold any WQE (SQ + RQ)
 338         */
 339        if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
 340                siw_dbg(base_dev, "QP must have send or receive queue\n");
 341                rv = -EINVAL;
 342                goto err_atomic;
 343        }
 344
 345        if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) {
 346                siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
 347                rv = -EINVAL;
 348                goto err_atomic;
 349        }
 350
 351        init_rwsem(&qp->state_lock);
 352        spin_lock_init(&qp->sq_lock);
 353        spin_lock_init(&qp->rq_lock);
 354        spin_lock_init(&qp->orq_lock);
 355
 356        rv = siw_qp_add(sdev, qp);
 357        if (rv)
 358                goto err_atomic;
 359
 360        num_sqe = attrs->cap.max_send_wr;
 361        num_rqe = attrs->cap.max_recv_wr;
 362
 363        /* All queue indices are derived from modulo operations
 364         * on a free running 'get' (consumer) and 'put' (producer)
 365         * unsigned counter. Having queue sizes at power of two
 366         * avoids handling counter wrap around.
 367         */
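        /*
         * For illustration: with e.g. sq_size == 8, a free-running sq_put
         * of 9 selects index 9 % 8 == 1 (i.e. 9 & 7); the unsigned
         * counters may wrap without any special handling.
         */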
 368        if (num_sqe)
 369                num_sqe = roundup_pow_of_two(num_sqe);
 370        else {
 371                /* Zero sized SQ is not supported */
 372                rv = -EINVAL;
 373                goto err_out_xa;
 374        }
 375        if (num_rqe)
 376                num_rqe = roundup_pow_of_two(num_rqe);
 377
 378        if (udata)
 379                qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
 380        else
 381                qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
 382
 383        if (qp->sendq == NULL) {
 384                rv = -ENOMEM;
 385                goto err_out_xa;
 386        }
 387        if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
 388                if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
 389                        qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
 390                else {
 391                        rv = -EINVAL;
 392                        goto err_out_xa;
 393                }
 394        }
 395        qp->pd = pd;
 396        qp->scq = to_siw_cq(attrs->send_cq);
 397        qp->rcq = to_siw_cq(attrs->recv_cq);
 398
 399        if (attrs->srq) {
 400                /*
 401                 * SRQ support.
 402                 * Verbs 6.3.7: ignore RQ size, if SRQ present
 403                 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
 404                 */
 405                qp->srq = to_siw_srq(attrs->srq);
 406                qp->attrs.rq_size = 0;
 407                siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
 408                        qp->base_qp.qp_num);
 409        } else if (num_rqe) {
 410                if (udata)
 411                        qp->recvq =
 412                                vmalloc_user(num_rqe * sizeof(struct siw_rqe));
 413                else
 414                        qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
 415
 416                if (qp->recvq == NULL) {
 417                        rv = -ENOMEM;
 418                        goto err_out_xa;
 419                }
 420                qp->attrs.rq_size = num_rqe;
 421        }
 422        qp->attrs.sq_size = num_sqe;
 423        qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
 424        qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
 425
 426        /* Make those two tunables fixed for now. */
 427        qp->tx_ctx.gso_seg_limit = 1;
 428        qp->tx_ctx.zcopy_tx = zcopy_tx;
 429
 430        qp->attrs.state = SIW_QP_STATE_IDLE;
 431
 432        if (udata) {
 433                struct siw_uresp_create_qp uresp = {};
 434
 435                uresp.num_sqe = num_sqe;
 436                uresp.num_rqe = num_rqe;
 437                uresp.qp_id = qp_id(qp);
 438
 439                if (qp->sendq) {
 440                        length = num_sqe * sizeof(struct siw_sqe);
 441                        qp->sq_entry =
 442                                siw_mmap_entry_insert(uctx, qp->sendq,
 443                                                      length, &uresp.sq_key);
 444                        if (!qp->sq_entry) {
 445                                rv = -ENOMEM;
 446                                goto err_out_xa;
 447                        }
 448                }
 449
 450                if (qp->recvq) {
 451                        length = num_rqe * sizeof(struct siw_rqe);
 452                        qp->rq_entry =
 453                                siw_mmap_entry_insert(uctx, qp->recvq,
 454                                                      length, &uresp.rq_key);
 455                        if (!qp->rq_entry) {
 456                                uresp.sq_key = SIW_INVAL_UOBJ_KEY;
 457                                rv = -ENOMEM;
 458                                goto err_out_xa;
 459                        }
 460                }
 461
 462                if (udata->outlen < sizeof(uresp)) {
 463                        rv = -EINVAL;
 464                        goto err_out_xa;
 465                }
 466                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 467                if (rv)
 468                        goto err_out_xa;
 469        }
 470        qp->tx_cpu = siw_get_tx_cpu(sdev);
 471        if (qp->tx_cpu < 0) {
 472                rv = -EINVAL;
 473                goto err_out_xa;
 474        }
 475        INIT_LIST_HEAD(&qp->devq);
 476        spin_lock_irqsave(&sdev->lock, flags);
 477        list_add_tail(&qp->devq, &sdev->qp_list);
 478        spin_unlock_irqrestore(&sdev->lock, flags);
 479
 480        return 0;
 481
 482err_out_xa:
 483        xa_erase(&sdev->qp_xa, qp_id(qp));
 484        if (uctx) {
 485                rdma_user_mmap_entry_remove(qp->sq_entry);
 486                rdma_user_mmap_entry_remove(qp->rq_entry);
 487        }
 488        vfree(qp->sendq);
 489        vfree(qp->recvq);
 490
 491err_atomic:
 492        atomic_dec(&sdev->num_qp);
 493        return rv;
 494}
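
/*
 * Example (illustrative sketch, not part of the driver): a kernel client
 * reaches siw_create_qp() through the core ib_create_qp() verb. The PD and
 * CQ below are assumed to have been created by the caller.
 */
static struct ib_qp *siw_example_create_rc_qp(struct ib_pd *pd,
                                              struct ib_cq *cq)
{
        struct ib_qp_init_attr init_attr = {
                .send_cq = cq,
                .recv_cq = cq,
                .cap = {
                        .max_send_wr = 32,
                        .max_recv_wr = 32,
                        .max_send_sge = 2,
                        .max_recv_sge = 2,
                        .max_inline_data = 64,
                },
                .sq_sig_type = IB_SIGNAL_REQ_WR,
                .qp_type = IB_QPT_RC,
        };

        /* For a siw device, this dispatches to siw_create_qp() above */
        return ib_create_qp(pd, &init_attr);
}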
 495
 496/*
 497 * Minimum siw_query_qp() verb interface.
 498 *
 499 * @qp_attr_mask is not used but all available information is provided
 500 */
 501int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
 502                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
 503{
 504        struct siw_qp *qp;
 505        struct siw_device *sdev;
 506
 507        if (base_qp && qp_attr && qp_init_attr) {
 508                qp = to_siw_qp(base_qp);
 509                sdev = to_siw_dev(base_qp->device);
 510        } else {
 511                return -EINVAL;
 512        }
 513        qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
 514        qp_attr->cap.max_send_wr = qp->attrs.sq_size;
 515        qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
 516        qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
 517        qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
 518        qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 519        qp_attr->max_rd_atomic = qp->attrs.irq_size;
 520        qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
 521
 522        qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
 523                                   IB_ACCESS_REMOTE_WRITE |
 524                                   IB_ACCESS_REMOTE_READ;
 525
 526        qp_init_attr->qp_type = base_qp->qp_type;
 527        qp_init_attr->send_cq = base_qp->send_cq;
 528        qp_init_attr->recv_cq = base_qp->recv_cq;
 529        qp_init_attr->srq = base_qp->srq;
 530
 531        qp_init_attr->cap = qp_attr->cap;
 532
 533        return 0;
 534}
 535
 536int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
 537                        int attr_mask, struct ib_udata *udata)
 538{
 539        struct siw_qp_attrs new_attrs;
 540        enum siw_qp_attr_mask siw_attr_mask = 0;
 541        struct siw_qp *qp = to_siw_qp(base_qp);
 542        int rv = 0;
 543
 544        if (!attr_mask)
 545                return 0;
 546
 547        if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
 548                return -EOPNOTSUPP;
 549
 550        memset(&new_attrs, 0, sizeof(new_attrs));
 551
 552        if (attr_mask & IB_QP_ACCESS_FLAGS) {
 553                siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
 554
 555                if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
 556                        new_attrs.flags |= SIW_RDMA_READ_ENABLED;
 557                if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
 558                        new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
 559                if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
 560                        new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
 561        }
 562        if (attr_mask & IB_QP_STATE) {
 563                siw_dbg_qp(qp, "desired IB QP state: %s\n",
 564                           ib_qp_state_to_string[attr->qp_state]);
 565
 566                new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
 567
 568                if (new_attrs.state > SIW_QP_STATE_RTS)
 569                        qp->tx_ctx.tx_suspend = 1;
 570
 571                siw_attr_mask |= SIW_QP_ATTR_STATE;
 572        }
 573        if (!siw_attr_mask)
 574                goto out;
 575
 576        down_write(&qp->state_lock);
 577
 578        rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
 579
 580        up_write(&qp->state_lock);
 581out:
 582        return rv;
 583}
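
/*
 * Example (illustrative sketch, not part of the driver): a kernel client
 * drives the state transitions above via ib_modify_qp(), e.g. forcing a QP
 * into ERROR before draining and destroying it.
 */
static int siw_example_move_qp_to_error(struct ib_qp *qp)
{
        struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };

        /* IB_QPS_ERR maps to SIW_QP_STATE_ERROR in siw_verbs_modify_qp() */
        return ib_modify_qp(qp, &attr, IB_QP_STATE);
}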
 584
 585int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
 586{
 587        struct siw_qp *qp = to_siw_qp(base_qp);
 588        struct siw_ucontext *uctx =
 589                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 590                                          base_ucontext);
 591        struct siw_qp_attrs qp_attrs;
 592
 593        siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
 594
 595        /*
  596         * Mark QP as in process of destruction to prevent
 597         * any async callbacks to RDMA core
 598         */
 599        qp->attrs.flags |= SIW_QP_IN_DESTROY;
 600        qp->rx_stream.rx_suspend = 1;
 601
 602        if (uctx) {
 603                rdma_user_mmap_entry_remove(qp->sq_entry);
 604                rdma_user_mmap_entry_remove(qp->rq_entry);
 605        }
 606
 607        down_write(&qp->state_lock);
 608
 609        qp_attrs.state = SIW_QP_STATE_ERROR;
 610        siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
 611
 612        if (qp->cep) {
 613                siw_cep_put(qp->cep);
 614                qp->cep = NULL;
 615        }
 616        up_write(&qp->state_lock);
 617
 618        kfree(qp->tx_ctx.mpa_crc_hd);
 619        kfree(qp->rx_stream.mpa_crc_hd);
 620
 621        qp->scq = qp->rcq = NULL;
 622
 623        siw_qp_put(qp);
 624
 625        return 0;
 626}
 627
 628/*
 629 * siw_copy_inline_sgl()
 630 *
 631 * Prepare sgl of inlined data for sending. For userland callers
  632 * the function checks if the given buffer addresses and lengths are within
 633 * process context bounds.
 634 * Data from all provided sge's are copied together into the wqe,
 635 * referenced by a single sge.
 636 */
 637static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
 638                               struct siw_sqe *sqe)
 639{
 640        struct ib_sge *core_sge = core_wr->sg_list;
 641        void *kbuf = &sqe->sge[1];
 642        int num_sge = core_wr->num_sge, bytes = 0;
 643
 644        sqe->sge[0].laddr = (uintptr_t)kbuf;
 645        sqe->sge[0].lkey = 0;
 646
 647        while (num_sge--) {
 648                if (!core_sge->length) {
 649                        core_sge++;
 650                        continue;
 651                }
 652                bytes += core_sge->length;
 653                if (bytes > SIW_MAX_INLINE) {
 654                        bytes = -EINVAL;
 655                        break;
 656                }
 657                memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
 658                       core_sge->length);
 659
 660                kbuf += core_sge->length;
 661                core_sge++;
 662        }
 663        sqe->sge[0].length = bytes > 0 ? bytes : 0;
 664        sqe->num_sge = bytes > 0 ? 1 : 0;
 665
 666        return bytes;
 667}
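
/*
 * Example (illustrative sketch, not part of the driver): an inline send from
 * a kernel client. The buffer content is copied into the SQE by
 * siw_copy_inline_sgl(), so sge.addr is a plain kernel virtual address and
 * the total length must not exceed SIW_MAX_INLINE.
 */
static int siw_example_post_inline_send(struct ib_qp *qp, void *msg, u32 len)
{
        struct ib_sge sge = {
                .addr = (uintptr_t)msg, /* copied, not DMA mapped */
                .length = len,
                .lkey = 0,              /* not checked for inline data */
        };
        struct ib_send_wr wr = {
                .wr_id = 1,
                .sg_list = &sge,
                .num_sge = 1,
                .opcode = IB_WR_SEND,
                .send_flags = IB_SEND_INLINE | IB_SEND_SIGNALED,
        };
        const struct ib_send_wr *bad_wr;

        return ib_post_send(qp, &wr, &bad_wr);
}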
 668
 669/* Complete SQ WR's without processing */
 670static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
 671                           const struct ib_send_wr **bad_wr)
 672{
 673        struct siw_sqe sqe = {};
 674        int rv = 0;
 675
 676        while (wr) {
 677                sqe.id = wr->wr_id;
 678                sqe.opcode = wr->opcode;
 679                rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
 680                if (rv) {
 681                        if (bad_wr)
 682                                *bad_wr = wr;
 683                        break;
 684                }
 685                wr = wr->next;
 686        }
 687        return rv;
 688}
 689
 690/* Complete RQ WR's without processing */
 691static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
 692                           const struct ib_recv_wr **bad_wr)
 693{
 694        struct siw_rqe rqe = {};
 695        int rv = 0;
 696
 697        while (wr) {
 698                rqe.id = wr->wr_id;
 699                rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
 700                if (rv) {
 701                        if (bad_wr)
 702                                *bad_wr = wr;
 703                        break;
 704                }
 705                wr = wr->next;
 706        }
 707        return rv;
 708}
 709
 710/*
 711 * siw_post_send()
 712 *
 713 * Post a list of S-WR's to a SQ.
 714 *
 715 * @base_qp:    Base QP contained in siw QP
 716 * @wr:         Null terminated list of user WR's
 717 * @bad_wr:     Points to failing WR in case of synchronous failure.
 718 */
 719int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
 720                  const struct ib_send_wr **bad_wr)
 721{
 722        struct siw_qp *qp = to_siw_qp(base_qp);
 723        struct siw_wqe *wqe = tx_wqe(qp);
 724
 725        unsigned long flags;
 726        int rv = 0;
 727
 728        if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) {
 729                siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
 730                *bad_wr = wr;
 731                return -EINVAL;
 732        }
 733
 734        /*
 735         * Try to acquire QP state lock. Must be non-blocking
  736         * to accommodate kernel clients' needs.
 737         */
 738        if (!down_read_trylock(&qp->state_lock)) {
 739                if (qp->attrs.state == SIW_QP_STATE_ERROR) {
 740                        /*
 741                         * ERROR state is final, so we can be sure
 742                         * this state will not change as long as the QP
 743                         * exists.
 744                         *
 745                         * This handles an ib_drain_sq() call with
 746                         * a concurrent request to set the QP state
 747                         * to ERROR.
 748                         */
 749                        rv = siw_sq_flush_wr(qp, wr, bad_wr);
 750                } else {
 751                        siw_dbg_qp(qp, "QP locked, state %d\n",
 752                                   qp->attrs.state);
 753                        *bad_wr = wr;
 754                        rv = -ENOTCONN;
 755                }
 756                return rv;
 757        }
 758        if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
 759                if (qp->attrs.state == SIW_QP_STATE_ERROR) {
 760                        /*
 761                         * Immediately flush this WR to CQ, if QP
 762                         * is in ERROR state. SQ is guaranteed to
  763                         * be empty, so WR completes in-order.
 764                         *
 765                         * Typically triggered by ib_drain_sq().
 766                         */
 767                        rv = siw_sq_flush_wr(qp, wr, bad_wr);
 768                } else {
 769                        siw_dbg_qp(qp, "QP out of state %d\n",
 770                                   qp->attrs.state);
 771                        *bad_wr = wr;
 772                        rv = -ENOTCONN;
 773                }
 774                up_read(&qp->state_lock);
 775                return rv;
 776        }
 777        spin_lock_irqsave(&qp->sq_lock, flags);
 778
 779        while (wr) {
 780                u32 idx = qp->sq_put % qp->attrs.sq_size;
 781                struct siw_sqe *sqe = &qp->sendq[idx];
 782
 783                if (sqe->flags) {
 784                        siw_dbg_qp(qp, "sq full\n");
 785                        rv = -ENOMEM;
 786                        break;
 787                }
 788                if (wr->num_sge > qp->attrs.sq_max_sges) {
 789                        siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
 790                        rv = -EINVAL;
 791                        break;
 792                }
 793                sqe->id = wr->wr_id;
 794
 795                if ((wr->send_flags & IB_SEND_SIGNALED) ||
 796                    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
 797                        sqe->flags |= SIW_WQE_SIGNALLED;
 798
 799                if (wr->send_flags & IB_SEND_FENCE)
 800                        sqe->flags |= SIW_WQE_READ_FENCE;
 801
 802                switch (wr->opcode) {
 803                case IB_WR_SEND:
 804                case IB_WR_SEND_WITH_INV:
 805                        if (wr->send_flags & IB_SEND_SOLICITED)
 806                                sqe->flags |= SIW_WQE_SOLICITED;
 807
 808                        if (!(wr->send_flags & IB_SEND_INLINE)) {
 809                                siw_copy_sgl(wr->sg_list, sqe->sge,
 810                                             wr->num_sge);
 811                                sqe->num_sge = wr->num_sge;
 812                        } else {
 813                                rv = siw_copy_inline_sgl(wr, sqe);
 814                                if (rv <= 0) {
 815                                        rv = -EINVAL;
 816                                        break;
 817                                }
 818                                sqe->flags |= SIW_WQE_INLINE;
 819                                sqe->num_sge = 1;
 820                        }
 821                        if (wr->opcode == IB_WR_SEND)
 822                                sqe->opcode = SIW_OP_SEND;
 823                        else {
 824                                sqe->opcode = SIW_OP_SEND_REMOTE_INV;
 825                                sqe->rkey = wr->ex.invalidate_rkey;
 826                        }
 827                        break;
 828
 829                case IB_WR_RDMA_READ_WITH_INV:
 830                case IB_WR_RDMA_READ:
 831                        /*
 832                         * iWarp restricts RREAD sink to SGL containing
  833                         * 1 SGE only. We could relax to an SGL with multiple
  834                         * elements referring to the SAME ltag, or even send
  835                         * a private per-rreq tag referring to a checked
  836                         * local sgl with MULTIPLE ltags.
 837                         */
 838                        if (unlikely(wr->num_sge != 1)) {
 839                                rv = -EINVAL;
 840                                break;
 841                        }
 842                        siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
 843                        /*
 844                         * NOTE: zero length RREAD is allowed!
 845                         */
 846                        sqe->raddr = rdma_wr(wr)->remote_addr;
 847                        sqe->rkey = rdma_wr(wr)->rkey;
 848                        sqe->num_sge = 1;
 849
 850                        if (wr->opcode == IB_WR_RDMA_READ)
 851                                sqe->opcode = SIW_OP_READ;
 852                        else
 853                                sqe->opcode = SIW_OP_READ_LOCAL_INV;
 854                        break;
 855
 856                case IB_WR_RDMA_WRITE:
 857                        if (!(wr->send_flags & IB_SEND_INLINE)) {
 858                                siw_copy_sgl(wr->sg_list, &sqe->sge[0],
 859                                             wr->num_sge);
 860                                sqe->num_sge = wr->num_sge;
 861                        } else {
 862                                rv = siw_copy_inline_sgl(wr, sqe);
 863                                if (unlikely(rv < 0)) {
 864                                        rv = -EINVAL;
 865                                        break;
 866                                }
 867                                sqe->flags |= SIW_WQE_INLINE;
 868                                sqe->num_sge = 1;
 869                        }
 870                        sqe->raddr = rdma_wr(wr)->remote_addr;
 871                        sqe->rkey = rdma_wr(wr)->rkey;
 872                        sqe->opcode = SIW_OP_WRITE;
 873                        break;
 874
 875                case IB_WR_REG_MR:
 876                        sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
 877                        sqe->rkey = reg_wr(wr)->key;
 878                        sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
 879                        sqe->opcode = SIW_OP_REG_MR;
 880                        break;
 881
 882                case IB_WR_LOCAL_INV:
 883                        sqe->rkey = wr->ex.invalidate_rkey;
 884                        sqe->opcode = SIW_OP_INVAL_STAG;
 885                        break;
 886
 887                default:
 888                        siw_dbg_qp(qp, "ib wr type %d unsupported\n",
 889                                   wr->opcode);
 890                        rv = -EINVAL;
 891                        break;
 892                }
 893                siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
 894                           sqe->opcode, sqe->flags,
 895                           (void *)(uintptr_t)sqe->id);
 896
 897                if (unlikely(rv < 0))
 898                        break;
 899
 900                /* make SQE only valid after completely written */
 901                smp_wmb();
 902                sqe->flags |= SIW_WQE_VALID;
 903
 904                qp->sq_put++;
 905                wr = wr->next;
 906        }
 907
 908        /*
 909         * Send directly if SQ processing is not in progress.
 910         * Eventual immediate errors (rv < 0) do not affect the involved
  911         * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
 912         * processing, if new work is already pending. But rv must be passed
 913         * to caller.
 914         */
 915        if (wqe->wr_status != SIW_WR_IDLE) {
 916                spin_unlock_irqrestore(&qp->sq_lock, flags);
 917                goto skip_direct_sending;
 918        }
 919        rv = siw_activate_tx(qp);
 920        spin_unlock_irqrestore(&qp->sq_lock, flags);
 921
 922        if (rv <= 0)
 923                goto skip_direct_sending;
 924
 925        if (rdma_is_kernel_res(&qp->base_qp.res)) {
 926                rv = siw_sq_start(qp);
 927        } else {
 928                qp->tx_ctx.in_syscall = 1;
 929
 930                if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
 931                        siw_qp_cm_drop(qp, 0);
 932
 933                qp->tx_ctx.in_syscall = 0;
 934        }
 935skip_direct_sending:
 936
 937        up_read(&qp->state_lock);
 938
 939        if (rv >= 0)
 940                return 0;
 941        /*
 942         * Immediate error
 943         */
 944        siw_dbg_qp(qp, "error %d\n", rv);
 945
 946        *bad_wr = wr;
 947        return rv;
 948}
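
/*
 * Example (illustrative sketch, not part of the driver): posting a single
 * signaled RDMA WRITE from a kernel client. DMA address, length, lkey and
 * the remote address/rkey are assumed to be prepared by the caller.
 */
static int siw_example_post_rdma_write(struct ib_qp *qp, u64 dma_addr,
                                       u32 len, u32 lkey, u64 raddr, u32 rkey)
{
        struct ib_sge sge = { .addr = dma_addr, .length = len, .lkey = lkey };
        struct ib_rdma_wr wr = {
                .wr = {
                        .wr_id = 1,
                        .sg_list = &sge,
                        .num_sge = 1,
                        .opcode = IB_WR_RDMA_WRITE,
                        .send_flags = IB_SEND_SIGNALED,
                },
                .remote_addr = raddr,
                .rkey = rkey,
        };
        const struct ib_send_wr *bad_wr;

        /* For a siw QP this ends up in siw_post_send() above */
        return ib_post_send(qp, &wr.wr, &bad_wr);
}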
 949
 950/*
 951 * siw_post_receive()
 952 *
 953 * Post a list of R-WR's to a RQ.
 954 *
 955 * @base_qp:    Base QP contained in siw QP
 956 * @wr:         Null terminated list of user WR's
 957 * @bad_wr:     Points to failing WR in case of synchronous failure.
 958 */
 959int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
 960                     const struct ib_recv_wr **bad_wr)
 961{
 962        struct siw_qp *qp = to_siw_qp(base_qp);
 963        unsigned long flags;
 964        int rv = 0;
 965
 966        if (qp->srq || qp->attrs.rq_size == 0) {
 967                *bad_wr = wr;
 968                return -EINVAL;
 969        }
 970        if (!rdma_is_kernel_res(&qp->base_qp.res)) {
 971                siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
 972                *bad_wr = wr;
 973                return -EINVAL;
 974        }
 975
 976        /*
 977         * Try to acquire QP state lock. Must be non-blocking
  978         * to accommodate kernel clients' needs.
 979         */
 980        if (!down_read_trylock(&qp->state_lock)) {
 981                if (qp->attrs.state == SIW_QP_STATE_ERROR) {
 982                        /*
 983                         * ERROR state is final, so we can be sure
 984                         * this state will not change as long as the QP
 985                         * exists.
 986                         *
 987                         * This handles an ib_drain_rq() call with
 988                         * a concurrent request to set the QP state
 989                         * to ERROR.
 990                         */
 991                        rv = siw_rq_flush_wr(qp, wr, bad_wr);
 992                } else {
 993                        siw_dbg_qp(qp, "QP locked, state %d\n",
 994                                   qp->attrs.state);
 995                        *bad_wr = wr;
 996                        rv = -ENOTCONN;
 997                }
 998                return rv;
 999        }
1000        if (qp->attrs.state > SIW_QP_STATE_RTS) {
1001                if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1002                        /*
1003                         * Immediately flush this WR to CQ, if QP
1004                         * is in ERROR state. RQ is guaranteed to
 1005                         * be empty, so WR completes in-order.
1006                         *
1007                         * Typically triggered by ib_drain_rq().
1008                         */
1009                        rv = siw_rq_flush_wr(qp, wr, bad_wr);
1010                } else {
1011                        siw_dbg_qp(qp, "QP out of state %d\n",
1012                                   qp->attrs.state);
1013                        *bad_wr = wr;
1014                        rv = -ENOTCONN;
1015                }
1016                up_read(&qp->state_lock);
1017                return rv;
1018        }
1019        /*
1020         * Serialize potentially multiple producers.
1021         * Not needed for single threaded consumer side.
1022         */
1023        spin_lock_irqsave(&qp->rq_lock, flags);
1024
1025        while (wr) {
1026                u32 idx = qp->rq_put % qp->attrs.rq_size;
1027                struct siw_rqe *rqe = &qp->recvq[idx];
1028
1029                if (rqe->flags) {
1030                        siw_dbg_qp(qp, "RQ full\n");
1031                        rv = -ENOMEM;
1032                        break;
1033                }
1034                if (wr->num_sge > qp->attrs.rq_max_sges) {
1035                        siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
1036                        rv = -EINVAL;
1037                        break;
1038                }
1039                rqe->id = wr->wr_id;
1040                rqe->num_sge = wr->num_sge;
1041                siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1042
1043                /* make sure RQE is completely written before valid */
1044                smp_wmb();
1045
1046                rqe->flags = SIW_WQE_VALID;
1047
1048                qp->rq_put++;
1049                wr = wr->next;
1050        }
1051        spin_unlock_irqrestore(&qp->rq_lock, flags);
1052
1053        up_read(&qp->state_lock);
1054
1055        if (rv < 0) {
1056                siw_dbg_qp(qp, "error %d\n", rv);
1057                *bad_wr = wr;
1058        }
1059        return rv > 0 ? 0 : rv;
1060}
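
/*
 * Example (illustrative sketch, not part of the driver): posting one receive
 * buffer from a kernel client; dma_addr/len/lkey are assumed to be prepared
 * by the caller.
 */
static int siw_example_post_recv(struct ib_qp *qp, u64 dma_addr, u32 len,
                                 u32 lkey)
{
        struct ib_sge sge = { .addr = dma_addr, .length = len, .lkey = lkey };
        struct ib_recv_wr wr = {
                .wr_id = 2,
                .sg_list = &sge,
                .num_sge = 1,
        };
        const struct ib_recv_wr *bad_wr;

        /* For a siw QP this ends up in siw_post_receive() above */
        return ib_post_recv(qp, &wr, &bad_wr);
}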
1061
1062int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
1063{
1064        struct siw_cq *cq = to_siw_cq(base_cq);
1065        struct siw_device *sdev = to_siw_dev(base_cq->device);
1066        struct siw_ucontext *ctx =
1067                rdma_udata_to_drv_context(udata, struct siw_ucontext,
1068                                          base_ucontext);
1069
1070        siw_dbg_cq(cq, "free CQ resources\n");
1071
1072        siw_cq_flush(cq);
1073
1074        if (ctx)
1075                rdma_user_mmap_entry_remove(cq->cq_entry);
1076
1077        atomic_dec(&sdev->num_cq);
1078
1079        vfree(cq->queue);
1080        return 0;
1081}
1082
1083/*
1084 * siw_create_cq()
1085 *
1086 * Populate CQ of requested size
1087 *
1088 * @base_cq: CQ as allocated by RDMA midlayer
1089 * @attr: Initial CQ attributes
1090 * @udata: relates to user context
1091 */
1092
1093int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1094                  struct ib_udata *udata)
1095{
1096        struct siw_device *sdev = to_siw_dev(base_cq->device);
1097        struct siw_cq *cq = to_siw_cq(base_cq);
1098        int rv, size = attr->cqe;
1099
1100        if (attr->flags)
1101                return -EOPNOTSUPP;
1102
1103        if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1104                siw_dbg(base_cq->device, "too many CQ's\n");
1105                rv = -ENOMEM;
1106                goto err_out;
1107        }
1108        if (size < 1 || size > sdev->attrs.max_cqe) {
1109                siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1110                rv = -EINVAL;
1111                goto err_out;
1112        }
1113        size = roundup_pow_of_two(size);
1114        cq->base_cq.cqe = size;
1115        cq->num_cqe = size;
1116
1117        if (udata)
1118                cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1119                                         sizeof(struct siw_cq_ctrl));
1120        else
1121                cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1122                                    sizeof(struct siw_cq_ctrl));
1123
1124        if (cq->queue == NULL) {
1125                rv = -ENOMEM;
1126                goto err_out;
1127        }
1128        get_random_bytes(&cq->id, 4);
1129        siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1130
1131        spin_lock_init(&cq->lock);
1132
1133        cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1134
1135        if (udata) {
1136                struct siw_uresp_create_cq uresp = {};
1137                struct siw_ucontext *ctx =
1138                        rdma_udata_to_drv_context(udata, struct siw_ucontext,
1139                                                  base_ucontext);
1140                size_t length = size * sizeof(struct siw_cqe) +
1141                        sizeof(struct siw_cq_ctrl);
1142
1143                cq->cq_entry =
1144                        siw_mmap_entry_insert(ctx, cq->queue,
1145                                              length, &uresp.cq_key);
1146                if (!cq->cq_entry) {
1147                        rv = -ENOMEM;
1148                        goto err_out;
1149                }
1150
1151                uresp.cq_id = cq->id;
1152                uresp.num_cqe = size;
1153
1154                if (udata->outlen < sizeof(uresp)) {
1155                        rv = -EINVAL;
1156                        goto err_out;
1157                }
1158                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1159                if (rv)
1160                        goto err_out;
1161        }
1162        return 0;
1163
1164err_out:
 1165        siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
1166
1167        if (cq && cq->queue) {
1168                struct siw_ucontext *ctx =
1169                        rdma_udata_to_drv_context(udata, struct siw_ucontext,
1170                                                  base_ucontext);
1171                if (ctx)
1172                        rdma_user_mmap_entry_remove(cq->cq_entry);
1173                vfree(cq->queue);
1174        }
1175        atomic_dec(&sdev->num_cq);
1176
1177        return rv;
1178}
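
/*
 * Example (illustrative sketch, not part of the driver): a kernel client
 * creating a CQ it polls itself, via the core ib_create_cq() verb.
 */
static struct ib_cq *siw_example_create_cq(struct ib_device *dev, void *ctx)
{
        struct ib_cq_init_attr cq_attr = { .cqe = 256, .comp_vector = 0 };

        /* No completion handler given: suitable for purely polled use */
        return ib_create_cq(dev, NULL, NULL, ctx, &cq_attr);
}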
1179
1180/*
1181 * siw_poll_cq()
1182 *
1183 * Reap CQ entries if available and copy work completion status into
1184 * array of WC's provided by caller. Returns number of reaped CQE's.
1185 *
1186 * @base_cq:    Base CQ contained in siw CQ.
1187 * @num_cqe:    Maximum number of CQE's to reap.
1188 * @wc:         Array of work completions to be filled by siw.
1189 */
1190int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1191{
1192        struct siw_cq *cq = to_siw_cq(base_cq);
1193        int i;
1194
1195        for (i = 0; i < num_cqe; i++) {
1196                if (!siw_reap_cqe(cq, wc))
1197                        break;
1198                wc++;
1199        }
1200        return i;
1201}
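
/*
 * Example (illustrative sketch, not part of the driver): reaping completions
 * through the core ib_poll_cq() wrapper, which calls siw_poll_cq() here.
 */
static void siw_example_drain_completions(struct ib_cq *cq)
{
        struct ib_wc wc[4];
        int n, i;

        while ((n = ib_poll_cq(cq, ARRAY_SIZE(wc), wc)) > 0) {
                for (i = 0; i < n; i++) {
                        if (wc[i].status != IB_WC_SUCCESS)
                                pr_debug("wr_id %llu failed with status %d\n",
                                         wc[i].wr_id, wc[i].status);
                }
        }
}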
1202
1203/*
1204 * siw_req_notify_cq()
1205 *
1206 * Request notification for new CQE's added to that CQ.
1207 * Defined flags:
1208 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1209 *   event if a WQE with notification flag set enters the CQ
1210 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1211 *   event if a WQE enters the CQ.
1212 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
 1213 *   number of not yet reaped CQE's regardless of their notification
1214 *   type and current or new CQ notification settings.
1215 *
1216 * @base_cq:    Base CQ contained in siw CQ.
1217 * @flags:      Requested notification flags.
1218 */
1219int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1220{
1221        struct siw_cq *cq = to_siw_cq(base_cq);
1222
1223        siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1224
1225        if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1226                /*
 1227                 * Enable CQ event for next solicited completion
1228                 * and make it visible to all associated producers.
1229                 */
1230                smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1231        else
1232                /*
 1233                 * Enable CQ event for any signalled completion
1234                 * and make it visible to all associated producers.
1235                 */
1236                smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1237
1238        if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1239                return cq->cq_put - cq->cq_get;
1240
1241        return 0;
1242}
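
/*
 * Example (illustrative sketch, not part of the driver): the usual arm-and-
 * re-poll pattern built on the notification semantics described above, using
 * the polling helper sketched after siw_poll_cq().
 */
static void siw_example_arm_cq(struct ib_cq *cq)
{
        /*
         * Re-arm for the next completion. If CQEs arrived before arming,
         * IB_CQ_REPORT_MISSED_EVENTS yields a positive return and the CQ
         * must be polled again to avoid missing them.
         */
        if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
                                 IB_CQ_REPORT_MISSED_EVENTS) > 0)
                siw_example_drain_completions(cq);
}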
1243
1244/*
1245 * siw_dereg_mr()
1246 *
1247 * Release Memory Region.
1248 *
1249 * @base_mr: Base MR contained in siw MR.
1250 * @udata: points to user context, unused.
1251 */
1252int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1253{
1254        struct siw_mr *mr = to_siw_mr(base_mr);
1255        struct siw_device *sdev = to_siw_dev(base_mr->device);
1256
1257        siw_dbg_mem(mr->mem, "deregister MR\n");
1258
1259        atomic_dec(&sdev->num_mr);
1260
1261        siw_mr_drop_mem(mr);
1262        kfree_rcu(mr, rcu);
1263
1264        return 0;
1265}
1266
1267/*
1268 * siw_reg_user_mr()
1269 *
1270 * Register Memory Region.
1271 *
1272 * @pd:         Protection Domain
1273 * @start:      starting address of MR (virtual address)
1274 * @len:        len of MR
1275 * @rnic_va:    not used by siw
1276 * @rights:     MR access rights
1277 * @udata:      user buffer to communicate STag and Key.
1278 */
1279struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1280                              u64 rnic_va, int rights, struct ib_udata *udata)
1281{
1282        struct siw_mr *mr = NULL;
1283        struct siw_umem *umem = NULL;
1284        struct siw_ureq_reg_mr ureq;
1285        struct siw_device *sdev = to_siw_dev(pd->device);
1286
1287        unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
1288        int rv;
1289
1290        siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1291                   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1292                   (unsigned long long)len);
1293
1294        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1295                siw_dbg_pd(pd, "too many mr's\n");
1296                rv = -ENOMEM;
1297                goto err_out;
1298        }
1299        if (!len) {
1300                rv = -EINVAL;
1301                goto err_out;
1302        }
1303        if (mem_limit != RLIM_INFINITY) {
1304                unsigned long num_pages =
1305                        (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
1306                mem_limit >>= PAGE_SHIFT;
1307
1308                if (num_pages > mem_limit - current->mm->locked_vm) {
1309                        siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
1310                                   num_pages, mem_limit,
1311                                   current->mm->locked_vm);
1312                        rv = -ENOMEM;
1313                        goto err_out;
1314                }
1315        }
1316        umem = siw_umem_get(start, len, ib_access_writable(rights));
1317        if (IS_ERR(umem)) {
1318                rv = PTR_ERR(umem);
1319                siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1320                umem = NULL;
1321                goto err_out;
1322        }
1323        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1324        if (!mr) {
1325                rv = -ENOMEM;
1326                goto err_out;
1327        }
1328        rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1329        if (rv)
1330                goto err_out;
1331
1332        if (udata) {
1333                struct siw_uresp_reg_mr uresp = {};
1334                struct siw_mem *mem = mr->mem;
1335
1336                if (udata->inlen < sizeof(ureq)) {
1337                        rv = -EINVAL;
1338                        goto err_out;
1339                }
1340                rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1341                if (rv)
1342                        goto err_out;
1343
1344                mr->base_mr.lkey |= ureq.stag_key;
1345                mr->base_mr.rkey |= ureq.stag_key;
1346                mem->stag |= ureq.stag_key;
1347                uresp.stag = mem->stag;
1348
1349                if (udata->outlen < sizeof(uresp)) {
1350                        rv = -EINVAL;
1351                        goto err_out;
1352                }
1353                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1354                if (rv)
1355                        goto err_out;
1356        }
1357        mr->mem->stag_valid = 1;
1358
1359        return &mr->base_mr;
1360
1361err_out:
1362        atomic_dec(&sdev->num_mr);
1363        if (mr) {
1364                if (mr->mem)
1365                        siw_mr_drop_mem(mr);
1366                kfree_rcu(mr, rcu);
1367        } else {
1368                if (umem)
1369                        siw_umem_release(umem, false);
1370        }
1371        return ERR_PTR(rv);
1372}
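
/*
 * For illustration (user-space side, not part of the driver):
 * siw_reg_user_mr() is reached when an application registers a buffer
 * through libibverbs, roughly
 *
 *      mr = ibv_reg_mr(pd, buf, len,
 *                      IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
 *
 * The returned lkey/rkey contain the stag_key supplied by the user library,
 * as mixed in above.
 */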
1373
1374struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1375                           u32 max_sge)
1376{
1377        struct siw_device *sdev = to_siw_dev(pd->device);
1378        struct siw_mr *mr = NULL;
1379        struct siw_pbl *pbl = NULL;
1380        int rv;
1381
1382        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1383                siw_dbg_pd(pd, "too many mr's\n");
1384                rv = -ENOMEM;
1385                goto err_out;
1386        }
1387        if (mr_type != IB_MR_TYPE_MEM_REG) {
1388                siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1389                rv = -EOPNOTSUPP;
1390                goto err_out;
1391        }
1392        if (max_sge > SIW_MAX_SGE_PBL) {
1393                siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1394                rv = -ENOMEM;
1395                goto err_out;
1396        }
1397        pbl = siw_pbl_alloc(max_sge);
1398        if (IS_ERR(pbl)) {
1399                rv = PTR_ERR(pbl);
1400                siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1401                pbl = NULL;
1402                goto err_out;
1403        }
1404        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1405        if (!mr) {
1406                rv = -ENOMEM;
1407                goto err_out;
1408        }
1409        rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1410        if (rv)
1411                goto err_out;
1412
1413        mr->mem->is_pbl = 1;
1414
1415        siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1416
1417        return &mr->base_mr;
1418
1419err_out:
1420        atomic_dec(&sdev->num_mr);
1421
1422        if (!mr) {
1423                kfree(pbl);
1424        } else {
1425                if (mr->mem)
1426                        siw_mr_drop_mem(mr);
1427                kfree_rcu(mr, rcu);
1428        }
1429        siw_dbg_pd(pd, "failed: %d\n", rv);
1430
1431        return ERR_PTR(rv);
1432}
1433
1434/* Just used to count number of pages being mapped */
1435static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1436{
1437        return 0;
1438}
1439
1440int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1441                  unsigned int *sg_off)
1442{
1443        struct scatterlist *slp;
1444        struct siw_mr *mr = to_siw_mr(base_mr);
1445        struct siw_mem *mem = mr->mem;
1446        struct siw_pbl *pbl = mem->pbl;
1447        struct siw_pble *pble;
1448        unsigned long pbl_size;
1449        int i, rv;
1450
1451        if (!pbl) {
1452                siw_dbg_mem(mem, "no PBL allocated\n");
1453                return -EINVAL;
1454        }
1455        pble = pbl->pbe;
1456
1457        if (pbl->max_buf < num_sle) {
1458                siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
 1459                            num_sle, mem->pbl->max_buf);
1460                return -ENOMEM;
1461        }
1462        for_each_sg(sl, slp, num_sle, i) {
1463                if (sg_dma_len(slp) == 0) {
1464                        siw_dbg_mem(mem, "empty SGE\n");
1465                        return -EINVAL;
1466                }
1467                if (i == 0) {
1468                        pble->addr = sg_dma_address(slp);
1469                        pble->size = sg_dma_len(slp);
1470                        pble->pbl_off = 0;
1471                        pbl_size = pble->size;
1472                        pbl->num_buf = 1;
1473                } else {
1474                        /* Merge PBL entries if adjacent */
1475                        if (pble->addr + pble->size == sg_dma_address(slp)) {
1476                                pble->size += sg_dma_len(slp);
1477                        } else {
1478                                pble++;
1479                                pbl->num_buf++;
1480                                pble->addr = sg_dma_address(slp);
1481                                pble->size = sg_dma_len(slp);
1482                                pble->pbl_off = pbl_size;
1483                        }
1484                        pbl_size += sg_dma_len(slp);
1485                }
1486                siw_dbg_mem(mem,
1487                        "sge[%d], size %u, addr 0x%p, total %lu\n",
1488                        i, pble->size, (void *)(uintptr_t)pble->addr,
1489                        pbl_size);
1490        }
1491        rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1492        if (rv > 0) {
1493                mem->len = base_mr->length;
1494                mem->va = base_mr->iova;
1495                siw_dbg_mem(mem,
1496                        "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1497                        mem->len, (void *)(uintptr_t)mem->va, num_sle,
1498                        pbl->num_buf);
1499        }
1500        return rv;
1501}
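
/*
 * Example (illustrative sketch, not part of the driver): the typical fast-
 * registration sequence a kernel client builds on the two functions above:
 * map a DMA-mapped scatterlist into an MR from ib_alloc_mr() and post an
 * IB_WR_REG_MR work request (handled in siw_post_send()).
 */
static int siw_example_fast_reg(struct ib_qp *qp, struct ib_mr *mr,
                                struct scatterlist *sgl, int sg_nents)
{
        struct ib_reg_wr wr;
        const struct ib_send_wr *bad_wr;
        int n;

        n = ib_map_mr_sg(mr, sgl, sg_nents, NULL, PAGE_SIZE);
        if (n <= 0)
                return n < 0 ? n : -EINVAL;

        memset(&wr, 0, sizeof(wr));
        wr.wr.opcode = IB_WR_REG_MR;
        wr.wr.send_flags = IB_SEND_SIGNALED;
        wr.mr = mr;
        wr.key = mr->rkey;
        wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;

        return ib_post_send(qp, &wr.wr, &bad_wr);
}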
1502
1503/*
1504 * siw_get_dma_mr()
1505 *
1506 * Create a (empty) DMA memory region, where no umem is attached.
1507 */
1508struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1509{
1510        struct siw_device *sdev = to_siw_dev(pd->device);
1511        struct siw_mr *mr = NULL;
1512        int rv;
1513
1514        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1515                siw_dbg_pd(pd, "too many mr's\n");
1516                rv = -ENOMEM;
1517                goto err_out;
1518        }
1519        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1520        if (!mr) {
1521                rv = -ENOMEM;
1522                goto err_out;
1523        }
1524        rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1525        if (rv)
1526                goto err_out;
1527
1528        mr->mem->stag_valid = 1;
1529
1530        siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1531
1532        return &mr->base_mr;
1533
1534err_out:
1535        if (rv)
1536                kfree(mr);
1537
1538        atomic_dec(&sdev->num_mr);
1539
1540        return ERR_PTR(rv);
1541}
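
/*
 * Illustrative sketch only (compiled out): kernel ULPs do not call
 * ->get_dma_mr() directly. siw_get_dma_mr() is reached from the core
 * when a PD is allocated with the IB_PD_UNSAFE_GLOBAL_RKEY flag (and,
 * absent a device-local DMA lkey, for the PD's internal MR as well).
 * The variable names are hypothetical; ib_alloc_pd() and
 * pd->unsafe_global_rkey are the standard core API as understood here.
 */
#if 0
        struct ib_pd *pd;
        u32 rkey;

        pd = ib_alloc_pd(ibdev, IB_PD_UNSAFE_GLOBAL_RKEY);
        if (IS_ERR(pd))
                return PTR_ERR(pd);
        /* rkey covering all of local memory - handle with care */
        rkey = pd->unsafe_global_rkey;
#endif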
1542
1543/*
1544 * siw_create_srq()
1545 *
1546 * Create a Shared Receive Queue with attributes @init_attrs
1547 * within the protection domain of @base_srq.
1548 *
1549 * @base_srq:   Base SRQ contained in siw SRQ.
1550 * @init_attrs: SRQ init attributes.
1551 * @udata:      points to user context
1552 */
1553int siw_create_srq(struct ib_srq *base_srq,
1554                   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1555{
1556        struct siw_srq *srq = to_siw_srq(base_srq);
1557        struct ib_srq_attr *attrs = &init_attrs->attr;
1558        struct siw_device *sdev = to_siw_dev(base_srq->device);
1559        struct siw_ucontext *ctx =
1560                rdma_udata_to_drv_context(udata, struct siw_ucontext,
1561                                          base_ucontext);
1562        int rv;
1563
1564        if (init_attrs->srq_type != IB_SRQT_BASIC)
1565                return -EOPNOTSUPP;
1566
1567        if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1568                siw_dbg_pd(base_srq->pd, "too many SRQs\n");
1569                rv = -ENOMEM;
1570                goto err_out;
1571        }
1572        if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1573            attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1574                rv = -EINVAL;
1575                goto err_out;
1576        }
1577        srq->max_sge = attrs->max_sge;
1578        srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1579        srq->limit = attrs->srq_limit;
1580        if (srq->limit)
1581                srq->armed = true;
1582
1583        srq->is_kernel_res = !udata;
1584
1585        if (udata)
1586                srq->recvq =
1587                        vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1588        else
1589                srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
1590
1591        if (srq->recvq == NULL) {
1592                rv = -ENOMEM;
1593                goto err_out;
1594        }
1595        if (udata) {
1596                struct siw_uresp_create_srq uresp = {};
1597                size_t length = srq->num_rqe * sizeof(struct siw_rqe);
1598
1599                srq->srq_entry =
1600                        siw_mmap_entry_insert(ctx, srq->recvq,
1601                                              length, &uresp.srq_key);
1602                if (!srq->srq_entry) {
1603                        rv = -ENOMEM;
1604                        goto err_out;
1605                }
1606
1607                uresp.num_rqe = srq->num_rqe;
1608
1609                if (udata->outlen < sizeof(uresp)) {
1610                        rv = -EINVAL;
1611                        goto err_out;
1612                }
1613                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1614                if (rv)
1615                        goto err_out;
1616        }
1617        spin_lock_init(&srq->lock);
1618
1619        siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1620
1621        return 0;
1622
1623err_out:
1624        if (srq->recvq) {
1625                if (ctx)
1626                        rdma_user_mmap_entry_remove(srq->srq_entry);
1627                vfree(srq->recvq);
1628        }
1629        atomic_dec(&sdev->num_srq);
1630
1631        return rv;
1632}
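
/*
 * Illustrative sketch only (compiled out): how a kernel ULP would reach
 * siw_create_srq() through the core ib_create_srq() verb. The attribute
 * values are arbitrary examples and must satisfy the bounds checked
 * above (max_wr <= SIW_MAX_SRQ_WR, max_sge <= SIW_MAX_SGE,
 * srq_limit <= max_wr).
 */
#if 0
        struct ib_srq_init_attr init_attr = {
                .srq_type = IB_SRQT_BASIC,
                .attr = {
                        .max_wr = 256,
                        .max_sge = 4,
                        .srq_limit = 32, /* non-zero: arm limit event */
                },
        };
        struct ib_srq *srq;

        srq = ib_create_srq(pd, &init_attr);
        if (IS_ERR(srq))
                return PTR_ERR(srq);
#endif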
1633
1634/*
1635 * siw_modify_srq()
1636 *
1637 * Modify SRQ. The caller may resize SRQ and/or set/reset notification
1638 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
1639 *
1640 * NOTE: it is unclear whether the RDMA core allows changing the MAX_SGE
1641 * parameter; siw_modify_srq() does not check attrs->max_sge.
1642 */
1643int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1644                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1645{
1646        struct siw_srq *srq = to_siw_srq(base_srq);
1647        unsigned long flags;
1648        int rv = 0;
1649
1650        spin_lock_irqsave(&srq->lock, flags);
1651
1652        if (attr_mask & IB_SRQ_MAX_WR) {
1653                /* resize request not yet supported */
1654                rv = -EOPNOTSUPP;
1655                goto out;
1656        }
1657        if (attr_mask & IB_SRQ_LIMIT) {
1658                if (attrs->srq_limit) {
1659                        if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1660                                rv = -EINVAL;
1661                                goto out;
1662                        }
1663                        srq->armed = true;
1664                } else {
1665                        srq->armed = false;
1666                }
1667                srq->limit = attrs->srq_limit;
1668        }
1669out:
1670        spin_unlock_irqrestore(&srq->lock, flags);
1671
1672        return rv;
1673}
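
/*
 * Illustrative sketch only (compiled out): re-arming the SRQ limit from
 * a ULP. A resize request (IB_SRQ_MAX_WR) would be rejected by the
 * function above with -EOPNOTSUPP. The limit value is an arbitrary
 * example.
 */
#if 0
        struct ib_srq_attr attr = { .srq_limit = 32 };

        rv = ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
#endif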
1674
1675/*
1676 * siw_query_srq()
1677 *
1678 * Query SRQ attributes.
1679 */
1680int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1681{
1682        struct siw_srq *srq = to_siw_srq(base_srq);
1683        unsigned long flags;
1684
1685        spin_lock_irqsave(&srq->lock, flags);
1686
1687        attrs->max_wr = srq->num_rqe;
1688        attrs->max_sge = srq->max_sge;
1689        attrs->srq_limit = srq->limit;
1690
1691        spin_unlock_irqrestore(&srq->lock, flags);
1692
1693        return 0;
1694}
1695
1696/*
1697 * siw_destroy_srq()
1698 *
1699 * Destroy SRQ.
1700 * It is assumed that the SRQ is no longer referenced by any
1701 * QP; the code trusts the RDMA core environment to keep track
1702 * of QP references.
1703 */
1704int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1705{
1706        struct siw_srq *srq = to_siw_srq(base_srq);
1707        struct siw_device *sdev = to_siw_dev(base_srq->device);
1708        struct siw_ucontext *ctx =
1709                rdma_udata_to_drv_context(udata, struct siw_ucontext,
1710                                          base_ucontext);
1711
1712        if (ctx)
1713                rdma_user_mmap_entry_remove(srq->srq_entry);
1714        vfree(srq->recvq);
1715        atomic_dec(&sdev->num_srq);
1716        return 0;
1717}
1718
1719/*
1720 * siw_post_srq_recv()
1721 *
1722 * Post a list of receive queue elements to SRQ.
1723 * NOTE: The function does not check or lock the SRQ state
1724 *       during the post operation. The code simply trusts the
1725 *       RDMA core environment.
1726 *
1727 * @base_srq:   Base SRQ contained in siw SRQ
1728 * @wr:         List of R-WR's
1729 * @bad_wr:     Updated to failing WR if posting fails.
1730 */
1731int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1732                      const struct ib_recv_wr **bad_wr)
1733{
1734        struct siw_srq *srq = to_siw_srq(base_srq);
1735        unsigned long flags;
1736        int rv = 0;
1737
1738        if (unlikely(!srq->is_kernel_res)) {
1739                siw_dbg_pd(base_srq->pd,
1740                           "[SRQ]: no kernel post_recv for mapped srq\n");
1741                rv = -EINVAL;
1742                goto out;
1743        }
1744        /*
1745         * Serialize potentially multiple producers.
1746         * Also needed to serialize potentially multiple
1747         * consumers.
1748         */
1749        spin_lock_irqsave(&srq->lock, flags);
1750
1751        while (wr) {
1752                u32 idx = srq->rq_put % srq->num_rqe;
1753                struct siw_rqe *rqe = &srq->recvq[idx];
1754
1755                if (rqe->flags) {
1756                        siw_dbg_pd(base_srq->pd, "SRQ full\n");
1757                        rv = -ENOMEM;
1758                        break;
1759                }
1760                if (unlikely(wr->num_sge > srq->max_sge)) {
1761                        siw_dbg_pd(base_srq->pd,
1762                                   "[SRQ]: too many SGEs: %d\n", wr->num_sge);
1763                        rv = -EINVAL;
1764                        break;
1765                }
1766                rqe->id = wr->wr_id;
1767                rqe->num_sge = wr->num_sge;
1768                siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1769
1770                /* Make sure S-RQE is completely written before valid */
1771                smp_wmb();
1772
1773                rqe->flags = SIW_WQE_VALID;
1774
1775                srq->rq_put++;
1776                wr = wr->next;
1777        }
1778        spin_unlock_irqrestore(&srq->lock, flags);
1779out:
1780        if (unlikely(rv < 0)) {
1781                siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1782                *bad_wr = wr;
1783        }
1784        return rv;
1785}
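
/*
 * Illustrative sketch only (compiled out): posting a single receive
 * buffer to a kernel SRQ via the core ib_post_srq_recv() verb, which
 * lands in the function above. dma_addr, len and ctx_ptr are assumed
 * to describe an already DMA-mapped buffer and its completion context;
 * all names are hypothetical.
 */
#if 0
        struct ib_sge sge = {
                .addr = dma_addr,
                .length = len,
                .lkey = pd->local_dma_lkey,
        };
        struct ib_recv_wr wr = {
                .wr_id = (u64)(uintptr_t)ctx_ptr,
                .sg_list = &sge,
                .num_sge = 1,
        };
        const struct ib_recv_wr *bad_wr;

        rv = ib_post_srq_recv(srq, &wr, &bad_wr);
#endif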
1786
1787void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1788{
1789        struct ib_event event;
1790        struct ib_qp *base_qp = &qp->base_qp;
1791
1792        /*
1793         * Do not report asynchronous errors on a QP which gets
1794         * destroyed via the verbs interface (siw_destroy_qp()).
1795         */
1796        if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1797                return;
1798
1799        event.event = etype;
1800        event.device = base_qp->device;
1801        event.element.qp = base_qp;
1802
1803        if (base_qp->event_handler) {
1804                siw_dbg_qp(qp, "reporting event %d\n", etype);
1805                base_qp->event_handler(&event, base_qp->qp_context);
1806        }
1807}
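
/*
 * Illustrative sketch only (compiled out): the affiliated QP events
 * reported above arrive at the event handler a ULP supplies in its
 * struct ib_qp_init_attr at QP creation time. Names below are
 * hypothetical.
 */
#if 0
static void example_qp_event(struct ib_event *event, void *context)
{
        pr_info("QP event %d\n", event->event);
}

static struct ib_qp *example_create_qp(struct ib_pd *pd,
                                       struct ib_qp_init_attr *attrs)
{
        attrs->event_handler = example_qp_event;
        attrs->qp_context = NULL;
        return ib_create_qp(pd, attrs);
}
#endif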
1808
1809void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1810{
1811        struct ib_event event;
1812        struct ib_cq *base_cq = &cq->base_cq;
1813
1814        event.event = etype;
1815        event.device = base_cq->device;
1816        event.element.cq = base_cq;
1817
1818        if (base_cq->event_handler) {
1819                siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1820                base_cq->event_handler(&event, base_cq->cq_context);
1821        }
1822}
1823
1824void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1825{
1826        struct ib_event event;
1827        struct ib_srq *base_srq = &srq->base_srq;
1828
1829        event.event = etype;
1830        event.device = base_srq->device;
1831        event.element.srq = base_srq;
1832
1833        if (base_srq->event_handler) {
1834                siw_dbg_pd(srq->base_srq.pd,
1835                           "reporting SRQ event %d\n", etype);
1836                base_srq->event_handler(&event, base_srq->srq_context);
1837        }
1838}
1839
1840void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
1841{
1842        struct ib_event event;
1843
1844        event.event = etype;
1845        event.device = &sdev->base_dev;
1846        event.element.port_num = port;
1847
1848        siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1849
1850        ib_dispatch_event(&event);
1851}
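
/*
 * Illustrative sketch only (compiled out): the unaffiliated port events
 * dispatched above via ib_dispatch_event() are delivered to handlers a
 * ULP registers with ib_register_event_handler(). The handler and its
 * registration helper below are hypothetical names around the standard
 * INIT_IB_EVENT_HANDLER()/ib_register_event_handler() core API.
 */
#if 0
static void example_async_event(struct ib_event_handler *handler,
                                struct ib_event *event)
{
        if (event->event == IB_EVENT_PORT_ACTIVE ||
            event->event == IB_EVENT_PORT_ERR)
                pr_info("port %u changed state (event %d)\n",
                        event->element.port_num, event->event);
}

static struct ib_event_handler example_handler;

static void example_register_events(struct ib_device *ibdev)
{
        INIT_IB_EVENT_HANDLER(&example_handler, ibdev, example_async_event);
        ib_register_event_handler(&example_handler);
}
#endif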
1852