linux/drivers/infiniband/sw/siw/siw_verbs.c
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2
   3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
   4/* Copyright (c) 2008-2019, IBM Corporation */
   5
   6#include <linux/errno.h>
   7#include <linux/types.h>
   8#include <linux/uaccess.h>
   9#include <linux/vmalloc.h>
  10#include <linux/xarray.h>
  11
  12#include <rdma/iw_cm.h>
  13#include <rdma/ib_verbs.h>
  14#include <rdma/ib_user_verbs.h>
  15#include <rdma/uverbs_ioctl.h>
  16
  17#include "siw.h"
  18#include "siw_verbs.h"
  19#include "siw_mem.h"
  20
  21static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
  22        [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
  23        [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
  24        [IB_QPS_RTR] = SIW_QP_STATE_RTR,
  25        [IB_QPS_RTS] = SIW_QP_STATE_RTS,
  26        [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
  27        [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
  28        [IB_QPS_ERR] = SIW_QP_STATE_ERROR
  29};
  30
  31static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
  32        [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
  33        [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
  34        [IB_QPS_ERR] = "ERR"
  35};
  36
  37static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
  38{
  39        struct siw_uobj *uobj;
  40        struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
  41        u32 key;
  42
  43        uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
  44        if (!uobj)
  45                return SIW_INVAL_UOBJ_KEY;
  46
  47        if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
  48                            GFP_KERNEL) < 0) {
  49                kfree(uobj);
  50                return SIW_INVAL_UOBJ_KEY;
  51        }
  52        uobj->size = PAGE_ALIGN(size);
  53        uobj->addr = vaddr;
  54
  55        return key;
  56}
  57
  58static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
  59                                     unsigned long off, u32 size)
  60{
  61        struct siw_uobj *uobj = xa_load(&uctx->xa, off);
  62
  63        if (uobj && uobj->size == size)
  64                return uobj;
  65
  66        return NULL;
  67}
  68
  69int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
  70{
  71        struct siw_ucontext *uctx = to_siw_ctx(ctx);
  72        struct siw_uobj *uobj;
  73        unsigned long off = vma->vm_pgoff;
  74        int size = vma->vm_end - vma->vm_start;
  75        int rv = -EINVAL;
  76
  77        /*
  78         * Must be page aligned
  79         */
  80        if (vma->vm_start & (PAGE_SIZE - 1)) {
  81                pr_warn("siw: mmap not page aligned\n");
  82                goto out;
  83        }
  84        uobj = siw_get_uobj(uctx, off, size);
  85        if (!uobj) {
  86                siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
  87                        off, size);
  88                goto out;
  89        }
  90        rv = remap_vmalloc_range(vma, uobj->addr, 0);
  91        if (rv)
  92                pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
  93out:
  94        return rv;
  95}
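
/*
 * Illustrative sketch of the userland side of this mmap scheme (assuming a
 * provider along the lines of rdma-core's siw library; names hypothetical):
 *
 *	sq = mmap(NULL, uresp.num_sqe * sizeof(struct siw_sqe),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED,
 *		  cmd_fd, uresp.sq_key);
 *
 * uresp.sq_key is the xarray key shifted left by PAGE_SHIFT, so the
 * vma->vm_pgoff seen in siw_mmap() equals the key again and siw_get_uobj()
 * can look up the vmalloc()'ed queue to remap.
 */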
  96
  97int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
  98{
  99        struct siw_device *sdev = to_siw_dev(base_ctx->device);
 100        struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
 101        struct siw_uresp_alloc_ctx uresp = {};
 102        int rv;
 103
 104        if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
 105                rv = -ENOMEM;
 106                goto err_out;
 107        }
 108        xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
 109        ctx->uobj_nextkey = 0;
 110        ctx->sdev = sdev;
 111
 112        uresp.dev_id = sdev->vendor_part_id;
 113
 114        if (udata->outlen < sizeof(uresp)) {
 115                rv = -EINVAL;
 116                goto err_out;
 117        }
 118        rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 119        if (rv)
 120                goto err_out;
 121
 122        siw_dbg(base_ctx->device, "success. now %d context(s)\n",
 123                atomic_read(&sdev->num_ctx));
 124
 125        return 0;
 126
 127err_out:
 128        atomic_dec(&sdev->num_ctx);
 129        siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
 130                atomic_read(&sdev->num_ctx));
 131
 132        return rv;
 133}
 134
 135void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
 136{
 137        struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
 138        void *entry;
 139        unsigned long index;
 140
 141        /*
 142         * Make sure all user mmap objects are gone. Since QP, CQ
 143         * and SRQ destroy routines destroy related objects, nothing
 144         * should be found here.
 145         */
 146        xa_for_each(&uctx->xa, index, entry) {
 147                kfree(xa_erase(&uctx->xa, index));
 148                pr_warn("siw: dropping orphaned uobj at %lu\n", index);
 149        }
 150        xa_destroy(&uctx->xa);
 151        atomic_dec(&uctx->sdev->num_ctx);
 152}
 153
 154int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
 155                     struct ib_udata *udata)
 156{
 157        struct siw_device *sdev = to_siw_dev(base_dev);
 158
 159        if (udata->inlen || udata->outlen)
 160                return -EINVAL;
 161
 162        memset(attr, 0, sizeof(*attr));
 163
 164        /* Revisit atomic caps if RFC 7306 gets supported */
 165        attr->atomic_cap = 0;
 166        attr->device_cap_flags =
 167                IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
 168        attr->max_cq = sdev->attrs.max_cq;
 169        attr->max_cqe = sdev->attrs.max_cqe;
 170        attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
 171        attr->max_fmr = sdev->attrs.max_fmr;
 172        attr->max_mr = sdev->attrs.max_mr;
 173        attr->max_mw = sdev->attrs.max_mw;
 174        attr->max_mr_size = ~0ull;
 175        attr->max_pd = sdev->attrs.max_pd;
 176        attr->max_qp = sdev->attrs.max_qp;
 177        attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
 178        attr->max_qp_rd_atom = sdev->attrs.max_ord;
 179        attr->max_qp_wr = sdev->attrs.max_qp_wr;
 180        attr->max_recv_sge = sdev->attrs.max_sge;
 181        attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
 182        attr->max_send_sge = sdev->attrs.max_sge;
 183        attr->max_sge_rd = sdev->attrs.max_sge_rd;
 184        attr->max_srq = sdev->attrs.max_srq;
 185        attr->max_srq_sge = sdev->attrs.max_srq_sge;
 186        attr->max_srq_wr = sdev->attrs.max_srq_wr;
 187        attr->page_size_cap = PAGE_SIZE;
 188        attr->vendor_id = SIW_VENDOR_ID;
 189        attr->vendor_part_id = sdev->vendor_part_id;
 190
 191        memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
 192
 193        return 0;
 194}
 195
 196int siw_query_port(struct ib_device *base_dev, u8 port,
 197                   struct ib_port_attr *attr)
 198{
 199        struct siw_device *sdev = to_siw_dev(base_dev);
 200
 201        memset(attr, 0, sizeof(*attr));
 202
  203        attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
  204        attr->active_mtu = attr->max_mtu;
  205        attr->active_speed = 2;
  206        attr->active_width = 2;
  207        attr->gid_tbl_len = 1;
  208        attr->max_msg_sz = -1;
 209        attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3;
 210        attr->pkey_tbl_len = 1;
 211        attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
 212        attr->state = sdev->state;
 213        /*
 214         * All zero
 215         *
 216         * attr->lid = 0;
 217         * attr->bad_pkey_cntr = 0;
 218         * attr->qkey_viol_cntr = 0;
 219         * attr->sm_lid = 0;
 220         * attr->lmc = 0;
 221         * attr->max_vl_num = 0;
 222         * attr->sm_sl = 0;
 223         * attr->subnet_timeout = 0;
  224         * attr->init_type_reply = 0;
 225         */
 226        return 0;
 227}
 228
 229int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
 230                           struct ib_port_immutable *port_immutable)
 231{
 232        struct ib_port_attr attr;
 233        int rv = siw_query_port(base_dev, port, &attr);
 234
 235        if (rv)
 236                return rv;
 237
 238        port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
 239        port_immutable->gid_tbl_len = attr.gid_tbl_len;
 240        port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
 241
 242        return 0;
 243}
 244
 245int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
 246{
 247        /* Report the default pkey */
 248        *pkey = 0xffff;
 249        return 0;
 250}
 251
 252int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
 253                  union ib_gid *gid)
 254{
 255        struct siw_device *sdev = to_siw_dev(base_dev);
 256
 257        /* subnet_prefix == interface_id == 0; */
 258        memset(gid, 0, sizeof(*gid));
 259        memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
 260
 261        return 0;
 262}
 263
 264int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 265{
 266        struct siw_device *sdev = to_siw_dev(pd->device);
 267
 268        if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
 269                atomic_dec(&sdev->num_pd);
 270                return -ENOMEM;
 271        }
  272        siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));
 273
 274        return 0;
 275}
 276
 277void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 278{
 279        struct siw_device *sdev = to_siw_dev(pd->device);
 280
 281        siw_dbg_pd(pd, "free PD\n");
 282        atomic_dec(&sdev->num_pd);
 283}
 284
 285void siw_qp_get_ref(struct ib_qp *base_qp)
 286{
 287        siw_qp_get(to_siw_qp(base_qp));
 288}
 289
 290void siw_qp_put_ref(struct ib_qp *base_qp)
 291{
 292        siw_qp_put(to_siw_qp(base_qp));
 293}
 294
 295/*
 296 * siw_create_qp()
 297 *
 298 * Create QP of requested size on given device.
 299 *
 300 * @pd:         Protection Domain
 301 * @attrs:      Initial QP attributes.
 302 * @udata:      used to provide QP ID, SQ and RQ size back to user.
 303 */
 304
 305struct ib_qp *siw_create_qp(struct ib_pd *pd,
 306                            struct ib_qp_init_attr *attrs,
 307                            struct ib_udata *udata)
 308{
 309        struct siw_qp *qp = NULL;
 310        struct siw_base_qp *siw_base_qp = NULL;
 311        struct ib_device *base_dev = pd->device;
 312        struct siw_device *sdev = to_siw_dev(base_dev);
 313        struct siw_ucontext *uctx =
 314                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 315                                          base_ucontext);
 316        struct siw_cq *scq = NULL, *rcq = NULL;
 317        unsigned long flags;
 318        int num_sqe, num_rqe, rv = 0;
 319
 320        siw_dbg(base_dev, "create new QP\n");
 321
 322        if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
 323                siw_dbg(base_dev, "too many QP's\n");
 324                rv = -ENOMEM;
 325                goto err_out;
 326        }
 327        if (attrs->qp_type != IB_QPT_RC) {
 328                siw_dbg(base_dev, "only RC QP's supported\n");
 329                rv = -EINVAL;
 330                goto err_out;
 331        }
 332        if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
 333            (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
 334            (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
 335            (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
 336                siw_dbg(base_dev, "QP size error\n");
 337                rv = -EINVAL;
 338                goto err_out;
 339        }
 340        if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
 341                siw_dbg(base_dev, "max inline send: %d > %d\n",
 342                        attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
 343                rv = -EINVAL;
 344                goto err_out;
 345        }
  346        /*
  347         * NOTE: zero-length SGLs are allowed for SQ and RQ WQEs,
  348         * but the QP must be able to hold at least one WQE (SQ + RQ).
  349         */
 350        if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
 351                siw_dbg(base_dev, "QP must have send or receive queue\n");
 352                rv = -EINVAL;
 353                goto err_out;
 354        }
 355        scq = to_siw_cq(attrs->send_cq);
 356        rcq = to_siw_cq(attrs->recv_cq);
 357
 358        if (!scq || (!rcq && !attrs->srq)) {
 359                siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
 360                rv = -EINVAL;
 361                goto err_out;
 362        }
 363        siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
 364        if (!siw_base_qp) {
 365                rv = -ENOMEM;
 366                goto err_out;
 367        }
 368        qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 369        if (!qp) {
 370                rv = -ENOMEM;
 371                goto err_out;
 372        }
 373        siw_base_qp->qp = qp;
 374        qp->ib_qp = &siw_base_qp->base_qp;
 375
 376        init_rwsem(&qp->state_lock);
 377        spin_lock_init(&qp->sq_lock);
 378        spin_lock_init(&qp->rq_lock);
 379        spin_lock_init(&qp->orq_lock);
 380
 381        qp->kernel_verbs = !udata;
 382        qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
 383        qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;
 384
 385        rv = siw_qp_add(sdev, qp);
 386        if (rv)
 387                goto err_out;
 388
 389        /* All queue indices are derived from modulo operations
 390         * on a free running 'get' (consumer) and 'put' (producer)
 391         * unsigned counter. Having queue sizes at power of two
 392         * avoids handling counter wrap around.
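         * E.g. with sq_size == 8, the free running sq_put values 6, 7, 8, 9
         * select slots 6, 7, 0, 1. And since 2^32 is a multiple of any power
         * of two queue size, even the wrap of the u32 counter itself leaves
         * that sequence intact.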
 393         */
 394        num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
 395        num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);
 396
 397        if (qp->kernel_verbs)
 398                qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
 399        else
 400                qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
 401
 402        if (qp->sendq == NULL) {
 403                siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
 404                rv = -ENOMEM;
 405                goto err_out_xa;
 406        }
 407        if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
 408                if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
 409                        qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
 410                else {
 411                        rv = -EINVAL;
 412                        goto err_out_xa;
 413                }
 414        }
 415        qp->pd = pd;
 416        qp->scq = scq;
 417        qp->rcq = rcq;
 418
 419        if (attrs->srq) {
 420                /*
 421                 * SRQ support.
 422                 * Verbs 6.3.7: ignore RQ size, if SRQ present
 423                 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
 424                 */
 425                qp->srq = to_siw_srq(attrs->srq);
 426                qp->attrs.rq_size = 0;
 427                siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->qp_num);
 428        } else if (num_rqe) {
 429                if (qp->kernel_verbs)
 430                        qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
 431                else
 432                        qp->recvq =
 433                                vmalloc_user(num_rqe * sizeof(struct siw_rqe));
 434
 435                if (qp->recvq == NULL) {
 436                        siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
 437                        rv = -ENOMEM;
 438                        goto err_out_xa;
 439                }
 440                qp->attrs.rq_size = num_rqe;
 441        }
 442        qp->attrs.sq_size = num_sqe;
 443        qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
 444        qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
 445
 446        /* Make those two tunables fixed for now. */
 447        qp->tx_ctx.gso_seg_limit = 1;
 448        qp->tx_ctx.zcopy_tx = zcopy_tx;
 449
 450        qp->attrs.state = SIW_QP_STATE_IDLE;
 451
 452        if (udata) {
 453                struct siw_uresp_create_qp uresp = {};
 454
 455                uresp.num_sqe = num_sqe;
 456                uresp.num_rqe = num_rqe;
 457                uresp.qp_id = qp_id(qp);
 458
 459                if (qp->sendq) {
 460                        qp->xa_sq_index =
 461                                siw_create_uobj(uctx, qp->sendq,
 462                                        num_sqe * sizeof(struct siw_sqe));
 463                }
 464                if (qp->recvq) {
 465                        qp->xa_rq_index =
 466                                 siw_create_uobj(uctx, qp->recvq,
 467                                        num_rqe * sizeof(struct siw_rqe));
 468                }
 469                if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
 470                    qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
 471                        rv = -ENOMEM;
 472                        goto err_out_xa;
 473                }
 474                uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
 475                uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;
 476
 477                if (udata->outlen < sizeof(uresp)) {
 478                        rv = -EINVAL;
 479                        goto err_out_xa;
 480                }
 481                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 482                if (rv)
 483                        goto err_out_xa;
 484        }
 485        qp->tx_cpu = siw_get_tx_cpu(sdev);
 486        if (qp->tx_cpu < 0) {
 487                rv = -EINVAL;
 488                goto err_out_xa;
 489        }
 490        INIT_LIST_HEAD(&qp->devq);
 491        spin_lock_irqsave(&sdev->lock, flags);
 492        list_add_tail(&qp->devq, &sdev->qp_list);
 493        spin_unlock_irqrestore(&sdev->lock, flags);
 494
 495        return qp->ib_qp;
 496
 497err_out_xa:
 498        xa_erase(&sdev->qp_xa, qp_id(qp));
 499err_out:
 500        kfree(siw_base_qp);
 501
 502        if (qp) {
 503                if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
 504                        kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
 505                if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
 506                        kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
 507
 508                vfree(qp->sendq);
 509                vfree(qp->recvq);
 510                kfree(qp);
 511        }
 512        atomic_dec(&sdev->num_qp);
 513
 514        return ERR_PTR(rv);
 515}
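
/*
 * Illustrative sketch (hypothetical names) of how a kernel client reaches
 * this verb:
 *
 *	struct ib_qp_init_attr init_attr = {
 *		.qp_type     = IB_QPT_RC,
 *		.send_cq     = cq,
 *		.recv_cq     = cq,
 *		.sq_sig_type = IB_SIGNAL_REQ_WR,
 *		.cap = {
 *			.max_send_wr     = 64,
 *			.max_recv_wr     = 64,
 *			.max_send_sge    = 2,
 *			.max_recv_sge    = 2,
 *			.max_inline_data = 64,
 *		},
 *	};
 *	struct ib_qp *qp = ib_create_qp(pd, &init_attr);
 *
 * iWARP clients typically go through rdma_create_qp(cm_id, pd, &init_attr)
 * instead; either path dispatches to siw_create_qp() for a siw device.
 */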
 516
 517/*
 518 * Minimum siw_query_qp() verb interface.
 519 *
 520 * @qp_attr_mask is not used but all available information is provided
 521 */
 522int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
 523                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
 524{
 525        struct siw_qp *qp;
 526        struct siw_device *sdev;
 527
 528        if (base_qp && qp_attr && qp_init_attr) {
 529                qp = to_siw_qp(base_qp);
 530                sdev = to_siw_dev(base_qp->device);
 531        } else {
 532                return -EINVAL;
 533        }
 534        qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
 535        qp_attr->cap.max_send_wr = qp->attrs.sq_size;
 536        qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
 537        qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
 538        qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
 539        qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 540        qp_attr->max_rd_atomic = qp->attrs.irq_size;
 541        qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
 542
 543        qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
 544                                   IB_ACCESS_REMOTE_WRITE |
 545                                   IB_ACCESS_REMOTE_READ;
 546
 547        qp_init_attr->qp_type = base_qp->qp_type;
 548        qp_init_attr->send_cq = base_qp->send_cq;
 549        qp_init_attr->recv_cq = base_qp->recv_cq;
 550        qp_init_attr->srq = base_qp->srq;
 551
 552        qp_init_attr->cap = qp_attr->cap;
 553
 554        return 0;
 555}
 556
 557int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
 558                        int attr_mask, struct ib_udata *udata)
 559{
 560        struct siw_qp_attrs new_attrs;
 561        enum siw_qp_attr_mask siw_attr_mask = 0;
 562        struct siw_qp *qp = to_siw_qp(base_qp);
 563        int rv = 0;
 564
 565        if (!attr_mask)
 566                return 0;
 567
 568        memset(&new_attrs, 0, sizeof(new_attrs));
 569
 570        if (attr_mask & IB_QP_ACCESS_FLAGS) {
 571                siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
 572
 573                if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
 574                        new_attrs.flags |= SIW_RDMA_READ_ENABLED;
 575                if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
 576                        new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
 577                if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
 578                        new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
 579        }
 580        if (attr_mask & IB_QP_STATE) {
 581                siw_dbg_qp(qp, "desired IB QP state: %s\n",
 582                           ib_qp_state_to_string[attr->qp_state]);
 583
 584                new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
 585
 586                if (new_attrs.state > SIW_QP_STATE_RTS)
 587                        qp->tx_ctx.tx_suspend = 1;
 588
 589                siw_attr_mask |= SIW_QP_ATTR_STATE;
 590        }
 591        if (!siw_attr_mask)
 592                goto out;
 593
 594        down_write(&qp->state_lock);
 595
 596        rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
 597
 598        up_write(&qp->state_lock);
 599out:
 600        return rv;
 601}
 602
 603int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
 604{
 605        struct siw_qp *qp = to_siw_qp(base_qp);
 606        struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
 607        struct siw_ucontext *uctx =
 608                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 609                                          base_ucontext);
 610        struct siw_qp_attrs qp_attrs;
 611
 612        siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
 613
 614        /*
 615         * Mark QP as in process of destruction to prevent from
 616         * any async callbacks to RDMA core
 617         */
 618        qp->attrs.flags |= SIW_QP_IN_DESTROY;
 619        qp->rx_stream.rx_suspend = 1;
 620
 621        if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
 622                kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
 623        if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
 624                kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
 625
 626        down_write(&qp->state_lock);
 627
 628        qp_attrs.state = SIW_QP_STATE_ERROR;
 629        siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
 630
 631        if (qp->cep) {
 632                siw_cep_put(qp->cep);
 633                qp->cep = NULL;
 634        }
 635        up_write(&qp->state_lock);
 636
 637        kfree(qp->tx_ctx.mpa_crc_hd);
 638        kfree(qp->rx_stream.mpa_crc_hd);
 639
 640        qp->scq = qp->rcq = NULL;
 641
 642        siw_qp_put(qp);
 643        kfree(siw_base_qp);
 644
 645        return 0;
 646}
 647
 648/*
 649 * siw_copy_inline_sgl()
 650 *
  651 * Prepare sgl of inlined data for sending. For userland callers, the
  652 * function checks whether the given buffer addresses and lengths are
  653 * within process context bounds.
 654 * Data from all provided sge's are copied together into the wqe,
 655 * referenced by a single sge.
 656 */
 657static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
 658                               struct siw_sqe *sqe)
 659{
 660        struct ib_sge *core_sge = core_wr->sg_list;
 661        void *kbuf = &sqe->sge[1];
 662        int num_sge = core_wr->num_sge, bytes = 0;
 663
 664        sqe->sge[0].laddr = (uintptr_t)kbuf;
 665        sqe->sge[0].lkey = 0;
 666
 667        while (num_sge--) {
 668                if (!core_sge->length) {
 669                        core_sge++;
 670                        continue;
 671                }
 672                bytes += core_sge->length;
 673                if (bytes > SIW_MAX_INLINE) {
 674                        bytes = -EINVAL;
 675                        break;
 676                }
 677                memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
 678                       core_sge->length);
 679
 680                kbuf += core_sge->length;
 681                core_sge++;
 682        }
 683        sqe->sge[0].length = bytes > 0 ? bytes : 0;
 684        sqe->num_sge = bytes > 0 ? 1 : 0;
 685
 686        return bytes;
 687}
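
/*
 * Illustrative sketch (hypothetical names): a kernel client posting a small
 * header as inline data, which ends up in siw_copy_inline_sgl() above.
 *
 *	struct ib_sge sge = {
 *		.addr   = (uintptr_t)hdr,	// kernel virtual address
 *		.length = sizeof(*hdr),
 *		.lkey   = 0,			// ignored for inline data
 *	};
 *	struct ib_send_wr wr = {
 *		.wr_id      = tx_cookie,
 *		.sg_list    = &sge,
 *		.num_sge    = 1,
 *		.opcode     = IB_WR_SEND,
 *		.send_flags = IB_SEND_INLINE | IB_SEND_SIGNALED,
 *	};
 *	const struct ib_send_wr *bad_wr;
 *	int rc = ib_post_send(qp, &wr, &bad_wr);
 *
 * The payload (at most SIW_MAX_INLINE bytes in total) is copied into the
 * SQE, so 'hdr' may be reused as soon as ib_post_send() returns.
 */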
 688
 689/*
 690 * siw_post_send()
 691 *
 692 * Post a list of S-WR's to a SQ.
 693 *
 694 * @base_qp:    Base QP contained in siw QP
 695 * @wr:         Null terminated list of user WR's
 696 * @bad_wr:     Points to failing WR in case of synchronous failure.
 697 */
 698int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
 699                  const struct ib_send_wr **bad_wr)
 700{
 701        struct siw_qp *qp = to_siw_qp(base_qp);
 702        struct siw_wqe *wqe = tx_wqe(qp);
 703
 704        unsigned long flags;
 705        int rv = 0;
 706
 707        /*
 708         * Try to acquire QP state lock. Must be non-blocking
  709         * to accommodate kernel clients' needs.
 710         */
 711        if (!down_read_trylock(&qp->state_lock)) {
 712                *bad_wr = wr;
 713                siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
 714                return -ENOTCONN;
 715        }
 716        if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
 717                up_read(&qp->state_lock);
 718                *bad_wr = wr;
 719                siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
 720                return -ENOTCONN;
 721        }
 722        if (wr && !qp->kernel_verbs) {
 723                siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
 724                up_read(&qp->state_lock);
 725                *bad_wr = wr;
 726                return -EINVAL;
 727        }
 728        spin_lock_irqsave(&qp->sq_lock, flags);
 729
 730        while (wr) {
 731                u32 idx = qp->sq_put % qp->attrs.sq_size;
 732                struct siw_sqe *sqe = &qp->sendq[idx];
 733
 734                if (sqe->flags) {
 735                        siw_dbg_qp(qp, "sq full\n");
 736                        rv = -ENOMEM;
 737                        break;
 738                }
 739                if (wr->num_sge > qp->attrs.sq_max_sges) {
 740                        siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
 741                        rv = -EINVAL;
 742                        break;
 743                }
 744                sqe->id = wr->wr_id;
 745
 746                if ((wr->send_flags & IB_SEND_SIGNALED) ||
 747                    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
 748                        sqe->flags |= SIW_WQE_SIGNALLED;
 749
 750                if (wr->send_flags & IB_SEND_FENCE)
 751                        sqe->flags |= SIW_WQE_READ_FENCE;
 752
 753                switch (wr->opcode) {
 754                case IB_WR_SEND:
 755                case IB_WR_SEND_WITH_INV:
 756                        if (wr->send_flags & IB_SEND_SOLICITED)
 757                                sqe->flags |= SIW_WQE_SOLICITED;
 758
 759                        if (!(wr->send_flags & IB_SEND_INLINE)) {
 760                                siw_copy_sgl(wr->sg_list, sqe->sge,
 761                                             wr->num_sge);
 762                                sqe->num_sge = wr->num_sge;
 763                        } else {
 764                                rv = siw_copy_inline_sgl(wr, sqe);
 765                                if (rv <= 0) {
 766                                        rv = -EINVAL;
 767                                        break;
 768                                }
 769                                sqe->flags |= SIW_WQE_INLINE;
 770                                sqe->num_sge = 1;
 771                        }
 772                        if (wr->opcode == IB_WR_SEND)
 773                                sqe->opcode = SIW_OP_SEND;
 774                        else {
 775                                sqe->opcode = SIW_OP_SEND_REMOTE_INV;
 776                                sqe->rkey = wr->ex.invalidate_rkey;
 777                        }
 778                        break;
 779
 780                case IB_WR_RDMA_READ_WITH_INV:
 781                case IB_WR_RDMA_READ:
 782                        /*
 783                         * iWarp restricts RREAD sink to SGL containing
 784                         * 1 SGE only. we could relax to SGL with multiple
 785                         * elements referring the SAME ltag or even sending
 786                         * a private per-rreq tag referring to a checked
 787                         * local sgl with MULTIPLE ltag's.
 788                         */
 789                        if (unlikely(wr->num_sge != 1)) {
 790                                rv = -EINVAL;
 791                                break;
 792                        }
 793                        siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
 794                        /*
 795                         * NOTE: zero length RREAD is allowed!
 796                         */
 797                        sqe->raddr = rdma_wr(wr)->remote_addr;
 798                        sqe->rkey = rdma_wr(wr)->rkey;
 799                        sqe->num_sge = 1;
 800
 801                        if (wr->opcode == IB_WR_RDMA_READ)
 802                                sqe->opcode = SIW_OP_READ;
 803                        else
 804                                sqe->opcode = SIW_OP_READ_LOCAL_INV;
 805                        break;
 806
 807                case IB_WR_RDMA_WRITE:
 808                        if (!(wr->send_flags & IB_SEND_INLINE)) {
 809                                siw_copy_sgl(wr->sg_list, &sqe->sge[0],
 810                                             wr->num_sge);
 811                                sqe->num_sge = wr->num_sge;
 812                        } else {
 813                                rv = siw_copy_inline_sgl(wr, sqe);
 814                                if (unlikely(rv < 0)) {
 815                                        rv = -EINVAL;
 816                                        break;
 817                                }
 818                                sqe->flags |= SIW_WQE_INLINE;
 819                                sqe->num_sge = 1;
 820                        }
 821                        sqe->raddr = rdma_wr(wr)->remote_addr;
 822                        sqe->rkey = rdma_wr(wr)->rkey;
 823                        sqe->opcode = SIW_OP_WRITE;
 824                        break;
 825
 826                case IB_WR_REG_MR:
 827                        sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
 828                        sqe->rkey = reg_wr(wr)->key;
 829                        sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
 830                        sqe->opcode = SIW_OP_REG_MR;
 831                        break;
 832
 833                case IB_WR_LOCAL_INV:
 834                        sqe->rkey = wr->ex.invalidate_rkey;
 835                        sqe->opcode = SIW_OP_INVAL_STAG;
 836                        break;
 837
 838                default:
 839                        siw_dbg_qp(qp, "ib wr type %d unsupported\n",
 840                                   wr->opcode);
 841                        rv = -EINVAL;
 842                        break;
 843                }
 844                siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
 845                           sqe->opcode, sqe->flags,
 846                           (void *)(uintptr_t)sqe->id);
 847
 848                if (unlikely(rv < 0))
 849                        break;
 850
 851                /* make SQE only valid after completely written */
 852                smp_wmb();
 853                sqe->flags |= SIW_WQE_VALID;
 854
 855                qp->sq_put++;
 856                wr = wr->next;
 857        }
 858
 859        /*
 860         * Send directly if SQ processing is not in progress.
 861         * Eventual immediate errors (rv < 0) do not affect the involved
 862         * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
 863         * processing, if new work is already pending. But rv must be passed
 864         * to caller.
 865         */
 866        if (wqe->wr_status != SIW_WR_IDLE) {
 867                spin_unlock_irqrestore(&qp->sq_lock, flags);
 868                goto skip_direct_sending;
 869        }
 870        rv = siw_activate_tx(qp);
 871        spin_unlock_irqrestore(&qp->sq_lock, flags);
 872
 873        if (rv <= 0)
 874                goto skip_direct_sending;
 875
 876        if (qp->kernel_verbs) {
 877                rv = siw_sq_start(qp);
 878        } else {
 879                qp->tx_ctx.in_syscall = 1;
 880
 881                if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
 882                        siw_qp_cm_drop(qp, 0);
 883
 884                qp->tx_ctx.in_syscall = 0;
 885        }
 886skip_direct_sending:
 887
 888        up_read(&qp->state_lock);
 889
 890        if (rv >= 0)
 891                return 0;
 892        /*
 893         * Immediate error
 894         */
 895        siw_dbg_qp(qp, "error %d\n", rv);
 896
 897        *bad_wr = wr;
 898        return rv;
 899}
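
/*
 * Illustrative sketch (hypothetical names): a kernel client pushing data to
 * a peer with an RDMA WRITE, which takes the IB_WR_RDMA_WRITE branch above.
 *
 *	struct ib_sge sge = {
 *		.addr   = src_dma_addr,
 *		.length = len,
 *		.lkey   = pd->local_dma_lkey,
 *	};
 *	struct ib_rdma_wr rdma = {
 *		.wr = {
 *			.wr_id      = tx_cookie,
 *			.sg_list    = &sge,
 *			.num_sge    = 1,
 *			.opcode     = IB_WR_RDMA_WRITE,
 *			.send_flags = IB_SEND_SIGNALED,
 *		},
 *		.remote_addr = peer_addr,
 *		.rkey        = peer_rkey,
 *	};
 *	const struct ib_send_wr *bad_wr;
 *	int rc = ib_post_send(qp, &rdma.wr, &bad_wr);
 *
 * The completion for the signalled WQE is reaped later from the send CQ.
 */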
 900
 901/*
 902 * siw_post_receive()
 903 *
 904 * Post a list of R-WR's to a RQ.
 905 *
 906 * @base_qp:    Base QP contained in siw QP
 907 * @wr:         Null terminated list of user WR's
 908 * @bad_wr:     Points to failing WR in case of synchronous failure.
 909 */
 910int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
 911                     const struct ib_recv_wr **bad_wr)
 912{
 913        struct siw_qp *qp = to_siw_qp(base_qp);
 914        unsigned long flags;
 915        int rv = 0;
 916
 917        if (qp->srq) {
 918                *bad_wr = wr;
 919                return -EOPNOTSUPP; /* what else from errno.h? */
 920        }
 921        /*
 922         * Try to acquire QP state lock. Must be non-blocking
  923         * to accommodate kernel clients' needs.
 924         */
 925        if (!down_read_trylock(&qp->state_lock)) {
 926                *bad_wr = wr;
 927                return -ENOTCONN;
 928        }
 929        if (!qp->kernel_verbs) {
 930                siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
 931                up_read(&qp->state_lock);
 932                *bad_wr = wr;
 933                return -EINVAL;
 934        }
 935        if (qp->attrs.state > SIW_QP_STATE_RTS) {
 936                up_read(&qp->state_lock);
 937                *bad_wr = wr;
 938                return -EINVAL;
 939        }
 940        /*
 941         * Serialize potentially multiple producers.
 942         * Not needed for single threaded consumer side.
 943         */
 944        spin_lock_irqsave(&qp->rq_lock, flags);
 945
 946        while (wr) {
 947                u32 idx = qp->rq_put % qp->attrs.rq_size;
 948                struct siw_rqe *rqe = &qp->recvq[idx];
 949
 950                if (rqe->flags) {
 951                        siw_dbg_qp(qp, "RQ full\n");
 952                        rv = -ENOMEM;
 953                        break;
 954                }
 955                if (wr->num_sge > qp->attrs.rq_max_sges) {
 956                        siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
 957                        rv = -EINVAL;
 958                        break;
 959                }
 960                rqe->id = wr->wr_id;
 961                rqe->num_sge = wr->num_sge;
 962                siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
 963
 964                /* make sure RQE is completely written before valid */
 965                smp_wmb();
 966
 967                rqe->flags = SIW_WQE_VALID;
 968
 969                qp->rq_put++;
 970                wr = wr->next;
 971        }
 972        spin_unlock_irqrestore(&qp->rq_lock, flags);
 973
 974        up_read(&qp->state_lock);
 975
 976        if (rv < 0) {
 977                siw_dbg_qp(qp, "error %d\n", rv);
 978                *bad_wr = wr;
 979        }
 980        return rv > 0 ? 0 : rv;
 981}
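
/*
 * Illustrative sketch (hypothetical names): a kernel client pre-posting a
 * receive buffer so that an incoming SEND finds an RQE.
 *
 *	struct ib_sge sge = {
 *		.addr   = rx_dma_addr,
 *		.length = rx_len,
 *		.lkey   = pd->local_dma_lkey,
 *	};
 *	struct ib_recv_wr wr = {
 *		.wr_id   = rx_cookie,
 *		.sg_list = &sge,
 *		.num_sge = 1,
 *	};
 *	const struct ib_recv_wr *bad_wr;
 *	int rc = ib_post_recv(qp, &wr, &bad_wr);
 */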
 982
 983void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
 984{
 985        struct siw_cq *cq = to_siw_cq(base_cq);
 986        struct siw_device *sdev = to_siw_dev(base_cq->device);
 987        struct siw_ucontext *ctx =
 988                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 989                                          base_ucontext);
 990
 991        siw_dbg_cq(cq, "free CQ resources\n");
 992
 993        siw_cq_flush(cq);
 994
 995        if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
 996                kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
 997
 998        atomic_dec(&sdev->num_cq);
 999
1000        vfree(cq->queue);
1001}
1002
1003/*
1004 * siw_create_cq()
1005 *
1006 * Populate CQ of requested size
1007 *
1008 * @base_cq: CQ as allocated by RDMA midlayer
1009 * @attr: Initial CQ attributes
1010 * @udata: relates to user context
1011 */
1012
1013int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1014                  struct ib_udata *udata)
1015{
1016        struct siw_device *sdev = to_siw_dev(base_cq->device);
1017        struct siw_cq *cq = to_siw_cq(base_cq);
1018        int rv, size = attr->cqe;
1019
1020        if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1021                siw_dbg(base_cq->device, "too many CQ's\n");
1022                rv = -ENOMEM;
1023                goto err_out;
1024        }
1025        if (size < 1 || size > sdev->attrs.max_cqe) {
1026                siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1027                rv = -EINVAL;
1028                goto err_out;
1029        }
1030        size = roundup_pow_of_two(size);
1031        cq->base_cq.cqe = size;
1032        cq->num_cqe = size;
1033        cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;
1034
1035        if (!udata) {
1036                cq->kernel_verbs = 1;
1037                cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1038                                    sizeof(struct siw_cq_ctrl));
1039        } else {
1040                cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1041                                         sizeof(struct siw_cq_ctrl));
1042        }
1043        if (cq->queue == NULL) {
1044                rv = -ENOMEM;
1045                goto err_out;
1046        }
1047        get_random_bytes(&cq->id, 4);
1048        siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1049
1050        spin_lock_init(&cq->lock);
1051
1052        cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1053
1054        if (udata) {
1055                struct siw_uresp_create_cq uresp = {};
1056                struct siw_ucontext *ctx =
1057                        rdma_udata_to_drv_context(udata, struct siw_ucontext,
1058                                                  base_ucontext);
1059
1060                cq->xa_cq_index =
1061                        siw_create_uobj(ctx, cq->queue,
1062                                        size * sizeof(struct siw_cqe) +
1063                                                sizeof(struct siw_cq_ctrl));
1064                if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
1065                        rv = -ENOMEM;
1066                        goto err_out;
1067                }
1068                uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
1069                uresp.cq_id = cq->id;
1070                uresp.num_cqe = size;
1071
1072                if (udata->outlen < sizeof(uresp)) {
1073                        rv = -EINVAL;
1074                        goto err_out;
1075                }
1076                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1077                if (rv)
1078                        goto err_out;
1079        }
1080        return 0;
1081
1082err_out:
 1083        siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
1084
1085        if (cq && cq->queue) {
1086                struct siw_ucontext *ctx =
1087                        rdma_udata_to_drv_context(udata, struct siw_ucontext,
1088                                                  base_ucontext);
1089                if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
1090                        kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
1091                vfree(cq->queue);
1092        }
1093        atomic_dec(&sdev->num_cq);
1094
1095        return rv;
1096}
1097
1098/*
1099 * siw_poll_cq()
1100 *
1101 * Reap CQ entries if available and copy work completion status into
1102 * array of WC's provided by caller. Returns number of reaped CQE's.
1103 *
1104 * @base_cq:    Base CQ contained in siw CQ.
1105 * @num_cqe:    Maximum number of CQE's to reap.
1106 * @wc:         Array of work completions to be filled by siw.
1107 */
1108int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1109{
1110        struct siw_cq *cq = to_siw_cq(base_cq);
1111        int i;
1112
1113        for (i = 0; i < num_cqe; i++) {
1114                if (!siw_reap_cqe(cq, wc))
1115                        break;
1116                wc++;
1117        }
1118        return i;
1119}
1120
1121/*
1122 * siw_req_notify_cq()
1123 *
1124 * Request notification for new CQE's added to that CQ.
1125 * Defined flags:
1126 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1127 *   event if a WQE with notification flag set enters the CQ
1128 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1129 *   event if a WQE enters the CQ.
 1130 * o IB_CQ_REPORT_MISSED_EVENTS: the return value provides the
 1131 *   number of CQE's not yet reaped, regardless of their notification
 1132 *   type and the current or new CQ notification settings.
1133 *
1134 * @base_cq:    Base CQ contained in siw CQ.
1135 * @flags:      Requested notification flags.
1136 */
1137int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1138{
1139        struct siw_cq *cq = to_siw_cq(base_cq);
1140
1141        siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1142
1143        if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1144                /*
1145                 * Enable CQ event for next solicited completion.
1146                 * and make it visible to all associated producers.
1147                 */
1148                smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1149        else
1150                /*
1151                 * Enable CQ event for any signalled completion.
1152                 * and make it visible to all associated producers.
1153                 */
1154                smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1155
1156        if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1157                return cq->cq_put - cq->cq_get;
1158
1159        return 0;
1160}
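
/*
 * Illustrative sketch of the usual poll/arm loop a kernel client runs from
 * its CQ completion handler (hypothetical process_wc()):
 *
 *	struct ib_wc wc;
 *
 *	do {
 *		while (ib_poll_cq(cq, 1, &wc) > 0)
 *			process_wc(&wc);
 *	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
 *				  IB_CQ_REPORT_MISSED_EVENTS) > 0);
 *
 * The positive IB_CQ_REPORT_MISSED_EVENTS return (cq_put - cq_get above)
 * closes the race between the final poll and re-arming the CQ.
 */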
1161
1162/*
1163 * siw_dereg_mr()
1164 *
1165 * Release Memory Region.
1166 *
1167 * @base_mr: Base MR contained in siw MR.
1168 * @udata: points to user context, unused.
1169 */
1170int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1171{
1172        struct siw_mr *mr = to_siw_mr(base_mr);
1173        struct siw_device *sdev = to_siw_dev(base_mr->device);
1174
1175        siw_dbg_mem(mr->mem, "deregister MR\n");
1176
1177        atomic_dec(&sdev->num_mr);
1178
1179        siw_mr_drop_mem(mr);
1180        kfree_rcu(mr, rcu);
1181
1182        return 0;
1183}
1184
1185/*
1186 * siw_reg_user_mr()
1187 *
1188 * Register Memory Region.
1189 *
1190 * @pd:         Protection Domain
1191 * @start:      starting address of MR (virtual address)
1192 * @len:        len of MR
1193 * @rnic_va:    not used by siw
1194 * @rights:     MR access rights
1195 * @udata:      user buffer to communicate STag and Key.
1196 */
1197struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1198                              u64 rnic_va, int rights, struct ib_udata *udata)
1199{
1200        struct siw_mr *mr = NULL;
1201        struct siw_umem *umem = NULL;
1202        struct siw_ureq_reg_mr ureq;
1203        struct siw_device *sdev = to_siw_dev(pd->device);
1204
1205        unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
1206        int rv;
1207
1208        siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1209                   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1210                   (unsigned long long)len);
1211
1212        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1213                siw_dbg_pd(pd, "too many mr's\n");
1214                rv = -ENOMEM;
1215                goto err_out;
1216        }
1217        if (!len) {
1218                rv = -EINVAL;
1219                goto err_out;
1220        }
1221        if (mem_limit != RLIM_INFINITY) {
1222                unsigned long num_pages =
1223                        (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
1224                mem_limit >>= PAGE_SHIFT;
1225
1226                if (num_pages > mem_limit - current->mm->locked_vm) {
1227                        siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
1228                                   num_pages, mem_limit,
1229                                   current->mm->locked_vm);
1230                        rv = -ENOMEM;
1231                        goto err_out;
1232                }
1233        }
1234        umem = siw_umem_get(start, len, ib_access_writable(rights));
1235        if (IS_ERR(umem)) {
1236                rv = PTR_ERR(umem);
1237                siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1238                umem = NULL;
1239                goto err_out;
1240        }
1241        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1242        if (!mr) {
1243                rv = -ENOMEM;
1244                goto err_out;
1245        }
1246        rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1247        if (rv)
1248                goto err_out;
1249
1250        if (udata) {
1251                struct siw_uresp_reg_mr uresp = {};
1252                struct siw_mem *mem = mr->mem;
1253
1254                if (udata->inlen < sizeof(ureq)) {
1255                        rv = -EINVAL;
1256                        goto err_out;
1257                }
1258                rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1259                if (rv)
1260                        goto err_out;
1261
1262                mr->base_mr.lkey |= ureq.stag_key;
1263                mr->base_mr.rkey |= ureq.stag_key;
1264                mem->stag |= ureq.stag_key;
1265                uresp.stag = mem->stag;
1266
1267                if (udata->outlen < sizeof(uresp)) {
1268                        rv = -EINVAL;
1269                        goto err_out;
1270                }
1271                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1272                if (rv)
1273                        goto err_out;
1274        }
1275        mr->mem->stag_valid = 1;
1276
1277        return &mr->base_mr;
1278
1279err_out:
1280        atomic_dec(&sdev->num_mr);
1281        if (mr) {
1282                if (mr->mem)
1283                        siw_mr_drop_mem(mr);
1284                kfree_rcu(mr, rcu);
1285        } else {
1286                if (umem)
1287                        siw_umem_release(umem, false);
1288        }
1289        return ERR_PTR(rv);
1290}
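
/*
 * Illustrative sketch of the userland path into this verb (hypothetical
 * buffer): libibverbs' ibv_reg_mr() ends up here via the uverbs layer.
 *
 *	struct ibv_mr *mr = ibv_reg_mr(pd, buf, buf_len,
 *				       IBV_ACCESS_LOCAL_WRITE |
 *				       IBV_ACCESS_REMOTE_WRITE |
 *				       IBV_ACCESS_REMOTE_READ);
 *
 * The siw userspace provider also passes a stag_key in the request udata,
 * which is OR'ed into the lkey/rkey returned above.
 */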
1291
1292struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1293                           u32 max_sge, struct ib_udata *udata)
1294{
1295        struct siw_device *sdev = to_siw_dev(pd->device);
1296        struct siw_mr *mr = NULL;
1297        struct siw_pbl *pbl = NULL;
1298        int rv;
1299
1300        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1301                siw_dbg_pd(pd, "too many mr's\n");
1302                rv = -ENOMEM;
1303                goto err_out;
1304        }
1305        if (mr_type != IB_MR_TYPE_MEM_REG) {
1306                siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1307                rv = -EOPNOTSUPP;
1308                goto err_out;
1309        }
1310        if (max_sge > SIW_MAX_SGE_PBL) {
1311                siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1312                rv = -ENOMEM;
1313                goto err_out;
1314        }
1315        pbl = siw_pbl_alloc(max_sge);
1316        if (IS_ERR(pbl)) {
1317                rv = PTR_ERR(pbl);
1318                siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1319                pbl = NULL;
1320                goto err_out;
1321        }
1322        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1323        if (!mr) {
1324                rv = -ENOMEM;
1325                goto err_out;
1326        }
1327        rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1328        if (rv)
1329                goto err_out;
1330
1331        mr->mem->is_pbl = 1;
1332
1333        siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1334
1335        return &mr->base_mr;
1336
1337err_out:
1338        atomic_dec(&sdev->num_mr);
1339
1340        if (!mr) {
1341                kfree(pbl);
1342        } else {
1343                if (mr->mem)
1344                        siw_mr_drop_mem(mr);
1345                kfree_rcu(mr, rcu);
1346        }
1347        siw_dbg_pd(pd, "failed: %d\n", rv);
1348
1349        return ERR_PTR(rv);
1350}
1351
1352/* Just used to count number of pages being mapped */
1353static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1354{
1355        return 0;
1356}
1357
1358int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1359                  unsigned int *sg_off)
1360{
1361        struct scatterlist *slp;
1362        struct siw_mr *mr = to_siw_mr(base_mr);
1363        struct siw_mem *mem = mr->mem;
1364        struct siw_pbl *pbl = mem->pbl;
1365        struct siw_pble *pble;
1366        unsigned long pbl_size;
1367        int i, rv;
1368
1369        if (!pbl) {
1370                siw_dbg_mem(mem, "no PBL allocated\n");
1371                return -EINVAL;
1372        }
1373        pble = pbl->pbe;
1374
1375        if (pbl->max_buf < num_sle) {
1376                siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
 1377                            num_sle, mem->pbl->max_buf);
1378                return -ENOMEM;
1379        }
1380        for_each_sg(sl, slp, num_sle, i) {
1381                if (sg_dma_len(slp) == 0) {
1382                        siw_dbg_mem(mem, "empty SGE\n");
1383                        return -EINVAL;
1384                }
1385                if (i == 0) {
1386                        pble->addr = sg_dma_address(slp);
1387                        pble->size = sg_dma_len(slp);
1388                        pble->pbl_off = 0;
1389                        pbl_size = pble->size;
1390                        pbl->num_buf = 1;
1391                } else {
1392                        /* Merge PBL entries if adjacent */
1393                        if (pble->addr + pble->size == sg_dma_address(slp)) {
1394                                pble->size += sg_dma_len(slp);
1395                        } else {
1396                                pble++;
1397                                pbl->num_buf++;
1398                                pble->addr = sg_dma_address(slp);
1399                                pble->size = sg_dma_len(slp);
1400                                pble->pbl_off = pbl_size;
1401                        }
1402                        pbl_size += sg_dma_len(slp);
1403                }
1404                siw_dbg_mem(mem,
1405                        "sge[%d], size %u, addr 0x%p, total %lu\n",
1406                        i, pble->size, (void *)(uintptr_t)pble->addr,
1407                        pbl_size);
1408        }
1409        rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1410        if (rv > 0) {
1411                mem->len = base_mr->length;
1412                mem->va = base_mr->iova;
1413                siw_dbg_mem(mem,
1414                        "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1415                        mem->len, (void *)(uintptr_t)mem->va, num_sle,
1416                        pbl->num_buf);
1417        }
1418        return rv;
1419}
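
/*
 * Illustrative sketch (hypothetical names): fast memory registration as a
 * kernel client drives it against an MR from siw_alloc_mr().
 *
 *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_pages);
 *	int n = ib_map_mr_sg(mr, sgl, sg_nents, NULL, PAGE_SIZE);
 *
 *	if (n == sg_nents) {
 *		struct ib_reg_wr reg = {
 *			.wr.opcode     = IB_WR_REG_MR,
 *			.wr.send_flags = IB_SEND_SIGNALED,
 *			.mr            = mr,
 *			.key           = mr->rkey,
 *			.access        = IB_ACCESS_LOCAL_WRITE |
 *					 IB_ACCESS_REMOTE_READ,
 *		};
 *		const struct ib_send_wr *bad_wr;
 *		int rc = ib_post_send(qp, &reg.wr, &bad_wr);
 *	}
 *
 * siw_map_mr_sg() builds the PBL above; posting IB_WR_REG_MR then activates
 * the registration (handled as SIW_OP_REG_MR in siw_post_send()).
 */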
1420
1421/*
1422 * siw_get_dma_mr()
1423 *
 1424 * Create an (empty) DMA memory region, where no umem is attached.
1425 */
1426struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1427{
1428        struct siw_device *sdev = to_siw_dev(pd->device);
1429        struct siw_mr *mr = NULL;
1430        int rv;
1431
1432        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1433                siw_dbg_pd(pd, "too many mr's\n");
1434                rv = -ENOMEM;
1435                goto err_out;
1436        }
1437        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1438        if (!mr) {
1439                rv = -ENOMEM;
1440                goto err_out;
1441        }
1442        rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1443        if (rv)
1444                goto err_out;
1445
1446        mr->mem->stag_valid = 1;
1447
1448        siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1449
1450        return &mr->base_mr;
1451
1452err_out:
1453        if (rv)
1454                kfree(mr);
1455
1456        atomic_dec(&sdev->num_mr);
1457
1458        return ERR_PTR(rv);
1459}
1460
1461/*
1462 * siw_create_srq()
1463 *
1464 * Create Shared Receive Queue of attributes @init_attrs
1465 * within protection domain given by @pd.
1466 *
1467 * @base_srq:   Base SRQ contained in siw SRQ.
1468 * @init_attrs: SRQ init attributes.
1469 * @udata:      points to user context
1470 */
1471int siw_create_srq(struct ib_srq *base_srq,
1472                   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1473{
1474        struct siw_srq *srq = to_siw_srq(base_srq);
1475        struct ib_srq_attr *attrs = &init_attrs->attr;
1476        struct siw_device *sdev = to_siw_dev(base_srq->device);
1477        struct siw_ucontext *ctx =
1478                rdma_udata_to_drv_context(udata, struct siw_ucontext,
1479                                          base_ucontext);
1480        int rv;
1481
1482        if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1483                siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
1484                rv = -ENOMEM;
1485                goto err_out;
1486        }
1487        if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1488            attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1489                rv = -EINVAL;
1490                goto err_out;
1491        }
1492        srq->max_sge = attrs->max_sge;
1493        srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1494        srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
1495        srq->limit = attrs->srq_limit;
1496        if (srq->limit)
1497                srq->armed = 1;
1498
1499        srq->kernel_verbs = !udata;
1500
1501        if (udata)
1502                srq->recvq =
1503                        vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1504        else
1505                srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
1506
1507        if (srq->recvq == NULL) {
1508                rv = -ENOMEM;
1509                goto err_out;
1510        }
1511        if (udata) {
1512                struct siw_uresp_create_srq uresp = {};
1513
1514                srq->xa_srq_index = siw_create_uobj(
1515                        ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));
1516
1517                if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
1518                        rv = -ENOMEM;
1519                        goto err_out;
1520                }
1521                uresp.srq_key = srq->xa_srq_index;
1522                uresp.num_rqe = srq->num_rqe;
1523
1524                if (udata->outlen < sizeof(uresp)) {
1525                        rv = -EINVAL;
1526                        goto err_out;
1527                }
1528                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1529                if (rv)
1530                        goto err_out;
1531        }
1532        spin_lock_init(&srq->lock);
1533
1534        siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1535
1536        return 0;
1537
1538err_out:
1539        if (srq->recvq) {
1540                if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
1541                        kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
1542                vfree(srq->recvq);
1543        }
1544        atomic_dec(&sdev->num_srq);
1545
1546        return rv;
1547}
1548
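/*
 * Illustrative sketch, not part of this driver: a kernel ULP reaches
 * siw_create_srq() through the ib_create_srq() core helper. The PD is
 * assumed to exist already; queue sizes are arbitrary example values
 * that must respect SIW_MAX_SRQ_WR and SIW_MAX_SGE.
 */
static struct ib_srq *example_create_srq(struct ib_pd *pd)
{
        struct ib_srq_init_attr init = {
                .srq_type = IB_SRQT_BASIC,
                .attr = {
                        .max_wr = 256,  /* rounded up to a power of two above */
                        .max_sge = 2,
                        .srq_limit = 0, /* 0: limit event initially disarmed */
                },
        };

        return ib_create_srq(pd, &init);
}
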
1549/*
1550 * siw_modify_srq()
1551 *
1552 * Modify SRQ. Resizing is not yet supported; the caller may set/reset the
1553 * notification limit and (re)arm the IB_EVENT_SRQ_LIMIT_REACHED notification.
1554 *
1555 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
1556 * parameter. siw_modify_srq() does not check the attrs->max_sge param.
1557 */
1558int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1559                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1560{
1561        struct siw_srq *srq = to_siw_srq(base_srq);
1562        unsigned long flags;
1563        int rv = 0;
1564
1565        spin_lock_irqsave(&srq->lock, flags);
1566
1567        if (attr_mask & IB_SRQ_MAX_WR) {
1568                /* resize request not yet supported */
1569                rv = -EOPNOTSUPP;
1570                goto out;
1571        }
1572        if (attr_mask & IB_SRQ_LIMIT) {
1573                if (attrs->srq_limit) {
1574                        if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1575                                rv = -EINVAL;
1576                                goto out;
1577                        }
1578                        srq->armed = 1;
1579                } else {
1580                        srq->armed = 0;
1581                }
1582                srq->limit = attrs->srq_limit;
1583        }
1584out:
1585        spin_unlock_irqrestore(&srq->lock, flags);
1586
1587        return rv;
1588}
1589
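/*
 * Illustrative sketch, not part of this driver: (re)arming the SRQ limit
 * through the ib_modify_srq() core helper. A non-zero srq_limit arms the
 * IB_EVENT_SRQ_LIMIT_REACHED notification; watermark value and function
 * name are assumptions.
 */
static int example_arm_srq_limit(struct ib_srq *srq, u32 watermark)
{
        struct ib_srq_attr attr = { .srq_limit = watermark };

        return ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
}
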
1590/*
1591 * siw_query_srq()
1592 *
1593 * Query SRQ attributes.
1594 */
1595int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1596{
1597        struct siw_srq *srq = to_siw_srq(base_srq);
1598        unsigned long flags;
1599
1600        spin_lock_irqsave(&srq->lock, flags);
1601
1602        attrs->max_wr = srq->num_rqe;
1603        attrs->max_sge = srq->max_sge;
1604        attrs->srq_limit = srq->limit;
1605
1606        spin_unlock_irqrestore(&srq->lock, flags);
1607
1608        return 0;
1609}
1610
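/*
 * Illustrative sketch, not part of this driver: reading back current SRQ
 * attributes via the ib_query_srq() core helper, e.g. to learn the
 * effective (rounded-up) queue depth after creation.
 */
static int example_srq_depth(struct ib_srq *srq, u32 *depth)
{
        struct ib_srq_attr attr;
        int rv = ib_query_srq(srq, &attr);

        if (!rv)
                *depth = attr.max_wr;
        return rv;
}
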
1611/*
1612 * siw_destroy_srq()
1613 *
1614 * Destroy SRQ.
1615 * It is assumed that the SRQ is no longer referenced by any
1616 * QP; the code trusts the RDMA core environment to keep track
1617 * of QP references.
1618 */
1619void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1620{
1621        struct siw_srq *srq = to_siw_srq(base_srq);
1622        struct siw_device *sdev = to_siw_dev(base_srq->device);
1623        struct siw_ucontext *ctx =
1624                rdma_udata_to_drv_context(udata, struct siw_ucontext,
1625                                          base_ucontext);
1626
1627        if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
1628                kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
1629
1630        vfree(srq->recvq);
1631        atomic_dec(&sdev->num_srq);
1632}
1633
1634/*
1635 * siw_post_srq_recv()
1636 *
1637 * Post a list of receive queue elements to SRQ.
1638 * NOTE: The function does not check or lock the SRQ state
1639 *       during the post operation. The code simply trusts the
1640 *       RDMA core environment.
1641 *
1642 * @base_srq:   Base SRQ contained in siw SRQ
1643 * @wr:         List of receive work requests to post.
1644 * @bad_wr:     Updated to failing WR if posting fails.
1645 */
1646int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1647                      const struct ib_recv_wr **bad_wr)
1648{
1649        struct siw_srq *srq = to_siw_srq(base_srq);
1650        unsigned long flags;
1651        int rv = 0;
1652
1653        if (unlikely(!srq->kernel_verbs)) {
1654                siw_dbg_pd(base_srq->pd,
1655                           "[SRQ]: no kernel post_recv for mapped srq\n");
1656                rv = -EINVAL;
1657                goto out;
1658        }
1659        /*
1660         * Serialize potentially multiple producers.
1661         * Also needed to serialize potentially multiple
1662         * consumers.
1663         */
1664        spin_lock_irqsave(&srq->lock, flags);
1665
1666        while (wr) {
1667                u32 idx = srq->rq_put % srq->num_rqe;
1668                struct siw_rqe *rqe = &srq->recvq[idx];
1669
1670                if (rqe->flags) {
1671                        siw_dbg_pd(base_srq->pd, "SRQ full\n");
1672                        rv = -ENOMEM;
1673                        break;
1674                }
1675                if (unlikely(wr->num_sge > srq->max_sge)) {
1676                        siw_dbg_pd(base_srq->pd,
1677                                   "[SRQ]: too many sges: %d\n", wr->num_sge);
1678                        rv = -EINVAL;
1679                        break;
1680                }
1681                rqe->id = wr->wr_id;
1682                rqe->num_sge = wr->num_sge;
1683                siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1684
1685                /* Make sure S-RQE is completely written before valid */
1686                smp_wmb();
1687
1688                rqe->flags = SIW_WQE_VALID;
1689
1690                srq->rq_put++;
1691                wr = wr->next;
1692        }
1693        spin_unlock_irqrestore(&srq->lock, flags);
1694out:
1695        if (unlikely(rv < 0)) {
1696                siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1697                *bad_wr = wr;
1698        }
1699        return rv;
1700}
1701
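/*
 * Illustrative sketch, not part of this driver: a kernel ULP posting a
 * single-SGE receive buffer to an SRQ via the ib_post_srq_recv() core
 * helper. The DMA address, length and lkey are assumed to come from a
 * previously mapped and registered buffer.
 */
static int example_post_one_srq_recv(struct ib_srq *srq, u64 dma_addr,
                                     u32 length, u32 lkey, u64 wr_id)
{
        const struct ib_recv_wr *bad_wr;
        struct ib_sge sge = {
                .addr = dma_addr,
                .length = length,
                .lkey = lkey,
        };
        struct ib_recv_wr wr = {
                .wr_id = wr_id,
                .sg_list = &sge,
                .num_sge = 1,
        };

        return ib_post_srq_recv(srq, &wr, &bad_wr);
}
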
1702void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1703{
1704        struct ib_event event;
1705        struct ib_qp *base_qp = qp->ib_qp;
1706
1707        /*
1708         * Do not report asynchronous errors on QP which gets
1709         * destroyed via verbs interface (siw_destroy_qp())
1710         */
1711        if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1712                return;
1713
1714        event.event = etype;
1715        event.device = base_qp->device;
1716        event.element.qp = base_qp;
1717
1718        if (base_qp->event_handler) {
1719                siw_dbg_qp(qp, "reporting event %d\n", etype);
1720                base_qp->event_handler(&event, base_qp->qp_context);
1721        }
1722}
1723
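/*
 * Illustrative sketch, not part of this driver: the handler invoked above
 * is whatever the kernel ULP installed in ib_qp_init_attr.event_handler
 * when creating the QP. A minimal callback, name being an assumption:
 */
static void example_qp_event_handler(struct ib_event *event, void *qp_context)
{
        pr_warn("QP async event %d, context %p\n", event->event, qp_context);
}
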
1724void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1725{
1726        struct ib_event event;
1727        struct ib_cq *base_cq = &cq->base_cq;
1728
1729        event.event = etype;
1730        event.device = base_cq->device;
1731        event.element.cq = base_cq;
1732
1733        if (base_cq->event_handler) {
1734                siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1735                base_cq->event_handler(&event, base_cq->cq_context);
1736        }
1737}
1738
1739void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1740{
1741        struct ib_event event;
1742        struct ib_srq *base_srq = &srq->base_srq;
1743
1744        event.event = etype;
1745        event.device = base_srq->device;
1746        event.element.srq = base_srq;
1747
1748        if (base_srq->event_handler) {
1749                siw_dbg_pd(srq->base_srq.pd,
1750                           "reporting SRQ event %d\n", etype);
1751                base_srq->event_handler(&event, base_srq->srq_context);
1752        }
1753}
1754
1755void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
1756{
1757        struct ib_event event;
1758
1759        event.event = etype;
1760        event.device = &sdev->base_dev;
1761        event.element.port_num = port;
1762
1763        siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1764
1765        ib_dispatch_event(&event);
1766}
1767
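/*
 * Illustrative sketch, not part of this driver: port events dispatched
 * above reach kernel clients that registered an asynchronous event
 * handler with the RDMA core. Handler and function names are assumptions.
 */
static void example_async_event_handler(struct ib_event_handler *handler,
                                        struct ib_event *event)
{
        if (event->event == IB_EVENT_PORT_ACTIVE)
                pr_info("port %d became active\n", event->element.port_num);
}

static void example_watch_async_events(struct ib_device *dev,
                                       struct ib_event_handler *handler)
{
        INIT_IB_EVENT_HANDLER(handler, dev, example_async_event_handler);
        ib_register_event_handler(handler);
}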