linux/drivers/infiniband/sw/siw/siw_verbs.c
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2
   3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
   4/* Copyright (c) 2008-2019, IBM Corporation */
   5
   6#include <linux/errno.h>
   7#include <linux/types.h>
   8#include <linux/uaccess.h>
   9#include <linux/vmalloc.h>
  10#include <linux/xarray.h>
  11
  12#include <rdma/iw_cm.h>
  13#include <rdma/ib_verbs.h>
  14#include <rdma/ib_user_verbs.h>
  15#include <rdma/uverbs_ioctl.h>
  16
  17#include "siw.h"
  18#include "siw_verbs.h"
  19#include "siw_mem.h"
  20
  21static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
  22        [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
  23        [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
  24        [IB_QPS_RTR] = SIW_QP_STATE_RTR,
  25        [IB_QPS_RTS] = SIW_QP_STATE_RTS,
  26        [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
  27        [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
  28        [IB_QPS_ERR] = SIW_QP_STATE_ERROR
  29};
  30
  31static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
  32        [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
  33        [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
  34        [IB_QPS_ERR] = "ERR"
  35};
  36
  37static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
  38{
  39        struct siw_uobj *uobj;
  40        struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
  41        u32 key;
  42
  43        uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
  44        if (!uobj)
  45                return SIW_INVAL_UOBJ_KEY;
  46
  47        if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
  48                            GFP_KERNEL) < 0) {
  49                kfree(uobj);
  50                return SIW_INVAL_UOBJ_KEY;
  51        }
  52        uobj->size = PAGE_ALIGN(size);
  53        uobj->addr = vaddr;
  54
  55        return key;
  56}
  57
  58static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
  59                                     unsigned long off, u32 size)
  60{
  61        struct siw_uobj *uobj = xa_load(&uctx->xa, off);
  62
  63        if (uobj && uobj->size == size)
  64                return uobj;
  65
  66        return NULL;
  67}
  68
  69int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
  70{
  71        struct siw_ucontext *uctx = to_siw_ctx(ctx);
  72        struct siw_uobj *uobj;
  73        unsigned long off = vma->vm_pgoff;
  74        int size = vma->vm_end - vma->vm_start;
  75        int rv = -EINVAL;
  76
  77        /*
  78         * Must be page aligned
  79         */
  80        if (vma->vm_start & (PAGE_SIZE - 1)) {
  81                pr_warn("siw: mmap not page aligned\n");
  82                goto out;
  83        }
  84        uobj = siw_get_uobj(uctx, off, size);
  85        if (!uobj) {
  86                siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
  87                        off, size);
  88                goto out;
  89        }
  90        rv = remap_vmalloc_range(vma, uobj->addr, 0);
  91        if (rv)
  92                pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
  93out:
  94        return rv;
  95}
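
/*
 * Illustrative user-space sketch (not part of this file): a provider
 * library would typically mmap() one of the queue objects exported above
 * by passing the key from the create response as the file offset on the
 * uverbs fd, so that it arrives in vma->vm_pgoff for siw_mmap(). The
 * helper name and parameters are assumptions; only mmap(2) is standard.
 */
#include <stdint.h>
#include <sys/mman.h>

static void *map_siw_queue(int uverbs_fd, uint64_t key, size_t size)
{
	/* key is already shifted by PAGE_SHIFT on the kernel side */
	void *q = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		       uverbs_fd, (off_t)key);

	return q == MAP_FAILED ? NULL : q;
}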
  96
  97int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
  98{
  99        struct siw_device *sdev = to_siw_dev(base_ctx->device);
 100        struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
 101        struct siw_uresp_alloc_ctx uresp = {};
 102        int rv;
 103
 104        if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
 105                rv = -ENOMEM;
 106                goto err_out;
 107        }
 108        xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
 109        ctx->uobj_nextkey = 0;
 110        ctx->sdev = sdev;
 111
 112        uresp.dev_id = sdev->vendor_part_id;
 113
 114        if (udata->outlen < sizeof(uresp)) {
 115                rv = -EINVAL;
 116                goto err_out;
 117        }
 118        rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 119        if (rv)
 120                goto err_out;
 121
 122        siw_dbg(base_ctx->device, "success. now %d context(s)\n",
 123                atomic_read(&sdev->num_ctx));
 124
 125        return 0;
 126
 127err_out:
 128        atomic_dec(&sdev->num_ctx);
 129        siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
 130                atomic_read(&sdev->num_ctx));
 131
 132        return rv;
 133}
 134
 135void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
 136{
 137        struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
 138        void *entry;
 139        unsigned long index;
 140
 141        /*
 142         * Make sure all user mmap objects are gone. Since QP, CQ
 143         * and SRQ destroy routines destroy related objects, nothing
 144         * should be found here.
 145         */
 146        xa_for_each(&uctx->xa, index, entry) {
 147                kfree(xa_erase(&uctx->xa, index));
 148                pr_warn("siw: dropping orphaned uobj at %lu\n", index);
 149        }
 150        xa_destroy(&uctx->xa);
 151        atomic_dec(&uctx->sdev->num_ctx);
 152}
 153
 154int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
 155                     struct ib_udata *udata)
 156{
 157        struct siw_device *sdev = to_siw_dev(base_dev);
 158
 159        if (udata->inlen || udata->outlen)
 160                return -EINVAL;
 161
 162        memset(attr, 0, sizeof(*attr));
 163
 164        /* Revisit atomic caps if RFC 7306 gets supported */
 165        attr->atomic_cap = 0;
 166        attr->device_cap_flags =
 167                IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
 168        attr->max_cq = sdev->attrs.max_cq;
 169        attr->max_cqe = sdev->attrs.max_cqe;
 170        attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
 171        attr->max_fmr = sdev->attrs.max_fmr;
 172        attr->max_mr = sdev->attrs.max_mr;
 173        attr->max_mw = sdev->attrs.max_mw;
 174        attr->max_mr_size = ~0ull;
 175        attr->max_pd = sdev->attrs.max_pd;
 176        attr->max_qp = sdev->attrs.max_qp;
 177        attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
 178        attr->max_qp_rd_atom = sdev->attrs.max_ord;
 179        attr->max_qp_wr = sdev->attrs.max_qp_wr;
 180        attr->max_recv_sge = sdev->attrs.max_sge;
 181        attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
 182        attr->max_send_sge = sdev->attrs.max_sge;
 183        attr->max_sge_rd = sdev->attrs.max_sge_rd;
 184        attr->max_srq = sdev->attrs.max_srq;
 185        attr->max_srq_sge = sdev->attrs.max_srq_sge;
 186        attr->max_srq_wr = sdev->attrs.max_srq_wr;
 187        attr->page_size_cap = PAGE_SIZE;
 188        attr->vendor_id = SIW_VENDOR_ID;
 189        attr->vendor_part_id = sdev->vendor_part_id;
 190
 191        memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
 192
 193        return 0;
 194}
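
/*
 * Illustrative user-space sketch (not part of this file): reading the
 * limits reported above through the standard libibverbs call
 * ibv_query_device(). The helper name is made up for illustration.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static int print_siw_caps(struct ibv_context *ctx)
{
	struct ibv_device_attr dattr;
	int rv = ibv_query_device(ctx, &dattr);

	if (rv)
		return rv;

	printf("max_qp %d, max_qp_wr %d, max_sge %d, max_cqe %d\n",
	       dattr.max_qp, dattr.max_qp_wr, dattr.max_sge, dattr.max_cqe);
	return 0;
}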
 195
 196int siw_query_port(struct ib_device *base_dev, u8 port,
 197                   struct ib_port_attr *attr)
 198{
 199        struct siw_device *sdev = to_siw_dev(base_dev);
 200
 201        memset(attr, 0, sizeof(*attr));
 202
  203        attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 204        attr->active_speed = 2;
 205        attr->active_width = 2;
 206        attr->gid_tbl_len = 1;
 207        attr->max_msg_sz = -1;
 208        attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 209        attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
 210                IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
 211        attr->pkey_tbl_len = 1;
 212        attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
 213        attr->state = sdev->state;
 214        /*
 215         * All zero
 216         *
 217         * attr->lid = 0;
 218         * attr->bad_pkey_cntr = 0;
 219         * attr->qkey_viol_cntr = 0;
 220         * attr->sm_lid = 0;
 221         * attr->lmc = 0;
 222         * attr->max_vl_num = 0;
 223         * attr->sm_sl = 0;
 224         * attr->subnet_timeout = 0;
  225         * attr->init_type_reply = 0;
 226         */
 227        return 0;
 228}
 229
 230int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
 231                           struct ib_port_immutable *port_immutable)
 232{
 233        struct ib_port_attr attr;
 234        int rv = siw_query_port(base_dev, port, &attr);
 235
 236        if (rv)
 237                return rv;
 238
 239        port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
 240        port_immutable->gid_tbl_len = attr.gid_tbl_len;
 241        port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
 242
 243        return 0;
 244}
 245
 246int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
 247{
 248        /* Report the default pkey */
 249        *pkey = 0xffff;
 250        return 0;
 251}
 252
 253int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
 254                  union ib_gid *gid)
 255{
 256        struct siw_device *sdev = to_siw_dev(base_dev);
 257
 258        /* subnet_prefix == interface_id == 0; */
 259        memset(gid, 0, sizeof(*gid));
 260        memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
 261
 262        return 0;
 263}
 264
 265int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 266{
 267        struct siw_device *sdev = to_siw_dev(pd->device);
 268
 269        if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
 270                atomic_dec(&sdev->num_pd);
 271                return -ENOMEM;
 272        }
  273        siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));
 274
 275        return 0;
 276}
 277
 278void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 279{
 280        struct siw_device *sdev = to_siw_dev(pd->device);
 281
 282        siw_dbg_pd(pd, "free PD\n");
 283        atomic_dec(&sdev->num_pd);
 284}
 285
 286void siw_qp_get_ref(struct ib_qp *base_qp)
 287{
 288        siw_qp_get(to_siw_qp(base_qp));
 289}
 290
 291void siw_qp_put_ref(struct ib_qp *base_qp)
 292{
 293        siw_qp_put(to_siw_qp(base_qp));
 294}
 295
 296/*
 297 * siw_create_qp()
 298 *
 299 * Create QP of requested size on given device.
 300 *
 301 * @pd:         Protection Domain
 302 * @attrs:      Initial QP attributes.
 303 * @udata:      used to provide QP ID, SQ and RQ size back to user.
 304 */
 305
 306struct ib_qp *siw_create_qp(struct ib_pd *pd,
 307                            struct ib_qp_init_attr *attrs,
 308                            struct ib_udata *udata)
 309{
 310        struct siw_qp *qp = NULL;
 311        struct siw_base_qp *siw_base_qp = NULL;
 312        struct ib_device *base_dev = pd->device;
 313        struct siw_device *sdev = to_siw_dev(base_dev);
 314        struct siw_ucontext *uctx =
 315                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 316                                          base_ucontext);
 317        struct siw_cq *scq = NULL, *rcq = NULL;
 318        unsigned long flags;
 319        int num_sqe, num_rqe, rv = 0;
 320
 321        siw_dbg(base_dev, "create new QP\n");
 322
 323        if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
 324                siw_dbg(base_dev, "too many QP's\n");
 325                rv = -ENOMEM;
 326                goto err_out;
 327        }
 328        if (attrs->qp_type != IB_QPT_RC) {
 329                siw_dbg(base_dev, "only RC QP's supported\n");
 330                rv = -EINVAL;
 331                goto err_out;
 332        }
 333        if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
 334            (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
 335            (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
 336            (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
 337                siw_dbg(base_dev, "QP size error\n");
 338                rv = -EINVAL;
 339                goto err_out;
 340        }
 341        if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
 342                siw_dbg(base_dev, "max inline send: %d > %d\n",
 343                        attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
 344                rv = -EINVAL;
 345                goto err_out;
 346        }
 347        /*
  348         * NOTE: zero-element SGLs are allowed for SQ and RQ WQEs,
  349         * but the QP must be able to hold at least one WQE (SQ + RQ)
 350         */
 351        if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
 352                siw_dbg(base_dev, "QP must have send or receive queue\n");
 353                rv = -EINVAL;
 354                goto err_out;
 355        }
 356        scq = to_siw_cq(attrs->send_cq);
 357        rcq = to_siw_cq(attrs->recv_cq);
 358
 359        if (!scq || (!rcq && !attrs->srq)) {
 360                siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
 361                rv = -EINVAL;
 362                goto err_out;
 363        }
 364        siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
 365        if (!siw_base_qp) {
 366                rv = -ENOMEM;
 367                goto err_out;
 368        }
 369        qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 370        if (!qp) {
 371                rv = -ENOMEM;
 372                goto err_out;
 373        }
 374        siw_base_qp->qp = qp;
 375        qp->ib_qp = &siw_base_qp->base_qp;
 376
 377        init_rwsem(&qp->state_lock);
 378        spin_lock_init(&qp->sq_lock);
 379        spin_lock_init(&qp->rq_lock);
 380        spin_lock_init(&qp->orq_lock);
 381
 382        qp->kernel_verbs = !udata;
 383        qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
 384        qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;
 385
 386        rv = siw_qp_add(sdev, qp);
 387        if (rv)
 388                goto err_out;
 389
 390        /* All queue indices are derived from modulo operations
 391         * on a free running 'get' (consumer) and 'put' (producer)
 392         * unsigned counter. Having queue sizes at power of two
 393         * avoids handling counter wrap around.
 394         */
 395        num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
 396        num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);
 397
 398        if (qp->kernel_verbs)
 399                qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
 400        else
 401                qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
 402
 403        if (qp->sendq == NULL) {
 404                siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
 405                rv = -ENOMEM;
 406                goto err_out_xa;
 407        }
 408        if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
 409                if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
 410                        qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
 411                else {
 412                        rv = -EINVAL;
 413                        goto err_out_xa;
 414                }
 415        }
 416        qp->pd = pd;
 417        qp->scq = scq;
 418        qp->rcq = rcq;
 419
 420        if (attrs->srq) {
 421                /*
 422                 * SRQ support.
 423                 * Verbs 6.3.7: ignore RQ size, if SRQ present
 424                 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
 425                 */
 426                qp->srq = to_siw_srq(attrs->srq);
 427                qp->attrs.rq_size = 0;
 428                siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->qp_num);
 429        } else if (num_rqe) {
 430                if (qp->kernel_verbs)
 431                        qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
 432                else
 433                        qp->recvq =
 434                                vmalloc_user(num_rqe * sizeof(struct siw_rqe));
 435
 436                if (qp->recvq == NULL) {
 437                        siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
 438                        rv = -ENOMEM;
 439                        goto err_out_xa;
 440                }
 441                qp->attrs.rq_size = num_rqe;
 442        }
 443        qp->attrs.sq_size = num_sqe;
 444        qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
 445        qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
 446
 447        /* Make those two tunables fixed for now. */
 448        qp->tx_ctx.gso_seg_limit = 1;
 449        qp->tx_ctx.zcopy_tx = zcopy_tx;
 450
 451        qp->attrs.state = SIW_QP_STATE_IDLE;
 452
 453        if (udata) {
 454                struct siw_uresp_create_qp uresp = {};
 455
 456                uresp.num_sqe = num_sqe;
 457                uresp.num_rqe = num_rqe;
 458                uresp.qp_id = qp_id(qp);
 459
 460                if (qp->sendq) {
 461                        qp->xa_sq_index =
 462                                siw_create_uobj(uctx, qp->sendq,
 463                                        num_sqe * sizeof(struct siw_sqe));
 464                }
 465                if (qp->recvq) {
 466                        qp->xa_rq_index =
 467                                 siw_create_uobj(uctx, qp->recvq,
 468                                        num_rqe * sizeof(struct siw_rqe));
 469                }
 470                if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
 471                    qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
 472                        rv = -ENOMEM;
 473                        goto err_out_xa;
 474                }
 475                uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
 476                uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;
 477
 478                if (udata->outlen < sizeof(uresp)) {
 479                        rv = -EINVAL;
 480                        goto err_out_xa;
 481                }
 482                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 483                if (rv)
 484                        goto err_out_xa;
 485        }
 486        qp->tx_cpu = siw_get_tx_cpu(sdev);
 487        if (qp->tx_cpu < 0) {
 488                rv = -EINVAL;
 489                goto err_out_xa;
 490        }
 491        INIT_LIST_HEAD(&qp->devq);
 492        spin_lock_irqsave(&sdev->lock, flags);
 493        list_add_tail(&qp->devq, &sdev->qp_list);
 494        spin_unlock_irqrestore(&sdev->lock, flags);
 495
 496        return qp->ib_qp;
 497
 498err_out_xa:
 499        xa_erase(&sdev->qp_xa, qp_id(qp));
 500err_out:
 501        kfree(siw_base_qp);
 502
 503        if (qp) {
 504                if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
 505                        kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
 506                if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
 507                        kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
 508
 509                vfree(qp->sendq);
 510                vfree(qp->recvq);
 511                kfree(qp);
 512        }
 513        atomic_dec(&sdev->num_qp);
 514
 515        return ERR_PTR(rv);
 516}
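
/*
 * Illustrative user-space sketch (not part of this file): creating an RC
 * QP through libibverbs with caps that satisfy the checks in
 * siw_create_qp() above (bounded by SIW_MAX_QP_WR, SIW_MAX_SGE and
 * SIW_MAX_INLINE). The helper name and the chosen sizes are assumptions.
 */
#include <infiniband/verbs.h>

static struct ibv_qp *create_rc_qp(struct ibv_pd *pd, struct ibv_cq *cq)
{
	struct ibv_qp_init_attr init = {
		.send_cq = cq,
		.recv_cq = cq,
		.qp_type = IBV_QPT_RC,
		.sq_sig_all = 0,	/* per-WR signaling: IB_SIGNAL_REQ_WR */
		.cap = {
			.max_send_wr = 64,
			.max_recv_wr = 64,
			.max_send_sge = 4,
			.max_recv_sge = 4,
			.max_inline_data = 64,
		},
	};

	/* returns NULL and sets errno on failure */
	return ibv_create_qp(pd, &init);
}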
 517
 518/*
 519 * Minimum siw_query_qp() verb interface.
 520 *
 521 * @qp_attr_mask is not used but all available information is provided
 522 */
 523int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
 524                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
 525{
 526        struct siw_qp *qp;
 527        struct siw_device *sdev;
 528
 529        if (base_qp && qp_attr && qp_init_attr) {
 530                qp = to_siw_qp(base_qp);
 531                sdev = to_siw_dev(base_qp->device);
 532        } else {
 533                return -EINVAL;
 534        }
 535        qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
 536        qp_attr->cap.max_send_wr = qp->attrs.sq_size;
 537        qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
 538        qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
 539        qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
 540        qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 541        qp_attr->max_rd_atomic = qp->attrs.irq_size;
 542        qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
 543
 544        qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
 545                                   IB_ACCESS_REMOTE_WRITE |
 546                                   IB_ACCESS_REMOTE_READ;
 547
 548        qp_init_attr->qp_type = base_qp->qp_type;
 549        qp_init_attr->send_cq = base_qp->send_cq;
 550        qp_init_attr->recv_cq = base_qp->recv_cq;
 551        qp_init_attr->srq = base_qp->srq;
 552
 553        qp_init_attr->cap = qp_attr->cap;
 554
 555        return 0;
 556}
 557
 558int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
 559                        int attr_mask, struct ib_udata *udata)
 560{
 561        struct siw_qp_attrs new_attrs;
 562        enum siw_qp_attr_mask siw_attr_mask = 0;
 563        struct siw_qp *qp = to_siw_qp(base_qp);
 564        int rv = 0;
 565
 566        if (!attr_mask)
 567                return 0;
 568
 569        memset(&new_attrs, 0, sizeof(new_attrs));
 570
 571        if (attr_mask & IB_QP_ACCESS_FLAGS) {
 572                siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
 573
 574                if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
 575                        new_attrs.flags |= SIW_RDMA_READ_ENABLED;
 576                if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
 577                        new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
 578                if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
 579                        new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
 580        }
 581        if (attr_mask & IB_QP_STATE) {
 582                siw_dbg_qp(qp, "desired IB QP state: %s\n",
 583                           ib_qp_state_to_string[attr->qp_state]);
 584
 585                new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
 586
 587                if (new_attrs.state > SIW_QP_STATE_RTS)
 588                        qp->tx_ctx.tx_suspend = 1;
 589
 590                siw_attr_mask |= SIW_QP_ATTR_STATE;
 591        }
 592        if (!siw_attr_mask)
 593                goto out;
 594
 595        down_write(&qp->state_lock);
 596
 597        rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
 598
 599        up_write(&qp->state_lock);
 600out:
 601        return rv;
 602}
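
/*
 * Illustrative user-space sketch (not part of this file): the most common
 * consumer use of this verb is forcing the QP into ERROR to flush
 * outstanding work; siw acts on IB_QP_STATE and IB_QP_ACCESS_FLAGS and
 * ignores other mask bits. The helper name is an assumption.
 */
#include <infiniband/verbs.h>

static int flush_qp(struct ibv_qp *qp)
{
	struct ibv_qp_attr attr = { .qp_state = IBV_QPS_ERR };

	return ibv_modify_qp(qp, &attr, IBV_QP_STATE);
}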
 603
 604int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
 605{
 606        struct siw_qp *qp = to_siw_qp(base_qp);
 607        struct siw_ucontext *uctx =
 608                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 609                                          base_ucontext);
 610        struct siw_qp_attrs qp_attrs;
 611
 612        siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
 613
 614        /*
  615         * Mark QP as being destroyed to prevent any further
  616         * async callbacks to the RDMA core
 617         */
 618        qp->attrs.flags |= SIW_QP_IN_DESTROY;
 619        qp->rx_stream.rx_suspend = 1;
 620
 621        if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
 622                kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
 623        if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
 624                kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
 625
 626        down_write(&qp->state_lock);
 627
 628        qp_attrs.state = SIW_QP_STATE_ERROR;
 629        siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
 630
 631        if (qp->cep) {
 632                siw_cep_put(qp->cep);
 633                qp->cep = NULL;
 634        }
 635        up_write(&qp->state_lock);
 636
 637        kfree(qp->tx_ctx.mpa_crc_hd);
 638        kfree(qp->rx_stream.mpa_crc_hd);
 639
 640        qp->scq = qp->rcq = NULL;
 641
 642        siw_qp_put(qp);
 643
 644        return 0;
 645}
 646
 647/*
 648 * siw_copy_inline_sgl()
 649 *
 650 * Prepare sgl of inlined data for sending. For userland callers
  651 * the function checks if the given buffer addresses and lengths are within
 652 * process context bounds.
 653 * Data from all provided sge's are copied together into the wqe,
 654 * referenced by a single sge.
 655 */
 656static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
 657                               struct siw_sqe *sqe)
 658{
 659        struct ib_sge *core_sge = core_wr->sg_list;
 660        void *kbuf = &sqe->sge[1];
 661        int num_sge = core_wr->num_sge, bytes = 0;
 662
 663        sqe->sge[0].laddr = (uintptr_t)kbuf;
 664        sqe->sge[0].lkey = 0;
 665
 666        while (num_sge--) {
 667                if (!core_sge->length) {
 668                        core_sge++;
 669                        continue;
 670                }
 671                bytes += core_sge->length;
 672                if (bytes > SIW_MAX_INLINE) {
 673                        bytes = -EINVAL;
 674                        break;
 675                }
 676                memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
 677                       core_sge->length);
 678
 679                kbuf += core_sge->length;
 680                core_sge++;
 681        }
 682        sqe->sge[0].length = bytes > 0 ? bytes : 0;
 683        sqe->num_sge = bytes > 0 ? 1 : 0;
 684
 685        return bytes;
 686}
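
/*
 * Illustrative user-space sketch (not part of this file): posting a SEND
 * with IBV_SEND_INLINE is what ends up in siw_copy_inline_sgl(); the
 * gather elements are copied into the SQE, so no lkey/MR is needed and
 * the buffers may be reused as soon as the post returns. The helper name
 * is an assumption; total length must stay <= max_inline_data.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int post_inline_send(struct ibv_qp *qp, void *hdr, uint32_t hdr_len,
			    void *payload, uint32_t pay_len)
{
	struct ibv_sge sge[2] = {
		{ .addr = (uintptr_t)hdr, .length = hdr_len },
		{ .addr = (uintptr_t)payload, .length = pay_len },
	};
	struct ibv_send_wr wr = {
		.wr_id = 1,
		.sg_list = sge,
		.num_sge = 2,
		.opcode = IBV_WR_SEND,
		.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED,
	};
	struct ibv_send_wr *bad_wr;

	return ibv_post_send(qp, &wr, &bad_wr);
}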
 687
 688/*
 689 * siw_post_send()
 690 *
 691 * Post a list of S-WR's to a SQ.
 692 *
 693 * @base_qp:    Base QP contained in siw QP
 694 * @wr:         Null terminated list of user WR's
 695 * @bad_wr:     Points to failing WR in case of synchronous failure.
 696 */
 697int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
 698                  const struct ib_send_wr **bad_wr)
 699{
 700        struct siw_qp *qp = to_siw_qp(base_qp);
 701        struct siw_wqe *wqe = tx_wqe(qp);
 702
 703        unsigned long flags;
 704        int rv = 0;
 705
 706        /*
 707         * Try to acquire QP state lock. Must be non-blocking
  708         * to accommodate kernel clients' needs.
 709         */
 710        if (!down_read_trylock(&qp->state_lock)) {
 711                *bad_wr = wr;
 712                siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
 713                return -ENOTCONN;
 714        }
 715        if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
 716                up_read(&qp->state_lock);
 717                *bad_wr = wr;
 718                siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
 719                return -ENOTCONN;
 720        }
 721        if (wr && !qp->kernel_verbs) {
 722                siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
 723                up_read(&qp->state_lock);
 724                *bad_wr = wr;
 725                return -EINVAL;
 726        }
 727        spin_lock_irqsave(&qp->sq_lock, flags);
 728
 729        while (wr) {
 730                u32 idx = qp->sq_put % qp->attrs.sq_size;
 731                struct siw_sqe *sqe = &qp->sendq[idx];
 732
 733                if (sqe->flags) {
 734                        siw_dbg_qp(qp, "sq full\n");
 735                        rv = -ENOMEM;
 736                        break;
 737                }
 738                if (wr->num_sge > qp->attrs.sq_max_sges) {
 739                        siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
 740                        rv = -EINVAL;
 741                        break;
 742                }
 743                sqe->id = wr->wr_id;
 744
 745                if ((wr->send_flags & IB_SEND_SIGNALED) ||
 746                    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
 747                        sqe->flags |= SIW_WQE_SIGNALLED;
 748
 749                if (wr->send_flags & IB_SEND_FENCE)
 750                        sqe->flags |= SIW_WQE_READ_FENCE;
 751
 752                switch (wr->opcode) {
 753                case IB_WR_SEND:
 754                case IB_WR_SEND_WITH_INV:
 755                        if (wr->send_flags & IB_SEND_SOLICITED)
 756                                sqe->flags |= SIW_WQE_SOLICITED;
 757
 758                        if (!(wr->send_flags & IB_SEND_INLINE)) {
 759                                siw_copy_sgl(wr->sg_list, sqe->sge,
 760                                             wr->num_sge);
 761                                sqe->num_sge = wr->num_sge;
 762                        } else {
 763                                rv = siw_copy_inline_sgl(wr, sqe);
 764                                if (rv <= 0) {
 765                                        rv = -EINVAL;
 766                                        break;
 767                                }
 768                                sqe->flags |= SIW_WQE_INLINE;
 769                                sqe->num_sge = 1;
 770                        }
 771                        if (wr->opcode == IB_WR_SEND)
 772                                sqe->opcode = SIW_OP_SEND;
 773                        else {
 774                                sqe->opcode = SIW_OP_SEND_REMOTE_INV;
 775                                sqe->rkey = wr->ex.invalidate_rkey;
 776                        }
 777                        break;
 778
 779                case IB_WR_RDMA_READ_WITH_INV:
 780                case IB_WR_RDMA_READ:
 781                        /*
 782                         * iWarp restricts RREAD sink to SGL containing
  783                         * 1 SGE only. We could relax to an SGL with multiple
  784                         * elements referring to the SAME ltag or even sending
 785                         * a private per-rreq tag referring to a checked
 786                         * local sgl with MULTIPLE ltag's.
 787                         */
 788                        if (unlikely(wr->num_sge != 1)) {
 789                                rv = -EINVAL;
 790                                break;
 791                        }
 792                        siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
 793                        /*
 794                         * NOTE: zero length RREAD is allowed!
 795                         */
 796                        sqe->raddr = rdma_wr(wr)->remote_addr;
 797                        sqe->rkey = rdma_wr(wr)->rkey;
 798                        sqe->num_sge = 1;
 799
 800                        if (wr->opcode == IB_WR_RDMA_READ)
 801                                sqe->opcode = SIW_OP_READ;
 802                        else
 803                                sqe->opcode = SIW_OP_READ_LOCAL_INV;
 804                        break;
 805
 806                case IB_WR_RDMA_WRITE:
 807                        if (!(wr->send_flags & IB_SEND_INLINE)) {
 808                                siw_copy_sgl(wr->sg_list, &sqe->sge[0],
 809                                             wr->num_sge);
 810                                sqe->num_sge = wr->num_sge;
 811                        } else {
 812                                rv = siw_copy_inline_sgl(wr, sqe);
 813                                if (unlikely(rv < 0)) {
 814                                        rv = -EINVAL;
 815                                        break;
 816                                }
 817                                sqe->flags |= SIW_WQE_INLINE;
 818                                sqe->num_sge = 1;
 819                        }
 820                        sqe->raddr = rdma_wr(wr)->remote_addr;
 821                        sqe->rkey = rdma_wr(wr)->rkey;
 822                        sqe->opcode = SIW_OP_WRITE;
 823                        break;
 824
 825                case IB_WR_REG_MR:
 826                        sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
 827                        sqe->rkey = reg_wr(wr)->key;
 828                        sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
 829                        sqe->opcode = SIW_OP_REG_MR;
 830                        break;
 831
 832                case IB_WR_LOCAL_INV:
 833                        sqe->rkey = wr->ex.invalidate_rkey;
 834                        sqe->opcode = SIW_OP_INVAL_STAG;
 835                        break;
 836
 837                default:
 838                        siw_dbg_qp(qp, "ib wr type %d unsupported\n",
 839                                   wr->opcode);
 840                        rv = -EINVAL;
 841                        break;
 842                }
 843                siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
 844                           sqe->opcode, sqe->flags,
 845                           (void *)(uintptr_t)sqe->id);
 846
 847                if (unlikely(rv < 0))
 848                        break;
 849
 850                /* make SQE only valid after completely written */
 851                smp_wmb();
 852                sqe->flags |= SIW_WQE_VALID;
 853
 854                qp->sq_put++;
 855                wr = wr->next;
 856        }
 857
 858        /*
 859         * Send directly if SQ processing is not in progress.
  860         * Any immediate errors (rv < 0) do not affect the involved
  861         * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
  862         * processing if new work is already pending, but rv must be
  863         * passed back to the caller.
 864         */
 865        if (wqe->wr_status != SIW_WR_IDLE) {
 866                spin_unlock_irqrestore(&qp->sq_lock, flags);
 867                goto skip_direct_sending;
 868        }
 869        rv = siw_activate_tx(qp);
 870        spin_unlock_irqrestore(&qp->sq_lock, flags);
 871
 872        if (rv <= 0)
 873                goto skip_direct_sending;
 874
 875        if (qp->kernel_verbs) {
 876                rv = siw_sq_start(qp);
 877        } else {
 878                qp->tx_ctx.in_syscall = 1;
 879
 880                if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
 881                        siw_qp_cm_drop(qp, 0);
 882
 883                qp->tx_ctx.in_syscall = 0;
 884        }
 885skip_direct_sending:
 886
 887        up_read(&qp->state_lock);
 888
 889        if (rv >= 0)
 890                return 0;
 891        /*
 892         * Immediate error
 893         */
 894        siw_dbg_qp(qp, "error %d\n", rv);
 895
 896        *bad_wr = wr;
 897        return rv;
 898}
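
/*
 * Stand-alone illustration (not part of this file) of the free-running
 * put/get indexing used above ('idx = qp->sq_put % qp->attrs.sq_size')
 * and the power-of-two sizing chosen in siw_create_qp(): occupancy and
 * slot index stay correct across unsigned counter wrap-around.
 */
#include <assert.h>
#include <limits.h>

#define QSIZE 8u			/* power of two, like num_sqe */

static unsigned int q_idx(unsigned int counter)
{
	return counter % QSIZE;		/* same as counter & (QSIZE - 1) */
}

int main(void)
{
	unsigned int put = UINT_MAX - 1;	/* producer about to wrap */
	unsigned int get = put - 3;		/* three entries outstanding */

	assert(put - get == 3);
	put += 2;				/* wraps around to 0 */
	assert(put - get == 5);
	assert(q_idx(put) == q_idx(get + 5));
	return 0;
}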
 899
 900/*
 901 * siw_post_receive()
 902 *
 903 * Post a list of R-WR's to a RQ.
 904 *
 905 * @base_qp:    Base QP contained in siw QP
 906 * @wr:         Null terminated list of user WR's
 907 * @bad_wr:     Points to failing WR in case of synchronous failure.
 908 */
 909int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
 910                     const struct ib_recv_wr **bad_wr)
 911{
 912        struct siw_qp *qp = to_siw_qp(base_qp);
 913        unsigned long flags;
 914        int rv = 0;
 915
 916        if (qp->srq) {
 917                *bad_wr = wr;
 918                return -EOPNOTSUPP; /* what else from errno.h? */
 919        }
 920        /*
 921         * Try to acquire QP state lock. Must be non-blocking
  922         * to accommodate kernel clients' needs.
 923         */
 924        if (!down_read_trylock(&qp->state_lock)) {
 925                *bad_wr = wr;
 926                return -ENOTCONN;
 927        }
 928        if (!qp->kernel_verbs) {
  929                siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
 930                up_read(&qp->state_lock);
 931                *bad_wr = wr;
 932                return -EINVAL;
 933        }
 934        if (qp->attrs.state > SIW_QP_STATE_RTS) {
 935                up_read(&qp->state_lock);
 936                *bad_wr = wr;
 937                return -EINVAL;
 938        }
 939        /*
 940         * Serialize potentially multiple producers.
 941         * Not needed for single threaded consumer side.
 942         */
 943        spin_lock_irqsave(&qp->rq_lock, flags);
 944
 945        while (wr) {
 946                u32 idx = qp->rq_put % qp->attrs.rq_size;
 947                struct siw_rqe *rqe = &qp->recvq[idx];
 948
 949                if (rqe->flags) {
 950                        siw_dbg_qp(qp, "RQ full\n");
 951                        rv = -ENOMEM;
 952                        break;
 953                }
 954                if (wr->num_sge > qp->attrs.rq_max_sges) {
 955                        siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
 956                        rv = -EINVAL;
 957                        break;
 958                }
 959                rqe->id = wr->wr_id;
 960                rqe->num_sge = wr->num_sge;
 961                siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
 962
 963                /* make sure RQE is completely written before valid */
 964                smp_wmb();
 965
 966                rqe->flags = SIW_WQE_VALID;
 967
 968                qp->rq_put++;
 969                wr = wr->next;
 970        }
 971        spin_unlock_irqrestore(&qp->rq_lock, flags);
 972
 973        up_read(&qp->state_lock);
 974
 975        if (rv < 0) {
 976                siw_dbg_qp(qp, "error %d\n", rv);
 977                *bad_wr = wr;
 978        }
 979        return rv > 0 ? 0 : rv;
 980}
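
/*
 * Illustrative user-space sketch (not part of this file): replenishing a
 * receive buffer with ibv_post_recv(). For user-mapped QPs the provider
 * library fills the mapped RQ directly; the kernel path above serves
 * kernel-level consumers. Helper name and parameters are assumptions.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int post_one_recv(struct ibv_qp *qp, uint64_t wr_id,
			 void *buf, uint32_t len, uint32_t lkey)
{
	struct ibv_sge sge = {
		.addr = (uintptr_t)buf,
		.length = len,
		.lkey = lkey,
	};
	struct ibv_recv_wr wr = {
		.wr_id = wr_id,
		.sg_list = &sge,
		.num_sge = 1,
	};
	struct ibv_recv_wr *bad_wr;

	return ibv_post_recv(qp, &wr, &bad_wr);
}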
 981
 982void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
 983{
 984        struct siw_cq *cq = to_siw_cq(base_cq);
 985        struct siw_device *sdev = to_siw_dev(base_cq->device);
 986        struct siw_ucontext *ctx =
 987                rdma_udata_to_drv_context(udata, struct siw_ucontext,
 988                                          base_ucontext);
 989
 990        siw_dbg_cq(cq, "free CQ resources\n");
 991
 992        siw_cq_flush(cq);
 993
 994        if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
 995                kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
 996
 997        atomic_dec(&sdev->num_cq);
 998
 999        vfree(cq->queue);
1000}
1001
1002/*
1003 * siw_create_cq()
1004 *
1005 * Populate CQ of requested size
1006 *
1007 * @base_cq: CQ as allocated by RDMA midlayer
1008 * @attr: Initial CQ attributes
1009 * @udata: relates to user context
1010 */
1011
1012int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1013                  struct ib_udata *udata)
1014{
1015        struct siw_device *sdev = to_siw_dev(base_cq->device);
1016        struct siw_cq *cq = to_siw_cq(base_cq);
1017        int rv, size = attr->cqe;
1018
1019        if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1020                siw_dbg(base_cq->device, "too many CQ's\n");
1021                rv = -ENOMEM;
1022                goto err_out;
1023        }
1024        if (size < 1 || size > sdev->attrs.max_cqe) {
1025                siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1026                rv = -EINVAL;
1027                goto err_out;
1028        }
1029        size = roundup_pow_of_two(size);
1030        cq->base_cq.cqe = size;
1031        cq->num_cqe = size;
1032        cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;
1033
1034        if (!udata) {
1035                cq->kernel_verbs = 1;
1036                cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1037                                    sizeof(struct siw_cq_ctrl));
1038        } else {
1039                cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1040                                         sizeof(struct siw_cq_ctrl));
1041        }
1042        if (cq->queue == NULL) {
1043                rv = -ENOMEM;
1044                goto err_out;
1045        }
1046        get_random_bytes(&cq->id, 4);
1047        siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1048
1049        spin_lock_init(&cq->lock);
1050
1051        cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1052
1053        if (udata) {
1054                struct siw_uresp_create_cq uresp = {};
1055                struct siw_ucontext *ctx =
1056                        rdma_udata_to_drv_context(udata, struct siw_ucontext,
1057                                                  base_ucontext);
1058
1059                cq->xa_cq_index =
1060                        siw_create_uobj(ctx, cq->queue,
1061                                        size * sizeof(struct siw_cqe) +
1062                                                sizeof(struct siw_cq_ctrl));
1063                if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
1064                        rv = -ENOMEM;
1065                        goto err_out;
1066                }
1067                uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
1068                uresp.cq_id = cq->id;
1069                uresp.num_cqe = size;
1070
1071                if (udata->outlen < sizeof(uresp)) {
1072                        rv = -EINVAL;
1073                        goto err_out;
1074                }
1075                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1076                if (rv)
1077                        goto err_out;
1078        }
1079        return 0;
1080
1081err_out:
 1082        siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
1083
1084        if (cq && cq->queue) {
1085                struct siw_ucontext *ctx =
1086                        rdma_udata_to_drv_context(udata, struct siw_ucontext,
1087                                                  base_ucontext);
1088                if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
1089                        kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
1090                vfree(cq->queue);
1091        }
1092        atomic_dec(&sdev->num_cq);
1093
1094        return rv;
1095}
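
/*
 * Illustrative user-space sketch (not part of this file): creating a CQ
 * with a completion channel; the requested size must stay within
 * sdev->attrs.max_cqe and is rounded up to a power of two above.
 * The helper name and the size of 256 are assumptions.
 */
#include <infiniband/verbs.h>

static struct ibv_cq *create_cq_with_channel(struct ibv_context *ctx,
					     struct ibv_comp_channel **chan)
{
	*chan = ibv_create_comp_channel(ctx);
	if (!*chan)
		return NULL;

	return ibv_create_cq(ctx, 256, NULL, *chan, 0);
}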
1096
1097/*
1098 * siw_poll_cq()
1099 *
1100 * Reap CQ entries if available and copy work completion status into
1101 * array of WC's provided by caller. Returns number of reaped CQE's.
1102 *
1103 * @base_cq:    Base CQ contained in siw CQ.
1104 * @num_cqe:    Maximum number of CQE's to reap.
1105 * @wc:         Array of work completions to be filled by siw.
1106 */
1107int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1108{
1109        struct siw_cq *cq = to_siw_cq(base_cq);
1110        int i;
1111
1112        for (i = 0; i < num_cqe; i++) {
1113                if (!siw_reap_cqe(cq, wc))
1114                        break;
1115                wc++;
1116        }
1117        return i;
1118}
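
/*
 * Illustrative user-space sketch (not part of this file): the consumer
 * side of siw_poll_cq(), reaping completions in batches and checking
 * per-WC status. Helper name and batch size are assumptions.
 */
#include <infiniband/verbs.h>

static int reap_completions(struct ibv_cq *cq, int budget)
{
	struct ibv_wc wc[16];
	int done = 0;

	while (budget > 0) {
		int i, n = ibv_poll_cq(cq, budget < 16 ? budget : 16, wc);

		if (n < 0)
			return n;		/* poll error */
		if (n == 0)
			break;			/* CQ drained */

		for (i = 0; i < n; i++)
			if (wc[i].status == IBV_WC_SUCCESS)
				done++;
		budget -= n;
	}
	return done;
}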
1119
1120/*
1121 * siw_req_notify_cq()
1122 *
1123 * Request notification for new CQE's added to that CQ.
1124 * Defined flags:
1125 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1126 *   event if a WQE with notification flag set enters the CQ
1127 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1128 *   event if a WQE enters the CQ.
1129 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
1130 *   number of not reaped CQE's regardless of its notification
1131 *   type and current or new CQ notification settings.
1132 *
1133 * @base_cq:    Base CQ contained in siw CQ.
1134 * @flags:      Requested notification flags.
1135 */
1136int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1137{
1138        struct siw_cq *cq = to_siw_cq(base_cq);
1139
1140        siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1141
1142        if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1143                /*
1144                 * Enable CQ event for next solicited completion.
1145                 * and make it visible to all associated producers.
1146                 */
1147                smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1148        else
1149                /*
1150                 * Enable CQ event for any signalled completion.
1151                 * and make it visible to all associated producers.
1152                 */
1153                smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1154
1155        if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1156                return cq->cq_put - cq->cq_get;
1157
1158        return 0;
1159}
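
/*
 * Illustrative user-space sketch (not part of this file): the usual
 * arm/wait/ack sequence that drives siw_req_notify_cq(); passing
 * solicited_only = 0 corresponds to SIW_NOTIFY_ALL above. The helper
 * name is an assumption.
 */
#include <infiniband/verbs.h>

static int wait_for_cq_event(struct ibv_comp_channel *chan, struct ibv_cq *cq)
{
	struct ibv_cq *ev_cq;
	void *ev_ctx;
	int rv;

	rv = ibv_req_notify_cq(cq, 0);			/* arm */
	if (rv)
		return rv;

	rv = ibv_get_cq_event(chan, &ev_cq, &ev_ctx);	/* block */
	if (rv)
		return rv;

	ibv_ack_cq_events(ev_cq, 1);

	/* re-arm before draining to avoid missing a notification */
	return ibv_req_notify_cq(ev_cq, 0);
}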
1160
1161/*
1162 * siw_dereg_mr()
1163 *
1164 * Release Memory Region.
1165 *
1166 * @base_mr: Base MR contained in siw MR.
1167 * @udata: points to user context, unused.
1168 */
1169int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1170{
1171        struct siw_mr *mr = to_siw_mr(base_mr);
1172        struct siw_device *sdev = to_siw_dev(base_mr->device);
1173
1174        siw_dbg_mem(mr->mem, "deregister MR\n");
1175
1176        atomic_dec(&sdev->num_mr);
1177
1178        siw_mr_drop_mem(mr);
1179        kfree_rcu(mr, rcu);
1180
1181        return 0;
1182}
1183
1184/*
1185 * siw_reg_user_mr()
1186 *
1187 * Register Memory Region.
1188 *
1189 * @pd:         Protection Domain
1190 * @start:      starting address of MR (virtual address)
1191 * @len:        len of MR
1192 * @rnic_va:    not used by siw
1193 * @rights:     MR access rights
1194 * @udata:      user buffer to communicate STag and Key.
1195 */
1196struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1197                              u64 rnic_va, int rights, struct ib_udata *udata)
1198{
1199        struct siw_mr *mr = NULL;
1200        struct siw_umem *umem = NULL;
1201        struct siw_ureq_reg_mr ureq;
1202        struct siw_device *sdev = to_siw_dev(pd->device);
1203
1204        unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
1205        int rv;
1206
1207        siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1208                   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1209                   (unsigned long long)len);
1210
1211        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1212                siw_dbg_pd(pd, "too many mr's\n");
1213                rv = -ENOMEM;
1214                goto err_out;
1215        }
1216        if (!len) {
1217                rv = -EINVAL;
1218                goto err_out;
1219        }
1220        if (mem_limit != RLIM_INFINITY) {
1221                unsigned long num_pages =
1222                        (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
1223                mem_limit >>= PAGE_SHIFT;
1224
1225                if (num_pages > mem_limit - current->mm->locked_vm) {
1226                        siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
1227                                   num_pages, mem_limit,
1228                                   current->mm->locked_vm);
1229                        rv = -ENOMEM;
1230                        goto err_out;
1231                }
1232        }
1233        umem = siw_umem_get(start, len, ib_access_writable(rights));
1234        if (IS_ERR(umem)) {
1235                rv = PTR_ERR(umem);
1236                siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1237                umem = NULL;
1238                goto err_out;
1239        }
1240        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1241        if (!mr) {
1242                rv = -ENOMEM;
1243                goto err_out;
1244        }
1245        rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1246        if (rv)
1247                goto err_out;
1248
1249        if (udata) {
1250                struct siw_uresp_reg_mr uresp = {};
1251                struct siw_mem *mem = mr->mem;
1252
1253                if (udata->inlen < sizeof(ureq)) {
1254                        rv = -EINVAL;
1255                        goto err_out;
1256                }
1257                rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1258                if (rv)
1259                        goto err_out;
1260
1261                mr->base_mr.lkey |= ureq.stag_key;
1262                mr->base_mr.rkey |= ureq.stag_key;
1263                mem->stag |= ureq.stag_key;
1264                uresp.stag = mem->stag;
1265
1266                if (udata->outlen < sizeof(uresp)) {
1267                        rv = -EINVAL;
1268                        goto err_out;
1269                }
1270                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1271                if (rv)
1272                        goto err_out;
1273        }
1274        mr->mem->stag_valid = 1;
1275
1276        return &mr->base_mr;
1277
1278err_out:
1279        atomic_dec(&sdev->num_mr);
1280        if (mr) {
1281                if (mr->mem)
1282                        siw_mr_drop_mem(mr);
1283                kfree_rcu(mr, rcu);
1284        } else {
1285                if (umem)
1286                        siw_umem_release(umem, false);
1287        }
1288        return ERR_PTR(rv);
1289}
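
/*
 * Illustrative user-space sketch (not part of this file): registering a
 * buffer with ibv_reg_mr(), which lands in siw_reg_user_mr() above; the
 * pinned length is what is accounted against RLIMIT_MEMLOCK. The helper
 * name and access rights are assumptions.
 */
#include <stdlib.h>
#include <infiniband/verbs.h>

static struct ibv_mr *reg_rdma_buffer(struct ibv_pd *pd, size_t len)
{
	void *buf = calloc(1, len);

	if (!buf)
		return NULL;

	return ibv_reg_mr(pd, buf, len,
			  IBV_ACCESS_LOCAL_WRITE |
			  IBV_ACCESS_REMOTE_READ |
			  IBV_ACCESS_REMOTE_WRITE);
}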
1290
1291struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1292                           u32 max_sge, struct ib_udata *udata)
1293{
1294        struct siw_device *sdev = to_siw_dev(pd->device);
1295        struct siw_mr *mr = NULL;
1296        struct siw_pbl *pbl = NULL;
1297        int rv;
1298
1299        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1300                siw_dbg_pd(pd, "too many mr's\n");
1301                rv = -ENOMEM;
1302                goto err_out;
1303        }
1304        if (mr_type != IB_MR_TYPE_MEM_REG) {
1305                siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1306                rv = -EOPNOTSUPP;
1307                goto err_out;
1308        }
1309        if (max_sge > SIW_MAX_SGE_PBL) {
1310                siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1311                rv = -ENOMEM;
1312                goto err_out;
1313        }
1314        pbl = siw_pbl_alloc(max_sge);
1315        if (IS_ERR(pbl)) {
1316                rv = PTR_ERR(pbl);
1317                siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1318                pbl = NULL;
1319                goto err_out;
1320        }
1321        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1322        if (!mr) {
1323                rv = -ENOMEM;
1324                goto err_out;
1325        }
1326        rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1327        if (rv)
1328                goto err_out;
1329
1330        mr->mem->is_pbl = 1;
1331
1332        siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1333
1334        return &mr->base_mr;
1335
1336err_out:
1337        atomic_dec(&sdev->num_mr);
1338
1339        if (!mr) {
1340                kfree(pbl);
1341        } else {
1342                if (mr->mem)
1343                        siw_mr_drop_mem(mr);
1344                kfree_rcu(mr, rcu);
1345        }
1346        siw_dbg_pd(pd, "failed: %d\n", rv);
1347
1348        return ERR_PTR(rv);
1349}
1350
1351/* Just used to count number of pages being mapped */
1352static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1353{
1354        return 0;
1355}
1356
1357int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1358                  unsigned int *sg_off)
1359{
1360        struct scatterlist *slp;
1361        struct siw_mr *mr = to_siw_mr(base_mr);
1362        struct siw_mem *mem = mr->mem;
1363        struct siw_pbl *pbl = mem->pbl;
1364        struct siw_pble *pble;
1365        unsigned long pbl_size;
1366        int i, rv;
1367
1368        if (!pbl) {
1369                siw_dbg_mem(mem, "no PBL allocated\n");
1370                return -EINVAL;
1371        }
1372        pble = pbl->pbe;
1373
1374        if (pbl->max_buf < num_sle) {
1375                siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
 1376                            num_sle, mem->pbl->max_buf);
1377                return -ENOMEM;
1378        }
1379        for_each_sg(sl, slp, num_sle, i) {
1380                if (sg_dma_len(slp) == 0) {
1381                        siw_dbg_mem(mem, "empty SGE\n");
1382                        return -EINVAL;
1383                }
1384                if (i == 0) {
1385                        pble->addr = sg_dma_address(slp);
1386                        pble->size = sg_dma_len(slp);
1387                        pble->pbl_off = 0;
1388                        pbl_size = pble->size;
1389                        pbl->num_buf = 1;
1390                } else {
1391                        /* Merge PBL entries if adjacent */
1392                        if (pble->addr + pble->size == sg_dma_address(slp)) {
1393                                pble->size += sg_dma_len(slp);
1394                        } else {
1395                                pble++;
1396                                pbl->num_buf++;
1397                                pble->addr = sg_dma_address(slp);
1398                                pble->size = sg_dma_len(slp);
1399                                pble->pbl_off = pbl_size;
1400                        }
1401                        pbl_size += sg_dma_len(slp);
1402                }
1403                siw_dbg_mem(mem,
1404                        "sge[%d], size %u, addr 0x%p, total %lu\n",
1405                        i, pble->size, (void *)(uintptr_t)pble->addr,
1406                        pbl_size);
1407        }
1408        rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1409        if (rv > 0) {
1410                mem->len = base_mr->length;
1411                mem->va = base_mr->iova;
1412                siw_dbg_mem(mem,
1413                        "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1414                        mem->len, (void *)(uintptr_t)mem->va, num_sle,
1415                        pbl->num_buf);
1416        }
1417        return rv;
1418}
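
/*
 * Hedged kernel-consumer sketch (not from this driver): the fast
 * registration flow that reaches siw_map_mr_sg(): allocate an MR, map an
 * SG list, then post IB_WR_REG_MR, which siw_post_send() turns into
 * SIW_OP_REG_MR. Error unwinding and MR lifetime are omitted.
 */
#include <linux/err.h>
#include <rdma/ib_verbs.h>

static int fastreg_sketch(struct ib_qp *qp, struct ib_pd *pd,
			  struct scatterlist *sgl, int nents)
{
	const struct ib_send_wr *bad_wr;
	struct ib_reg_wr reg = {};
	struct ib_mr *mr;
	int n;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
	if (n < nents)
		return n < 0 ? n : -EINVAL;

	reg.wr.opcode = IB_WR_REG_MR;
	reg.wr.send_flags = IB_SEND_SIGNALED;
	reg.mr = mr;
	reg.key = mr->rkey;
	reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;

	return ib_post_send(qp, &reg.wr, &bad_wr);
}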
1419
1420/*
1421 * siw_get_dma_mr()
1422 *
 1423 * Create an (empty) DMA memory region, where no umem is attached.
1424 */
1425struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1426{
1427        struct siw_device *sdev = to_siw_dev(pd->device);
1428        struct siw_mr *mr = NULL;
1429        int rv;
1430
1431        if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1432                siw_dbg_pd(pd, "too many mr's\n");
1433                rv = -ENOMEM;
1434                goto err_out;
1435        }
1436        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1437        if (!mr) {
1438                rv = -ENOMEM;
1439                goto err_out;
1440        }
1441        rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1442        if (rv)
1443                goto err_out;
1444
1445        mr->mem->stag_valid = 1;
1446
1447        siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1448
1449        return &mr->base_mr;
1450
1451err_out:
1452        if (rv)
1453                kfree(mr);
1454
1455        atomic_dec(&sdev->num_mr);
1456
1457        return ERR_PTR(rv);
1458}
1459
1460/*
1461 * siw_create_srq()
1462 *
1463 * Create Shared Receive Queue of attributes @init_attrs
1464 * within protection domain given by @pd.
1465 *
1466 * @base_srq:   Base SRQ contained in siw SRQ.
1467 * @init_attrs: SRQ init attributes.
1468 * @udata:      points to user context
1469 */
1470int siw_create_srq(struct ib_srq *base_srq,
1471                   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1472{
1473        struct siw_srq *srq = to_siw_srq(base_srq);
1474        struct ib_srq_attr *attrs = &init_attrs->attr;
1475        struct siw_device *sdev = to_siw_dev(base_srq->device);
1476        struct siw_ucontext *ctx =
1477                rdma_udata_to_drv_context(udata, struct siw_ucontext,
1478                                          base_ucontext);
1479        int rv;
1480
1481        if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1482                siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
1483                rv = -ENOMEM;
1484                goto err_out;
1485        }
1486        if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1487            attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1488                rv = -EINVAL;
1489                goto err_out;
1490        }
1491        srq->max_sge = attrs->max_sge;
1492        srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1493        srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
1494        srq->limit = attrs->srq_limit;
1495        if (srq->limit)
1496                srq->armed = 1;
1497
1498        srq->kernel_verbs = !udata;
1499
1500        if (udata)
1501                srq->recvq =
1502                        vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1503        else
1504                srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
1505
1506        if (srq->recvq == NULL) {
1507                rv = -ENOMEM;
1508                goto err_out;
1509        }
1510        if (udata) {
1511                struct siw_uresp_create_srq uresp = {};
1512
1513                srq->xa_srq_index = siw_create_uobj(
1514                        ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));
1515
1516                if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
1517                        rv = -ENOMEM;
1518                        goto err_out;
1519                }
1520                uresp.srq_key = srq->xa_srq_index;
1521                uresp.num_rqe = srq->num_rqe;
1522
1523                if (udata->outlen < sizeof(uresp)) {
1524                        rv = -EINVAL;
1525                        goto err_out;
1526                }
1527                rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1528                if (rv)
1529                        goto err_out;
1530        }
1531        spin_lock_init(&srq->lock);
1532
1533        siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1534
1535        return 0;
1536
1537err_out:
1538        if (srq->recvq) {
1539                if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
1540                        kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
1541                vfree(srq->recvq);
1542        }
1543        atomic_dec(&sdev->num_srq);
1544
1545        return rv;
1546}
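
/*
 * Editor's note (illustrative sketch, not part of the siw driver): a kernel
 * consumer reaches siw_create_srq() through the ib_create_srq() verb.  The
 * sizes below are hypothetical example values.
 */
static struct ib_srq *example_create_srq(struct ib_pd *pd)
{
        struct ib_srq_init_attr init = {
                .srq_type = IB_SRQT_BASIC,
                .attr = {
                        .max_wr = 256,          /* 1..SIW_MAX_SRQ_WR */
                        .max_sge = 2,           /* <= SIW_MAX_SGE */
                        .srq_limit = 16,        /* non-zero arms the limit event */
                },
        };

        /* returns ERR_PTR() on failure, e.g. -EINVAL for bad attributes */
        return ib_create_srq(pd, &init);
}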
1547
1548/*
1549 * siw_modify_srq()
1550 *
1551 * Modify SRQ. The caller may set/reset the notification limit and
1552 * (re)arm the IB_EVENT_SRQ_LIMIT_REACHED event; SRQ resize is not supported.
1553 *
1554 * NOTE: It is unclear whether the RDMA core allows changing the MAX_SGE
1555 * parameter, so siw_modify_srq() does not check attrs->max_sge.
1556 */
1557int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1558                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1559{
1560        struct siw_srq *srq = to_siw_srq(base_srq);
1561        unsigned long flags;
1562        int rv = 0;
1563
1564        spin_lock_irqsave(&srq->lock, flags);
1565
1566        if (attr_mask & IB_SRQ_MAX_WR) {
1567                /* resize request not yet supported */
1568                rv = -EOPNOTSUPP;
1569                goto out;
1570        }
1571        if (attr_mask & IB_SRQ_LIMIT) {
1572                if (attrs->srq_limit) {
1573                        if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1574                                rv = -EINVAL;
1575                                goto out;
1576                        }
1577                        srq->armed = 1;
1578                } else {
1579                        srq->armed = 0;
1580                }
1581                srq->limit = attrs->srq_limit;
1582        }
1583out:
1584        spin_unlock_irqrestore(&srq->lock, flags);
1585
1586        return rv;
1587}
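
/*
 * Editor's note (illustrative sketch, not part of the siw driver): re-arming
 * the SRQ limit from a kernel consumer.  Only IB_SRQ_LIMIT is honoured here;
 * IB_SRQ_MAX_WR (resize) is rejected with -EOPNOTSUPP as seen above.
 * 'limit' must not exceed the SRQ's ring size.
 */
static int example_rearm_srq_limit(struct ib_srq *srq, u32 limit)
{
        struct ib_srq_attr attr = { .srq_limit = limit };

        /* limit == 0 disarms the IB_EVENT_SRQ_LIMIT_REACHED notification */
        return ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
}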
1588
1589/*
1590 * siw_query_srq()
1591 *
1592 * Query SRQ attributes.
1593 */
1594int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1595{
1596        struct siw_srq *srq = to_siw_srq(base_srq);
1597        unsigned long flags;
1598
1599        spin_lock_irqsave(&srq->lock, flags);
1600
1601        attrs->max_wr = srq->num_rqe;
1602        attrs->max_sge = srq->max_sge;
1603        attrs->srq_limit = srq->limit;
1604
1605        spin_unlock_irqrestore(&srq->lock, flags);
1606
1607        return 0;
1608}
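
/*
 * Editor's note (illustrative sketch, not part of the siw driver): reading
 * the attributes back through the ib_query_srq() verb.  Note that max_wr
 * reports the rounded-up ring size (num_rqe), not the originally requested
 * value.
 */
static u32 example_srq_depth(struct ib_srq *srq)
{
        struct ib_srq_attr attr;

        if (ib_query_srq(srq, &attr))
                return 0;

        return attr.max_wr;     /* == srq->num_rqe for siw */
}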
1609
1610/*
1611 * siw_destroy_srq()
1612 *
1613 * Destroy SRQ.
1614 * It is assumed that the SRQ is no longer referenced by any QP;
1615 * the code trusts the RDMA core environment to keep track of
1616 * QP references.
1617 */
1618void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1619{
1620        struct siw_srq *srq = to_siw_srq(base_srq);
1621        struct siw_device *sdev = to_siw_dev(base_srq->device);
1622        struct siw_ucontext *ctx =
1623                rdma_udata_to_drv_context(udata, struct siw_ucontext,
1624                                          base_ucontext);
1625
1626        if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
1627                kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
1628
1629        vfree(srq->recvq);
1630        atomic_dec(&sdev->num_srq);
1631}
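
/*
 * Editor's note (illustrative sketch, not part of the siw driver): as the
 * comment above states, all QPs using the SRQ must be gone before the SRQ
 * is destroyed, so a consumer tears down in this order.  Names below are
 * hypothetical.
 */
static void example_teardown(struct ib_qp *qp, struct ib_srq *srq)
{
        ib_destroy_qp(qp);      /* drops the QP's reference on the SRQ */
        ib_destroy_srq(srq);    /* now safe: no QP references remain */
}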
1632
1633/*
1634 * siw_post_srq_recv()
1635 *
1636 * Post a list of receive queue elements to the SRQ.
1637 * NOTE: The function does not check or serialize against a particular
1638 *       SRQ state during the post operation; it simply trusts the
1639 *       RDMA core environment.
1640 *
1641 * @base_srq:   Base SRQ contained in siw SRQ
1642 * @wr:         List of receive WRs to post
1643 * @bad_wr:     Updated to the failing WR if posting fails.
1644 */
1645int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1646                      const struct ib_recv_wr **bad_wr)
1647{
1648        struct siw_srq *srq = to_siw_srq(base_srq);
1649        unsigned long flags;
1650        int rv = 0;
1651
1652        if (unlikely(!srq->kernel_verbs)) {
1653                siw_dbg_pd(base_srq->pd,
1654                           "[SRQ]: no kernel post_recv for mapped srq\n");
1655                rv = -EINVAL;
1656                goto out;
1657        }
1658        /*
1659         * Serialize potentially multiple producers.
1660         * Also needed to serialize potentially multiple
1661         * consumers.
1662         */
1663        spin_lock_irqsave(&srq->lock, flags);
1664
1665        while (wr) {
1666                u32 idx = srq->rq_put % srq->num_rqe;
1667                struct siw_rqe *rqe = &srq->recvq[idx];
1668
1669                if (rqe->flags) {
1670                        siw_dbg_pd(base_srq->pd, "SRQ full\n");
1671                        rv = -ENOMEM;
1672                        break;
1673                }
1674                if (unlikely(wr->num_sge > srq->max_sge)) {
1675                        siw_dbg_pd(base_srq->pd,
1676                                   "[SRQ]: too many sge's: %d\n", wr->num_sge);
1677                        rv = -EINVAL;
1678                        break;
1679                }
1680                rqe->id = wr->wr_id;
1681                rqe->num_sge = wr->num_sge;
1682                siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1683
1684                /* Make sure S-RQE is completely written before valid */
1685                smp_wmb();
1686
1687                rqe->flags = SIW_WQE_VALID;
1688
1689                srq->rq_put++;
1690                wr = wr->next;
1691        }
1692        spin_unlock_irqrestore(&srq->lock, flags);
1693out:
1694        if (unlikely(rv < 0)) {
1695                siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1696                *bad_wr = wr;
1697        }
1698        return rv;
1699}
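
/*
 * Editor's note (illustrative sketch, not part of the siw driver): posting
 * a single receive buffer to the SRQ from a kernel consumer.  'dma_addr' is
 * assumed to be a DMA-mapped buffer and 'pd' the PD the SRQ belongs to;
 * the names are hypothetical.
 */
static int example_post_one_recv(struct ib_pd *pd, struct ib_srq *srq,
                                 u64 wr_id, u64 dma_addr, u32 length)
{
        struct ib_sge sge = {
                .addr = dma_addr,
                .length = length,
                .lkey = pd->local_dma_lkey,
        };
        struct ib_recv_wr wr = {
                .wr_id = wr_id,
                .sg_list = &sge,
                .num_sge = 1,
        };
        const struct ib_recv_wr *bad_wr;

        /* ends up in siw_post_srq_recv(); -ENOMEM means the ring is full */
        return ib_post_srq_recv(srq, &wr, &bad_wr);
}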
1700
1701void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1702{
1703        struct ib_event event;
1704        struct ib_qp *base_qp = qp->ib_qp;
1705
1706        /*
1707         * Do not report asynchronous errors on a QP which is being
1708         * destroyed via the verbs interface (siw_destroy_qp()).
1709         */
1710        if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1711                return;
1712
1713        event.event = etype;
1714        event.device = base_qp->device;
1715        event.element.qp = base_qp;
1716
1717        if (base_qp->event_handler) {
1718                siw_dbg_qp(qp, "reporting event %d\n", etype);
1719                base_qp->event_handler(&event, base_qp->qp_context);
1720        }
1721}
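
/*
 * Editor's note (illustrative sketch, not part of the siw driver): the
 * handler invoked by siw_qp_event() is whatever the consumer installed in
 * ib_qp_init_attr.event_handler at QP creation time; the handler given to
 * ib_create_cq() is dispatched the same way by siw_cq_event() below.
 * The handler below is hypothetical.
 */
static void example_qp_event_handler(struct ib_event *event, void *qp_context)
{
        /* e.g. IB_EVENT_QP_FATAL or IB_EVENT_COMM_EST */
        pr_info("QP event: %s\n", ib_event_msg(event->event));
}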
1722
1723void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1724{
1725        struct ib_event event;
1726        struct ib_cq *base_cq = &cq->base_cq;
1727
1728        event.event = etype;
1729        event.device = base_cq->device;
1730        event.element.cq = base_cq;
1731
1732        if (base_cq->event_handler) {
1733                siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1734                base_cq->event_handler(&event, base_cq->cq_context);
1735        }
1736}
1737
1738void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1739{
1740        struct ib_event event;
1741        struct ib_srq *base_srq = &srq->base_srq;
1742
1743        event.event = etype;
1744        event.device = base_srq->device;
1745        event.element.srq = base_srq;
1746
1747        if (base_srq->event_handler) {
1748                siw_dbg_pd(srq->base_srq.pd,
1749                           "reporting SRQ event %d\n", etype);
1750                base_srq->event_handler(&event, base_srq->srq_context);
1751        }
1752}
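
/*
 * Editor's note (illustrative sketch, not part of the siw driver): a
 * consumer-side SRQ event handler.  siw_srq_event() delivers
 * IB_EVENT_SRQ_LIMIT_REACHED once an armed SRQ drains below its limit;
 * a typical reaction is to replenish receive buffers and then re-arm.
 * The limit value below is a hypothetical example.
 */
static void example_srq_event_handler(struct ib_event *event, void *srq_context)
{
        struct ib_srq_attr attr = { .srq_limit = 16 };

        if (event->event != IB_EVENT_SRQ_LIMIT_REACHED)
                return;

        /* replenish buffers here (e.g. via ib_post_srq_recv()), then re-arm */
        ib_modify_srq(event->element.srq, &attr, IB_SRQ_LIMIT);
}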
1753
1754void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
1755{
1756        struct ib_event event;
1757
1758        event.event = etype;
1759        event.device = &sdev->base_dev;
1760        event.element.port_num = port;
1761
1762        siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1763
1764        ib_dispatch_event(&event);
1765}
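
/*
 * Editor's note (illustrative sketch, not part of the siw driver): port
 * events are not bound to a consumer object, so a consumer catches them
 * with a device-wide handler registered with the RDMA core, which
 * ib_dispatch_event() above fans out to.  Names below are hypothetical.
 */
static void example_port_event_cb(struct ib_event_handler *handler,
                                  struct ib_event *event)
{
        if (event->event == IB_EVENT_PORT_ACTIVE ||
            event->event == IB_EVENT_PORT_ERR)
                pr_info("port %d: %s\n", event->element.port_num,
                        ib_event_msg(event->event));
}

static struct ib_event_handler example_handler;

static void example_watch_port_events(struct ib_device *dev)
{
        INIT_IB_EVENT_HANDLER(&example_handler, dev, example_port_event_cb);
        ib_register_event_handler(&example_handler);
}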
1766