linux/net/sunrpc/xprtrdma/verbs.c
   1/*
   2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the BSD-type
   8 * license below:
   9 *
  10 * Redistribution and use in source and binary forms, with or without
  11 * modification, are permitted provided that the following conditions
  12 * are met:
  13 *
  14 *      Redistributions of source code must retain the above copyright
  15 *      notice, this list of conditions and the following disclaimer.
  16 *
  17 *      Redistributions in binary form must reproduce the above
  18 *      copyright notice, this list of conditions and the following
  19 *      disclaimer in the documentation and/or other materials provided
  20 *      with the distribution.
  21 *
  22 *      Neither the name of the Network Appliance, Inc. nor the names of
  23 *      its contributors may be used to endorse or promote products
  24 *      derived from this software without specific prior written
  25 *      permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38 */
  39
  40/*
  41 * verbs.c
  42 *
  43 * Encapsulates the major functions managing:
  44 *  o adapters
  45 *  o endpoints
  46 *  o connections
  47 *  o buffer memory
  48 */
  49
  50#include <linux/pci.h>  /* for Tavor hack below */
  51#include <linux/slab.h>
  52
  53#include "xprt_rdma.h"
  54
  55/*
  56 * Globals/Macros
  57 */
  58
  59#ifdef RPC_DEBUG
  60# define RPCDBG_FACILITY        RPCDBG_TRANS
  61#endif
  62
  63/*
  64 * internal functions
  65 */
  66
  67/*
  68 * handle replies in tasklet context, using a single, global list
  69 * rdma tasklet function -- just turn around and call the func
  70 * for all replies on the list
  71 */
  72
  73static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
  74static LIST_HEAD(rpcrdma_tasklets_g);
  75
  76static void
  77rpcrdma_run_tasklet(unsigned long data)
  78{
  79        struct rpcrdma_rep *rep;
  80        void (*func)(struct rpcrdma_rep *);
  81        unsigned long flags;
  82
   83        data = data;    /* the tasklet argument is unused */
  84        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
  85        while (!list_empty(&rpcrdma_tasklets_g)) {
  86                rep = list_entry(rpcrdma_tasklets_g.next,
  87                                 struct rpcrdma_rep, rr_list);
  88                list_del(&rep->rr_list);
  89                func = rep->rr_func;
  90                rep->rr_func = NULL;
  91                spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
  92
  93                if (func)
  94                        func(rep);
  95                else
  96                        rpcrdma_recv_buffer_put(rep);
  97
  98                spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
  99        }
 100        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
 101}
 102
 103static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
 104
 105static inline void
 106rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
 107{
 108        unsigned long flags;
 109
 110        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
 111        list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
 112        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
 113        tasklet_schedule(&rpcrdma_tasklet_g);
 114}
 115
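     /*
      * QP and CQ async error upcalls, below: a fatal adapter event marks the
      * endpoint broken (-EIO), notifies the transport through ->rep_func, and
      * wakes anyone blocked on rep_connect_wait.
      */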
 116static void
 117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 118{
 119        struct rpcrdma_ep *ep = context;
 120
 121        dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
 122                __func__, event->event, event->device->name, context);
 123        if (ep->rep_connected == 1) {
 124                ep->rep_connected = -EIO;
 125                ep->rep_func(ep);
 126                wake_up_all(&ep->rep_connect_wait);
 127        }
 128}
 129
 130static void
 131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
 132{
 133        struct rpcrdma_ep *ep = context;
 134
 135        dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
 136                __func__, event->event, event->device->name, context);
 137        if (ep->rep_connected == 1) {
 138                ep->rep_connected = -EIO;
 139                ep->rep_func(ep);
 140                wake_up_all(&ep->rep_connect_wait);
 141        }
 142}
 143
 144static inline
 145void rpcrdma_event_process(struct ib_wc *wc)
 146{
 147        struct rpcrdma_mw *frmr;
 148        struct rpcrdma_rep *rep =
 149                        (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
 150
 151        dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
 152                __func__, rep, wc->status, wc->opcode, wc->byte_len);
 153
 154        if (!rep) /* send or bind completion that we don't care about */
 155                return;
 156
 157        if (IB_WC_SUCCESS != wc->status) {
 158                dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
 159                        __func__, wc->opcode, wc->status);
 160                rep->rr_len = ~0U;
 161                if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
 162                        rpcrdma_schedule_tasklet(rep);
 163                return;
 164        }
 165
 166        switch (wc->opcode) {
 167        case IB_WC_FAST_REG_MR:
 168                frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
 169                frmr->r.frmr.state = FRMR_IS_VALID;
 170                break;
 171        case IB_WC_LOCAL_INV:
 172                frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
 173                frmr->r.frmr.state = FRMR_IS_INVALID;
 174                break;
 175        case IB_WC_RECV:
 176                rep->rr_len = wc->byte_len;
 177                ib_dma_sync_single_for_cpu(
 178                        rdmab_to_ia(rep->rr_buffer)->ri_id->device,
 179                        rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
  180                /* Keep (only) the most recent credits, after checking validity */
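                     /* (The 16-byte floor below covers the fixed RPC/RDMA
                      *  header fields -- xid, version, credits, type -- so
                      *  rm_credit can be read safely.)
                      */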
 181                if (rep->rr_len >= 16) {
 182                        struct rpcrdma_msg *p =
 183                                        (struct rpcrdma_msg *) rep->rr_base;
 184                        unsigned int credits = ntohl(p->rm_credit);
 185                        if (credits == 0) {
 186                                dprintk("RPC:       %s: server"
 187                                        " dropped credits to 0!\n", __func__);
 188                                /* don't deadlock */
 189                                credits = 1;
 190                        } else if (credits > rep->rr_buffer->rb_max_requests) {
 191                                dprintk("RPC:       %s: server"
 192                                        " over-crediting: %d (%d)\n",
 193                                        __func__, credits,
 194                                        rep->rr_buffer->rb_max_requests);
 195                                credits = rep->rr_buffer->rb_max_requests;
 196                        }
 197                        atomic_set(&rep->rr_buffer->rb_credits, credits);
 198                }
 199                /* fall through */
 200        case IB_WC_BIND_MW:
 201                rpcrdma_schedule_tasklet(rep);
 202                break;
 203        default:
 204                dprintk("RPC:       %s: unexpected WC event %X\n",
 205                        __func__, wc->opcode);
 206                break;
 207        }
 208}
 209
 210static inline int
 211rpcrdma_cq_poll(struct ib_cq *cq)
 212{
 213        struct ib_wc wc;
 214        int rc;
 215
 216        for (;;) {
 217                rc = ib_poll_cq(cq, 1, &wc);
 218                if (rc < 0) {
 219                        dprintk("RPC:       %s: ib_poll_cq failed %i\n",
 220                                __func__, rc);
 221                        return rc;
 222                }
 223                if (rc == 0)
 224                        break;
 225
 226                rpcrdma_event_process(&wc);
 227        }
 228
 229        return 0;
 230}
 231
 232/*
 233 * rpcrdma_cq_event_upcall
 234 *
 235 * This upcall handles recv, send, bind and unbind events.
  236 * It is reentrant, but processes events one at a time to preserve the
  237 * ordering of receives, on which server credit accounting depends.
 238 *
 239 * It is the responsibility of the scheduled tasklet to return
 240 * recv buffers to the pool. NOTE: this affects synchronization of
 241 * connection shutdown. That is, the structures required for
 242 * the completion of the reply handler must remain intact until
 243 * all memory has been reclaimed.
 244 *
  245 * Note that sends are normally posted unsignaled, so few send completions reach this upcall.
 246 */
 247static void
 248rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
 249{
 250        int rc;
 251
 252        rc = rpcrdma_cq_poll(cq);
 253        if (rc)
 254                return;
 255
 256        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 257        if (rc) {
 258                dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
 259                        __func__, rc);
 260                return;
 261        }
 262
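             /* Poll once more after re-arming: completions that arrived
              * between the first poll and ib_req_notify_cq() would otherwise
              * wait for the next interrupt.
              */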
 263        rpcrdma_cq_poll(cq);
 264}
 265
 266#ifdef RPC_DEBUG
 267static const char * const conn[] = {
 268        "address resolved",
 269        "address error",
 270        "route resolved",
 271        "route error",
 272        "connect request",
 273        "connect response",
 274        "connect error",
 275        "unreachable",
 276        "rejected",
 277        "established",
 278        "disconnected",
 279        "device removal"
 280};
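     /* conn[] names the first twelve RDMA_CM_EVENT_* values; the
      * "event->event <= 11" check in rpcrdma_conn_upcall() depends on it
      * staying in step with enum rdma_cm_event_type.
      */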
 281#endif
 282
 283static int
 284rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 285{
 286        struct rpcrdma_xprt *xprt = id->context;
 287        struct rpcrdma_ia *ia = &xprt->rx_ia;
 288        struct rpcrdma_ep *ep = &xprt->rx_ep;
 289#ifdef RPC_DEBUG
 290        struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
 291#endif
 292        struct ib_qp_attr attr;
 293        struct ib_qp_init_attr iattr;
 294        int connstate = 0;
 295
 296        switch (event->event) {
 297        case RDMA_CM_EVENT_ADDR_RESOLVED:
 298        case RDMA_CM_EVENT_ROUTE_RESOLVED:
 299                ia->ri_async_rc = 0;
 300                complete(&ia->ri_done);
 301                break;
 302        case RDMA_CM_EVENT_ADDR_ERROR:
 303                ia->ri_async_rc = -EHOSTUNREACH;
 304                dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
 305                        __func__, ep);
 306                complete(&ia->ri_done);
 307                break;
 308        case RDMA_CM_EVENT_ROUTE_ERROR:
 309                ia->ri_async_rc = -ENETUNREACH;
 310                dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
 311                        __func__, ep);
 312                complete(&ia->ri_done);
 313                break;
 314        case RDMA_CM_EVENT_ESTABLISHED:
 315                connstate = 1;
 316                ib_query_qp(ia->ri_id->qp, &attr,
 317                        IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
 318                        &iattr);
 319                dprintk("RPC:       %s: %d responder resources"
 320                        " (%d initiator)\n",
 321                        __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
 322                goto connected;
 323        case RDMA_CM_EVENT_CONNECT_ERROR:
 324                connstate = -ENOTCONN;
 325                goto connected;
 326        case RDMA_CM_EVENT_UNREACHABLE:
 327                connstate = -ENETDOWN;
 328                goto connected;
 329        case RDMA_CM_EVENT_REJECTED:
 330                connstate = -ECONNREFUSED;
 331                goto connected;
 332        case RDMA_CM_EVENT_DISCONNECTED:
 333                connstate = -ECONNABORTED;
 334                goto connected;
 335        case RDMA_CM_EVENT_DEVICE_REMOVAL:
 336                connstate = -ENODEV;
 337connected:
 338                dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
 339                        __func__,
 340                        (event->event <= 11) ? conn[event->event] :
 341                                                "unknown connection error",
 342                        &addr->sin_addr.s_addr,
 343                        ntohs(addr->sin_port),
 344                        ep, event->event);
 345                atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
 346                dprintk("RPC:       %s: %sconnected\n",
 347                                        __func__, connstate > 0 ? "" : "dis");
 348                ep->rep_connected = connstate;
 349                ep->rep_func(ep);
 350                wake_up_all(&ep->rep_connect_wait);
 351                break;
 352        default:
 353                dprintk("RPC:       %s: unexpected CM event %d\n",
 354                        __func__, event->event);
 355                break;
 356        }
 357
 358#ifdef RPC_DEBUG
 359        if (connstate == 1) {
 360                int ird = attr.max_dest_rd_atomic;
 361                int tird = ep->rep_remote_cma.responder_resources;
 362                printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
 363                        "on %s, memreg %d slots %d ird %d%s\n",
 364                        &addr->sin_addr.s_addr,
 365                        ntohs(addr->sin_port),
 366                        ia->ri_id->device->name,
 367                        ia->ri_memreg_strategy,
 368                        xprt->rx_buf.rb_max_requests,
 369                        ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
 370        } else if (connstate < 0) {
 371                printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
 372                        &addr->sin_addr.s_addr,
 373                        ntohs(addr->sin_port),
 374                        connstate);
 375        }
 376#endif
 377
 378        return 0;
 379}
 380
 381static struct rdma_cm_id *
 382rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 383                        struct rpcrdma_ia *ia, struct sockaddr *addr)
 384{
 385        struct rdma_cm_id *id;
 386        int rc;
 387
 388        init_completion(&ia->ri_done);
 389
 390        id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
 391        if (IS_ERR(id)) {
 392                rc = PTR_ERR(id);
 393                dprintk("RPC:       %s: rdma_create_id() failed %i\n",
 394                        __func__, rc);
 395                return id;
 396        }
 397
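             /* Resolution pattern, used twice below: seed ri_async_rc with
              * -ETIMEDOUT, start the asynchronous CM step, then wait on
              * ri_done. rpcrdma_conn_upcall() overwrites ri_async_rc on
              * completion, so a timed-out or interrupted wait leaves
              * -ETIMEDOUT in place.
              */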
 398        ia->ri_async_rc = -ETIMEDOUT;
 399        rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
 400        if (rc) {
 401                dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
 402                        __func__, rc);
 403                goto out;
 404        }
 405        wait_for_completion_interruptible_timeout(&ia->ri_done,
 406                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 407        rc = ia->ri_async_rc;
 408        if (rc)
 409                goto out;
 410
 411        ia->ri_async_rc = -ETIMEDOUT;
 412        rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
 413        if (rc) {
 414                dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
 415                        __func__, rc);
 416                goto out;
 417        }
 418        wait_for_completion_interruptible_timeout(&ia->ri_done,
 419                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 420        rc = ia->ri_async_rc;
 421        if (rc)
 422                goto out;
 423
 424        return id;
 425
 426out:
 427        rdma_destroy_id(id);
 428        return ERR_PTR(rc);
 429}
 430
 431/*
 432 * Drain any cq, prior to teardown.
 433 */
 434static void
 435rpcrdma_clean_cq(struct ib_cq *cq)
 436{
 437        struct ib_wc wc;
 438        int count = 0;
 439
 440        while (1 == ib_poll_cq(cq, 1, &wc))
 441                ++count;
 442
 443        if (count)
 444                dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
 445                        __func__, count, wc.opcode);
 446}
 447
 448/*
 449 * Exported functions.
 450 */
 451
 452/*
 453 * Open and initialize an Interface Adapter.
 454 *  o initializes fields of struct rpcrdma_ia, including
 455 *    interface and provider attributes and protection zone.
 456 */
 457int
 458rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 459{
 460        int rc, mem_priv;
 461        struct ib_device_attr devattr;
 462        struct rpcrdma_ia *ia = &xprt->rx_ia;
 463
 464        ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
 465        if (IS_ERR(ia->ri_id)) {
 466                rc = PTR_ERR(ia->ri_id);
 467                goto out1;
 468        }
 469
 470        ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
 471        if (IS_ERR(ia->ri_pd)) {
 472                rc = PTR_ERR(ia->ri_pd);
 473                dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
 474                        __func__, rc);
 475                goto out2;
 476        }
 477
 478        /*
 479         * Query the device to determine if the requested memory
 480         * registration strategy is supported. If it isn't, set the
 481         * strategy to a globally supported model.
 482         */
 483        rc = ib_query_device(ia->ri_id->device, &devattr);
 484        if (rc) {
 485                dprintk("RPC:       %s: ib_query_device failed %d\n",
 486                        __func__, rc);
 487                goto out2;
 488        }
 489
 490        if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
 491                ia->ri_have_dma_lkey = 1;
 492                ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
 493        }
 494
 495        switch (memreg) {
 496        case RPCRDMA_MEMWINDOWS:
 497        case RPCRDMA_MEMWINDOWS_ASYNC:
 498                if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
 499                        dprintk("RPC:       %s: MEMWINDOWS registration "
 500                                "specified but not supported by adapter, "
 501                                "using slower RPCRDMA_REGISTER\n",
 502                                __func__);
 503                        memreg = RPCRDMA_REGISTER;
 504                }
 505                break;
 506        case RPCRDMA_MTHCAFMR:
 507                if (!ia->ri_id->device->alloc_fmr) {
 508#if RPCRDMA_PERSISTENT_REGISTRATION
 509                        dprintk("RPC:       %s: MTHCAFMR registration "
 510                                "specified but not supported by adapter, "
 511                                "using riskier RPCRDMA_ALLPHYSICAL\n",
 512                                __func__);
 513                        memreg = RPCRDMA_ALLPHYSICAL;
 514#else
 515                        dprintk("RPC:       %s: MTHCAFMR registration "
 516                                "specified but not supported by adapter, "
 517                                "using slower RPCRDMA_REGISTER\n",
 518                                __func__);
 519                        memreg = RPCRDMA_REGISTER;
 520#endif
 521                }
 522                break;
 523        case RPCRDMA_FRMR:
 524                /* Requires both frmr reg and local dma lkey */
 525                if ((devattr.device_cap_flags &
 526                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
 527                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
 528#if RPCRDMA_PERSISTENT_REGISTRATION
 529                        dprintk("RPC:       %s: FRMR registration "
 530                                "specified but not supported by adapter, "
 531                                "using riskier RPCRDMA_ALLPHYSICAL\n",
 532                                __func__);
 533                        memreg = RPCRDMA_ALLPHYSICAL;
 534#else
 535                        dprintk("RPC:       %s: FRMR registration "
 536                                "specified but not supported by adapter, "
 537                                "using slower RPCRDMA_REGISTER\n",
 538                                __func__);
 539                        memreg = RPCRDMA_REGISTER;
 540#endif
 541                }
 542                break;
 543        }
 544
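             /* At this point memreg names a strategy this adapter supports;
              * unsupported requests were downgraded above to RPCRDMA_REGISTER
              * (or to RPCRDMA_ALLPHYSICAL when persistent registration is
              * compiled in).
              */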
 545        /*
 546         * Optionally obtain an underlying physical identity mapping in
 547         * order to do a memory window-based bind. This base registration
 548         * is protected from remote access - that is enabled only by binding
 549         * for the specific bytes targeted during each RPC operation, and
 550         * revoked after the corresponding completion similar to a storage
 551         * adapter.
 552         */
 553        switch (memreg) {
 554        case RPCRDMA_BOUNCEBUFFERS:
 555        case RPCRDMA_REGISTER:
 556        case RPCRDMA_FRMR:
 557                break;
 558#if RPCRDMA_PERSISTENT_REGISTRATION
 559        case RPCRDMA_ALLPHYSICAL:
 560                mem_priv = IB_ACCESS_LOCAL_WRITE |
 561                                IB_ACCESS_REMOTE_WRITE |
 562                                IB_ACCESS_REMOTE_READ;
 563                goto register_setup;
 564#endif
 565        case RPCRDMA_MEMWINDOWS_ASYNC:
 566        case RPCRDMA_MEMWINDOWS:
 567                mem_priv = IB_ACCESS_LOCAL_WRITE |
 568                                IB_ACCESS_MW_BIND;
 569                goto register_setup;
 570        case RPCRDMA_MTHCAFMR:
 571                if (ia->ri_have_dma_lkey)
 572                        break;
 573                mem_priv = IB_ACCESS_LOCAL_WRITE;
 574        register_setup:
 575                ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
 576                if (IS_ERR(ia->ri_bind_mem)) {
 577                        printk(KERN_ALERT "%s: ib_get_dma_mr for "
 578                                "phys register failed with %lX\n\t"
 579                                "Will continue with degraded performance\n",
 580                                __func__, PTR_ERR(ia->ri_bind_mem));
 581                        memreg = RPCRDMA_REGISTER;
 582                        ia->ri_bind_mem = NULL;
 583                }
 584                break;
 585        default:
 586                printk(KERN_ERR "%s: invalid memory registration mode %d\n",
 587                                __func__, memreg);
 588                rc = -EINVAL;
 589                goto out2;
 590        }
 591        dprintk("RPC:       %s: memory registration strategy is %d\n",
 592                __func__, memreg);
 593
  594        /* Otherwise (RPCRDMA_REGISTER), memory is registered and deregistered per chunk */
 595        ia->ri_memreg_strategy = memreg;
 596
 597        return 0;
 598out2:
 599        rdma_destroy_id(ia->ri_id);
 600        ia->ri_id = NULL;
 601out1:
 602        return rc;
 603}
 604
 605/*
 606 * Clean up/close an IA.
 607 *   o if event handles and PD have been initialized, free them.
 608 *   o close the IA
 609 */
 610void
 611rpcrdma_ia_close(struct rpcrdma_ia *ia)
 612{
 613        int rc;
 614
 615        dprintk("RPC:       %s: entering\n", __func__);
 616        if (ia->ri_bind_mem != NULL) {
 617                rc = ib_dereg_mr(ia->ri_bind_mem);
 618                dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
 619                        __func__, rc);
 620        }
 621        if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
 622                if (ia->ri_id->qp)
 623                        rdma_destroy_qp(ia->ri_id);
 624                rdma_destroy_id(ia->ri_id);
 625                ia->ri_id = NULL;
 626        }
 627        if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
 628                rc = ib_dealloc_pd(ia->ri_pd);
 629                dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
 630                        __func__, rc);
 631        }
 632}
 633
 634/*
 635 * Create unconnected endpoint.
 636 */
 637int
 638rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 639                                struct rpcrdma_create_data_internal *cdata)
 640{
 641        struct ib_device_attr devattr;
 642        int rc, err;
 643
 644        rc = ib_query_device(ia->ri_id->device, &devattr);
 645        if (rc) {
 646                dprintk("RPC:       %s: ib_query_device failed %d\n",
 647                        __func__, rc);
 648                return rc;
 649        }
 650
 651        /* check provider's send/recv wr limits */
 652        if (cdata->max_requests > devattr.max_qp_wr)
 653                cdata->max_requests = devattr.max_qp_wr;
 654
 655        ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
 656        ep->rep_attr.qp_context = ep;
 657        /* send_cq and recv_cq initialized below */
 658        ep->rep_attr.srq = NULL;
 659        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
 660        switch (ia->ri_memreg_strategy) {
 661        case RPCRDMA_FRMR:
 662                /* Add room for frmr register and invalidate WRs.
 663                 * 1. FRMR reg WR for head
 664                 * 2. FRMR invalidate WR for head
 665                 * 3. FRMR reg WR for pagelist
 666                 * 4. FRMR invalidate WR for pagelist
 667                 * 5. FRMR reg WR for tail
 668                 * 6. FRMR invalidate WR for tail
 669                 * 7. The RDMA_SEND WR
 670                 */
 671                ep->rep_attr.cap.max_send_wr *= 7;
 672                if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
 673                        cdata->max_requests = devattr.max_qp_wr / 7;
 674                        if (!cdata->max_requests)
 675                                return -EINVAL;
 676                        ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
 677                }
 678                break;
 679        case RPCRDMA_MEMWINDOWS_ASYNC:
 680        case RPCRDMA_MEMWINDOWS:
 681                /* Add room for mw_binds+unbinds - overkill! */
 682                ep->rep_attr.cap.max_send_wr++;
 683                ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
 684                if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
 685                        return -EINVAL;
 686                break;
 687        default:
 688                break;
 689        }
 690        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 691        ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
 692        ep->rep_attr.cap.max_recv_sge = 1;
 693        ep->rep_attr.cap.max_inline_data = 0;
 694        ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 695        ep->rep_attr.qp_type = IB_QPT_RC;
 696        ep->rep_attr.port_num = ~0;
 697
 698        dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
 699                "iovs: send %d recv %d\n",
 700                __func__,
 701                ep->rep_attr.cap.max_send_wr,
 702                ep->rep_attr.cap.max_recv_wr,
 703                ep->rep_attr.cap.max_send_sge,
 704                ep->rep_attr.cap.max_recv_sge);
 705
 706        /* set trigger for requesting send completion */
 707        ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
 708        switch (ia->ri_memreg_strategy) {
 709        case RPCRDMA_MEMWINDOWS_ASYNC:
 710        case RPCRDMA_MEMWINDOWS:
 711                ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
 712                break;
 713        default:
 714                break;
 715        }
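             /* rep_cqinit is the unsignaled-send budget: sends are posted
              * unsignaled until the INIT_CQCOUNT/DECR_CQCOUNT counters (see
              * xprt_rdma.h) run down, then one signaled send flushes the
              * accumulated completions. A value of 0 signals every send.
              */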
 716        if (ep->rep_cqinit <= 2)
 717                ep->rep_cqinit = 0;
 718        INIT_CQCOUNT(ep);
 719        ep->rep_ia = ia;
 720        init_waitqueue_head(&ep->rep_connect_wait);
 721
 722        /*
 723         * Create a single cq for receive dto and mw_bind (only ever
 724         * care about unbind, really). Send completions are suppressed.
 725         * Use single threaded tasklet upcalls to maintain ordering.
 726         */
 727        ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
 728                                  rpcrdma_cq_async_error_upcall, NULL,
 729                                  ep->rep_attr.cap.max_recv_wr +
 730                                  ep->rep_attr.cap.max_send_wr + 1, 0);
 731        if (IS_ERR(ep->rep_cq)) {
 732                rc = PTR_ERR(ep->rep_cq);
 733                dprintk("RPC:       %s: ib_create_cq failed: %i\n",
 734                        __func__, rc);
 735                goto out1;
 736        }
 737
 738        rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
 739        if (rc) {
 740                dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
 741                        __func__, rc);
 742                goto out2;
 743        }
 744
 745        ep->rep_attr.send_cq = ep->rep_cq;
 746        ep->rep_attr.recv_cq = ep->rep_cq;
 747
 748        /* Initialize cma parameters */
 749
 750        /* RPC/RDMA does not use private data */
 751        ep->rep_remote_cma.private_data = NULL;
 752        ep->rep_remote_cma.private_data_len = 0;
 753
 754        /* Client offers RDMA Read but does not initiate */
 755        ep->rep_remote_cma.initiator_depth = 0;
 756        if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
 757                ep->rep_remote_cma.responder_resources = 0;
 758        else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
 759                ep->rep_remote_cma.responder_resources = 32;
 760        else
 761                ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
 762
 763        ep->rep_remote_cma.retry_count = 7;
 764        ep->rep_remote_cma.flow_control = 0;
 765        ep->rep_remote_cma.rnr_retry_count = 0;
 766
 767        return 0;
 768
 769out2:
 770        err = ib_destroy_cq(ep->rep_cq);
 771        if (err)
 772                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 773                        __func__, err);
 774out1:
 775        return rc;
 776}
 777
 778/*
 779 * rpcrdma_ep_destroy
 780 *
 781 * Disconnect and destroy endpoint. After this, the only
 782 * valid operations on the ep are to free it (if dynamically
 783 * allocated) or re-create it.
 784 *
 785 * The caller's error handling must be sure to not leak the endpoint
 786 * if this function fails.
 787 */
 788int
 789rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 790{
 791        int rc;
 792
 793        dprintk("RPC:       %s: entering, connected is %d\n",
 794                __func__, ep->rep_connected);
 795
 796        if (ia->ri_id->qp) {
 797                rc = rpcrdma_ep_disconnect(ep, ia);
 798                if (rc)
 799                        dprintk("RPC:       %s: rpcrdma_ep_disconnect"
 800                                " returned %i\n", __func__, rc);
 801                rdma_destroy_qp(ia->ri_id);
 802                ia->ri_id->qp = NULL;
 803        }
 804
 805        /* padding - could be done in rpcrdma_buffer_destroy... */
 806        if (ep->rep_pad_mr) {
 807                rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
 808                ep->rep_pad_mr = NULL;
 809        }
 810
 811        rpcrdma_clean_cq(ep->rep_cq);
 812        rc = ib_destroy_cq(ep->rep_cq);
 813        if (rc)
 814                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 815                        __func__, rc);
 816
 817        return rc;
 818}
 819
 820/*
 821 * Connect unconnected endpoint.
 822 */
 823int
 824rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 825{
 826        struct rdma_cm_id *id;
 827        int rc = 0;
 828        int retry_count = 0;
 829
 830        if (ep->rep_connected != 0) {
 831                struct rpcrdma_xprt *xprt;
 832retry:
 833                rc = rpcrdma_ep_disconnect(ep, ia);
 834                if (rc && rc != -ENOTCONN)
 835                        dprintk("RPC:       %s: rpcrdma_ep_disconnect"
 836                                " status %i\n", __func__, rc);
 837                rpcrdma_clean_cq(ep->rep_cq);
 838
 839                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
 840                id = rpcrdma_create_id(xprt, ia,
 841                                (struct sockaddr *)&xprt->rx_data.addr);
 842                if (IS_ERR(id)) {
 843                        rc = PTR_ERR(id);
 844                        goto out;
 845                }
 846                /* TEMP TEMP TEMP - fail if new device:
 847                 * Deregister/remarshal *all* requests!
 848                 * Close and recreate adapter, pd, etc!
 849                 * Re-determine all attributes still sane!
 850                 * More stuff I haven't thought of!
 851                 * Rrrgh!
 852                 */
 853                if (ia->ri_id->device != id->device) {
 854                        printk("RPC:       %s: can't reconnect on "
 855                                "different device!\n", __func__);
 856                        rdma_destroy_id(id);
 857                        rc = -ENETDOWN;
 858                        goto out;
 859                }
 860                /* END TEMP */
 861                rdma_destroy_qp(ia->ri_id);
 862                rdma_destroy_id(ia->ri_id);
 863                ia->ri_id = id;
 864        }
 865
 866        rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
 867        if (rc) {
 868                dprintk("RPC:       %s: rdma_create_qp failed %i\n",
 869                        __func__, rc);
 870                goto out;
 871        }
 872
  873        /* XXX Tavor device performs badly with 2K MTU! */
  874        if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
  875                struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
  876                if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
  877                    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
  878                     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
  879                        struct ib_qp_attr attr = {
  880                                .path_mtu = IB_MTU_1024
  881                        };
  882                        rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
  883                }
  884        }
 885
 886        ep->rep_connected = 0;
 887
 888        rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
 889        if (rc) {
 890                dprintk("RPC:       %s: rdma_connect() failed with %i\n",
 891                                __func__, rc);
 892                goto out;
 893        }
 894
 895        wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
 896
 897        /*
 898         * Check state. A non-peer reject indicates no listener
  899         * (ECONNREFUSED), which may be a transient state. All other
  900         * failures indicate a transport condition for which best-effort
  901         * recovery has already been attempted.
 902         */
 903        if (ep->rep_connected == -ECONNREFUSED &&
 904            ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
 905                dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
 906                goto retry;
 907        }
 908        if (ep->rep_connected <= 0) {
 909                /* Sometimes, the only way to reliably connect to remote
 910                 * CMs is to use same nonzero values for ORD and IRD. */
 911                if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
 912                    (ep->rep_remote_cma.responder_resources == 0 ||
 913                     ep->rep_remote_cma.initiator_depth !=
 914                                ep->rep_remote_cma.responder_resources)) {
 915                        if (ep->rep_remote_cma.responder_resources == 0)
 916                                ep->rep_remote_cma.responder_resources = 1;
 917                        ep->rep_remote_cma.initiator_depth =
 918                                ep->rep_remote_cma.responder_resources;
 919                        goto retry;
 920                }
 921                rc = ep->rep_connected;
 922        } else {
 923                dprintk("RPC:       %s: connected\n", __func__);
 924        }
 925
 926out:
 927        if (rc)
 928                ep->rep_connected = rc;
 929        return rc;
 930}
 931
 932/*
 933 * rpcrdma_ep_disconnect
 934 *
 935 * This is separate from destroy to facilitate the ability
 936 * to reconnect without recreating the endpoint.
 937 *
 938 * This call is not reentrant, and must not be made in parallel
 939 * on the same endpoint.
 940 */
 941int
 942rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 943{
 944        int rc;
 945
 946        rpcrdma_clean_cq(ep->rep_cq);
 947        rc = rdma_disconnect(ia->ri_id);
 948        if (!rc) {
 949                /* returns without wait if not connected */
 950                wait_event_interruptible(ep->rep_connect_wait,
 951                                                        ep->rep_connected != 1);
 952                dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
 953                        (ep->rep_connected == 1) ? "still " : "dis");
 954        } else {
 955                dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
 956                ep->rep_connected = rc;
 957        }
 958        return rc;
 959}
 960
 961/*
 962 * Initialize buffer memory
 963 */
 964int
 965rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 966        struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
 967{
 968        char *p;
 969        size_t len;
 970        int i, rc;
 971        struct rpcrdma_mw *r;
 972
 973        buf->rb_max_requests = cdata->max_requests;
 974        spin_lock_init(&buf->rb_lock);
 975        atomic_set(&buf->rb_credits, 1);
 976
 977        /* Need to allocate:
 978         *   1.  arrays for send and recv pointers
 979         *   2.  arrays of struct rpcrdma_req to fill in pointers
 980         *   3.  array of struct rpcrdma_rep for replies
 981         *   4.  padding, if any
 982         *   5.  mw's, fmr's or frmr's, if any
 983         * Send/recv buffers in req/rep need to be registered
 984         */
 985
 986        len = buf->rb_max_requests *
 987                (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
 988        len += cdata->padding;
 989        switch (ia->ri_memreg_strategy) {
 990        case RPCRDMA_FRMR:
 991                len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
 992                                sizeof(struct rpcrdma_mw);
 993                break;
 994        case RPCRDMA_MTHCAFMR:
 995                /* TBD we are perhaps overallocating here */
 996                len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
 997                                sizeof(struct rpcrdma_mw);
 998                break;
 999        case RPCRDMA_MEMWINDOWS_ASYNC:
1000        case RPCRDMA_MEMWINDOWS:
1001                len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1002                                sizeof(struct rpcrdma_mw);
1003                break;
1004        default:
1005                break;
1006        }
1007
1008        /* allocate 1, 4 and 5 in one shot */
1009        p = kzalloc(len, GFP_KERNEL);
1010        if (p == NULL) {
1011                dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1012                        __func__, len);
1013                rc = -ENOMEM;
1014                goto out;
1015        }
1016        buf->rb_pool = p;       /* for freeing it later */
1017
1018        buf->rb_send_bufs = (struct rpcrdma_req **) p;
1019        p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1020        buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1021        p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1022
1023        /*
1024         * Register the zeroed pad buffer, if any.
1025         */
1026        if (cdata->padding) {
1027                rc = rpcrdma_register_internal(ia, p, cdata->padding,
1028                                            &ep->rep_pad_mr, &ep->rep_pad);
1029                if (rc)
1030                        goto out;
1031        }
1032        p += cdata->padding;
1033
1034        /*
1035         * Allocate the fmr's, or mw's for mw_bind chunk registration.
1036         * We "cycle" the mw's in order to minimize rkey reuse,
1037         * and also reduce unbind-to-bind collision.
1038         */
1039        INIT_LIST_HEAD(&buf->rb_mws);
1040        r = (struct rpcrdma_mw *)p;
1041        switch (ia->ri_memreg_strategy) {
1042        case RPCRDMA_FRMR:
1043                for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1044                        r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1045                                                         RPCRDMA_MAX_SEGS);
1046                        if (IS_ERR(r->r.frmr.fr_mr)) {
1047                                rc = PTR_ERR(r->r.frmr.fr_mr);
1048                                dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1049                                        " failed %i\n", __func__, rc);
1050                                goto out;
1051                        }
1052                        r->r.frmr.fr_pgl =
1053                                ib_alloc_fast_reg_page_list(ia->ri_id->device,
1054                                                            RPCRDMA_MAX_SEGS);
1055                        if (IS_ERR(r->r.frmr.fr_pgl)) {
1056                                rc = PTR_ERR(r->r.frmr.fr_pgl);
1057                                dprintk("RPC:       %s: "
1058                                        "ib_alloc_fast_reg_page_list "
1059                                        "failed %i\n", __func__, rc);
1060                                goto out;
1061                        }
1062                        list_add(&r->mw_list, &buf->rb_mws);
1063                        ++r;
1064                }
1065                break;
1066        case RPCRDMA_MTHCAFMR:
1067                /* TBD we are perhaps overallocating here */
1068                for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1069                        static struct ib_fmr_attr fa =
1070                                { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1071                        r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1072                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1073                                &fa);
1074                        if (IS_ERR(r->r.fmr)) {
1075                                rc = PTR_ERR(r->r.fmr);
1076                                dprintk("RPC:       %s: ib_alloc_fmr"
1077                                        " failed %i\n", __func__, rc);
1078                                goto out;
1079                        }
1080                        list_add(&r->mw_list, &buf->rb_mws);
1081                        ++r;
1082                }
1083                break;
1084        case RPCRDMA_MEMWINDOWS_ASYNC:
1085        case RPCRDMA_MEMWINDOWS:
1086                /* Allocate one extra request's worth, for full cycling */
1087                for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1088                        r->r.mw = ib_alloc_mw(ia->ri_pd);
1089                        if (IS_ERR(r->r.mw)) {
1090                                rc = PTR_ERR(r->r.mw);
1091                                dprintk("RPC:       %s: ib_alloc_mw"
1092                                        " failed %i\n", __func__, rc);
1093                                goto out;
1094                        }
1095                        list_add(&r->mw_list, &buf->rb_mws);
1096                        ++r;
1097                }
1098                break;
1099        default:
1100                break;
1101        }
1102
1103        /*
1104         * Allocate/init the request/reply buffers. Doing this
1105         * using kmalloc for now -- one for each buf.
1106         */
1107        for (i = 0; i < buf->rb_max_requests; i++) {
1108                struct rpcrdma_req *req;
1109                struct rpcrdma_rep *rep;
1110
1111                len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1112                /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1113                /* Typical ~2400b, so rounding up saves work later */
1114                if (len < 4096)
1115                        len = 4096;
1116                req = kmalloc(len, GFP_KERNEL);
1117                if (req == NULL) {
1118                        dprintk("RPC:       %s: request buffer %d alloc"
1119                                " failed\n", __func__, i);
1120                        rc = -ENOMEM;
1121                        goto out;
1122                }
1123                memset(req, 0, sizeof(struct rpcrdma_req));
1124                buf->rb_send_bufs[i] = req;
1125                buf->rb_send_bufs[i]->rl_buffer = buf;
1126
1127                rc = rpcrdma_register_internal(ia, req->rl_base,
1128                                len - offsetof(struct rpcrdma_req, rl_base),
1129                                &buf->rb_send_bufs[i]->rl_handle,
1130                                &buf->rb_send_bufs[i]->rl_iov);
1131                if (rc)
1132                        goto out;
1133
1134                buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1135
1136                len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1137                rep = kmalloc(len, GFP_KERNEL);
1138                if (rep == NULL) {
1139                        dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1140                                __func__, i);
1141                        rc = -ENOMEM;
1142                        goto out;
1143                }
1144                memset(rep, 0, sizeof(struct rpcrdma_rep));
1145                buf->rb_recv_bufs[i] = rep;
1146                buf->rb_recv_bufs[i]->rr_buffer = buf;
1147                init_waitqueue_head(&rep->rr_unbind);
1148
1149                rc = rpcrdma_register_internal(ia, rep->rr_base,
1150                                len - offsetof(struct rpcrdma_rep, rr_base),
1151                                &buf->rb_recv_bufs[i]->rr_handle,
1152                                &buf->rb_recv_bufs[i]->rr_iov);
1153                if (rc)
1154                        goto out;
1155
1156        }
1157        dprintk("RPC:       %s: max_requests %d\n",
1158                __func__, buf->rb_max_requests);
1159        /* done */
1160        return 0;
1161out:
1162        rpcrdma_buffer_destroy(buf);
1163        return rc;
1164}
1165
1166/*
1167 * Unregister and destroy buffer memory. Need to deal with
1168 * partial initialization, so it's callable from failed create.
1169 * Must be called before destroying endpoint, as registrations
1170 * reference it.
1171 */
1172void
1173rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1174{
1175        int rc, i;
1176        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1177        struct rpcrdma_mw *r;
1178
1179        /* clean up in reverse order from create
1180         *   1.  recv mr memory (mr free, then kfree)
1181         *   1a. bind mw memory
1182         *   2.  send mr memory (mr free, then kfree)
1183         *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1184         *   4.  arrays
1185         */
1186        dprintk("RPC:       %s: entering\n", __func__);
1187
1188        for (i = 0; i < buf->rb_max_requests; i++) {
1189                if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1190                        rpcrdma_deregister_internal(ia,
1191                                        buf->rb_recv_bufs[i]->rr_handle,
1192                                        &buf->rb_recv_bufs[i]->rr_iov);
1193                        kfree(buf->rb_recv_bufs[i]);
1194                }
1195                if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1196                        while (!list_empty(&buf->rb_mws)) {
1197                                r = list_entry(buf->rb_mws.next,
1198                                        struct rpcrdma_mw, mw_list);
1199                                list_del(&r->mw_list);
1200                                switch (ia->ri_memreg_strategy) {
1201                                case RPCRDMA_FRMR:
1202                                        rc = ib_dereg_mr(r->r.frmr.fr_mr);
1203                                        if (rc)
1204                                                dprintk("RPC:       %s:"
1205                                                        " ib_dereg_mr"
1206                                                        " failed %i\n",
1207                                                        __func__, rc);
1208                                        ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1209                                        break;
1210                                case RPCRDMA_MTHCAFMR:
1211                                        rc = ib_dealloc_fmr(r->r.fmr);
1212                                        if (rc)
1213                                                dprintk("RPC:       %s:"
1214                                                        " ib_dealloc_fmr"
1215                                                        " failed %i\n",
1216                                                        __func__, rc);
1217                                        break;
1218                                case RPCRDMA_MEMWINDOWS_ASYNC:
1219                                case RPCRDMA_MEMWINDOWS:
1220                                        rc = ib_dealloc_mw(r->r.mw);
1221                                        if (rc)
1222                                                dprintk("RPC:       %s:"
1223                                                        " ib_dealloc_mw"
1224                                                        " failed %i\n",
1225                                                        __func__, rc);
1226                                        break;
1227                                default:
1228                                        break;
1229                                }
1230                        }
1231                        rpcrdma_deregister_internal(ia,
1232                                        buf->rb_send_bufs[i]->rl_handle,
1233                                        &buf->rb_send_bufs[i]->rl_iov);
1234                        kfree(buf->rb_send_bufs[i]);
1235                }
1236        }
1237
1238        kfree(buf->rb_pool);
1239}
1240
1241/*
1242 * Get a set of request/reply buffers.
1243 *
1244 * Reply buffer (if needed) is attached to send buffer upon return.
1245 * Rule:
1246 *    rb_send_index and rb_recv_index MUST always be pointing to the
1247 *    *next* available buffer (non-NULL). They are incremented after
1248 *    removing buffers, and decremented *before* returning them.
1249 */
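     /* Worked example, assuming both indices start at zero: with
      * rb_max_requests = 2, two gets advance rb_send_index to 2 and the pool
      * reports "out of request buffers"; each put pre-decrements the index
      * and restores the pointer, so the rule above always holds.
      */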
1250struct rpcrdma_req *
1251rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1252{
1253        struct rpcrdma_req *req;
1254        unsigned long flags;
1255        int i;
1256        struct rpcrdma_mw *r;
1257
1258        spin_lock_irqsave(&buffers->rb_lock, flags);
1259        if (buffers->rb_send_index == buffers->rb_max_requests) {
1260                spin_unlock_irqrestore(&buffers->rb_lock, flags);
1261                dprintk("RPC:       %s: out of request buffers\n", __func__);
1262                return ((struct rpcrdma_req *)NULL);
1263        }
1264
1265        req = buffers->rb_send_bufs[buffers->rb_send_index];
1266        if (buffers->rb_send_index < buffers->rb_recv_index) {
1267                dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1268                        __func__,
1269                        buffers->rb_recv_index - buffers->rb_send_index);
1270                req->rl_reply = NULL;
1271        } else {
1272                req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1273                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1274        }
1275        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
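             /* Attach a full set of RPCRDMA_MAX_SEGS mws. The pool was sized
              * in rpcrdma_buffer_create() at rb_max_requests (or one more)
              * times RPCRDMA_MAX_SEGS, so the list cannot run dry while a
              * request buffer is still available.
              */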
1276        if (!list_empty(&buffers->rb_mws)) {
1277                i = RPCRDMA_MAX_SEGS - 1;
1278                do {
1279                        r = list_entry(buffers->rb_mws.next,
1280                                        struct rpcrdma_mw, mw_list);
1281                        list_del(&r->mw_list);
1282                        req->rl_segments[i].mr_chunk.rl_mw = r;
1283                } while (--i >= 0);
1284        }
1285        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1286        return req;
1287}
1288
1289/*
1290 * Put request/reply buffers back into pool.
1291 * Pre-decrement counter/array index.
1292 */
1293void
1294rpcrdma_buffer_put(struct rpcrdma_req *req)
1295{
1296        struct rpcrdma_buffer *buffers = req->rl_buffer;
1297        struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1298        int i;
1299        unsigned long flags;
1300
1301        BUG_ON(req->rl_nchunks != 0);
1302        spin_lock_irqsave(&buffers->rb_lock, flags);
1303        buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1304        req->rl_niovs = 0;
1305        if (req->rl_reply) {
1306                buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1307                init_waitqueue_head(&req->rl_reply->rr_unbind);
1308                req->rl_reply->rr_func = NULL;
1309                req->rl_reply = NULL;
1310        }
1311        switch (ia->ri_memreg_strategy) {
1312        case RPCRDMA_FRMR:
1313        case RPCRDMA_MTHCAFMR:
1314        case RPCRDMA_MEMWINDOWS_ASYNC:
1315        case RPCRDMA_MEMWINDOWS:
1316                /*
1317                 * Cycle mw's back in reverse order, and "spin" them.
1318                 * This delays and scrambles reuse as much as possible.
1319                 */
1320                i = 1;
1321                do {
1322                        struct rpcrdma_mw **mw;
1323                        mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324                        list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325                        *mw = NULL;
1326                } while (++i < RPCRDMA_MAX_SEGS);
1327                list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328                                        &buffers->rb_mws);
1329                req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330                break;
1331        default:
1332                break;
1333        }
1334        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1335}
1336
1337/*
1338 * Recover reply buffers from pool.
1339 * This happens when recovering from error conditions.
1340 * Post-increment counter/array index.
1341 */
1342void
1343rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344{
1345        struct rpcrdma_buffer *buffers = req->rl_buffer;
1346        unsigned long flags;
1347
1348        if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1349                buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350        spin_lock_irqsave(&buffers->rb_lock, flags);
1351        if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352                req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1354        }
1355        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1356}
1357
1358/*
1359 * Put reply buffers back into pool when not attached to
1360 * request. This happens in error conditions, and when
1361 * aborting unbinds. Pre-decrement counter/array index.
1362 */
1363void
1364rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1365{
1366        struct rpcrdma_buffer *buffers = rep->rr_buffer;
1367        unsigned long flags;
1368
1369        rep->rr_func = NULL;
1370        spin_lock_irqsave(&buffers->rb_lock, flags);
1371        buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1372        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1373}
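
/*
 * Pool-index invariant for the helpers above: rb_send_index and
 * rb_recv_index act as stack pointers into rb_send_bufs[] and
 * rb_recv_bufs[].  The *_get() paths hand a buffer out and post-increment,
 * the *_put() paths pre-decrement and push it back, so an index of zero
 * means a full pool and an index of rb_max_requests means an empty one.
 */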
1374
1375/*
1376 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1377 */
1378
1379int
1380rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1381                                struct ib_mr **mrp, struct ib_sge *iov)
1382{
1383        struct ib_phys_buf ipb;
1384        struct ib_mr *mr;
1385        int rc;
1386
1387        /*
1388         * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1389         */
1390        iov->addr = ib_dma_map_single(ia->ri_id->device,
1391                        va, len, DMA_BIDIRECTIONAL);
1392        iov->length = len;
1393
1394        if (ia->ri_have_dma_lkey) {
1395                *mrp = NULL;
1396                iov->lkey = ia->ri_dma_lkey;
1397                return 0;
1398        } else if (ia->ri_bind_mem != NULL) {
1399                *mrp = NULL;
1400                iov->lkey = ia->ri_bind_mem->lkey;
1401                return 0;
1402        }
1403
1404        ipb.addr = iov->addr;
1405        ipb.size = iov->length;
1406        mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1407                        IB_ACCESS_LOCAL_WRITE, &iov->addr);
1408
1409        dprintk("RPC:       %s: phys convert: 0x%llx "
1410                        "registered 0x%llx length %d\n",
1411                        __func__, (unsigned long long)ipb.addr,
1412                        (unsigned long long)iov->addr, len);
1413
1414        if (IS_ERR(mr)) {
1415                *mrp = NULL;
1416                rc = PTR_ERR(mr);
1417                dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1418        } else {
1419                *mrp = mr;
1420                iov->lkey = mr->lkey;
1421                rc = 0;
1422        }
1423
1424        return rc;
1425}
1426
1427int
1428rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429                                struct ib_mr *mr, struct ib_sge *iov)
1430{
1431        int rc;
1432
1433        ib_dma_unmap_single(ia->ri_id->device,
1434                        iov->addr, iov->length, DMA_BIDIRECTIONAL);
1435
1436        if (mr == NULL)
1437                return 0;
1438
1439        rc = ib_dereg_mr(mr);
1440        if (rc)
1441                dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1442        return rc;
1443}
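
/*
 * Illustrative sketch only: a typical round trip through the internal
 * registration wrappers above for a kmalloc'ed region.  The buffer, its
 * PAGE_SIZE length and the sketch function itself are hypothetical; the
 * real callers are the req/rep buffer setup paths elsewhere in this file.
 */
static int __maybe_unused
rpcrdma_internal_reg_sketch(struct rpcrdma_ia *ia)
{
        struct ib_mr *mr;
        struct ib_sge iov;
        void *p;
        int rc;

        p = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (p == NULL)
                return -ENOMEM;

        rc = rpcrdma_register_internal(ia, p, PAGE_SIZE, &mr, &iov);
        if (rc == 0) {
                /* iov.addr, iov.length and iov.lkey are now usable as a
                 * local SGE in send or receive work requests */
                rc = rpcrdma_deregister_internal(ia, mr, &iov);
        }
        kfree(p);
        return rc;
}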
1444
1445/*
1446 * Wrappers for chunk registration, shared by read/write chunk code.
1447 */
1448
1449static void
1450rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1451{
1452        seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1453        seg->mr_dmalen = seg->mr_len;
1454        if (seg->mr_page)
1455                seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1456                                seg->mr_page, offset_in_page(seg->mr_offset),
1457                                seg->mr_dmalen, seg->mr_dir);
1458        else
1459                seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1460                                seg->mr_offset,
1461                                seg->mr_dmalen, seg->mr_dir);
1462        if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463                        dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dmalen %zu\n",
1464                        __func__,
1465                        (unsigned long long)seg->mr_dma,
1466                        seg->mr_offset, seg->mr_dmalen);
1467        }
1468}
1469
1470static void
1471rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1472{
1473        if (seg->mr_page)
1474                ib_dma_unmap_page(ia->ri_id->device,
1475                                seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1476        else
1477                ib_dma_unmap_single(ia->ri_id->device,
1478                                seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1479}
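
/*
 * Note on mr_dir above: "writing" means the remote peer will RDMA Write
 * into this memory (typically an NFS READ payload), so the device delivers
 * data toward the CPU and the mapping is DMA_FROM_DEVICE; otherwise the
 * peer RDMA Reads from it and the mapping is DMA_TO_DEVICE.
 */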
1480
1481static int
1482rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1483                        int *nsegs, int writing, struct rpcrdma_ia *ia,
1484                        struct rpcrdma_xprt *r_xprt)
1485{
1486        struct rpcrdma_mr_seg *seg1 = seg;
1487        struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1488
1489        u8 key;
1490        int len, pageoff;
1491        int i, rc;
1492
1493        pageoff = offset_in_page(seg1->mr_offset);
1494        seg1->mr_offset -= pageoff;     /* start of page */
1495        seg1->mr_len += pageoff;
1496        len = -pageoff;
1497        if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1498                *nsegs = RPCRDMA_MAX_DATA_SEGS;
1499        for (i = 0; i < *nsegs;) {
1500                rpcrdma_map_one(ia, seg, writing);
1501                seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1502                len += seg->mr_len;
1503                BUG_ON(seg->mr_len > PAGE_SIZE);
1504                ++seg;
1505                ++i;
1506                /* Check for holes */
1507                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1508                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1509                        break;
1510        }
1511        dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1512                __func__, seg1->mr_chunk.rl_mw, i);
1513
1514        if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1515                dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
1516                        __func__,
1517                        seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1518                /* Invalidate before using. */
1519                memset(&invalidate_wr, 0, sizeof invalidate_wr);
1520                invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1521                invalidate_wr.next = &frmr_wr;
1522                invalidate_wr.opcode = IB_WR_LOCAL_INV;
1523                invalidate_wr.send_flags = IB_SEND_SIGNALED;
1524                invalidate_wr.ex.invalidate_rkey =
1525                        seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1526                DECR_CQCOUNT(&r_xprt->rx_ep);
1527                post_wr = &invalidate_wr;
1528        } else
1529                post_wr = &frmr_wr;
1530
1531        /* Bump the key */
1532        key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1533        ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1534
1535        /* Prepare FRMR WR */
1536        memset(&frmr_wr, 0, sizeof frmr_wr);
1537        frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1538        frmr_wr.opcode = IB_WR_FAST_REG_MR;
1539        frmr_wr.send_flags = IB_SEND_SIGNALED;
1540        frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1541        frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1542        frmr_wr.wr.fast_reg.page_list_len = i;
1543        frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1544        frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1545        BUG_ON(frmr_wr.wr.fast_reg.length < len);
1546        frmr_wr.wr.fast_reg.access_flags = (writing ?
1547                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1548                                IB_ACCESS_REMOTE_READ);
1549        frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1550        DECR_CQCOUNT(&r_xprt->rx_ep);
1551
1552        rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1553
1554        if (rc) {
1555                dprintk("RPC:       %s: failed ib_post_send for register,"
1556                        " status %i\n", __func__, rc);
1557                while (i--)
1558                        rpcrdma_unmap_one(ia, --seg);
1559        } else {
1560                seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1561                seg1->mr_base = seg1->mr_dma + pageoff;
1562                seg1->mr_nsegs = i;
1563                seg1->mr_len = len;
1564        }
1565        *nsegs = i;
1566        return rc;
1567}
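
/*
 * Two notes on the FRMR path above (numbers are illustrative only):
 *
 * o Page-offset bookkeeping: if the first segment starts 100 bytes into a
 *   page with mr_len 500, then pageoff = 100, mr_offset is backed up to
 *   the page start, mr_len grows to 600, and len starts at -100 so that
 *   after the first pass len is the true 500 bytes; mr_base is later
 *   rebuilt as mr_dma + pageoff.
 *
 * o The key "bump": ib_update_fast_reg_key() replaces only the low 8 bits
 *   of the rkey, e.g. 0x12345607 becomes 0x12345608, so an rkey handed to
 *   the server for a previous RPC no longer matches the re-registered MR.
 */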
1568
1569static int
1570rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1571                        struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1572{
1573        struct rpcrdma_mr_seg *seg1 = seg;
1574        struct ib_send_wr invalidate_wr, *bad_wr;
1575        int rc;
1576
1577        while (seg1->mr_nsegs--)
1578                rpcrdma_unmap_one(ia, seg++);
1579
1580        memset(&invalidate_wr, 0, sizeof invalidate_wr);
1581        invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1582        invalidate_wr.opcode = IB_WR_LOCAL_INV;
1583        invalidate_wr.send_flags = IB_SEND_SIGNALED;
1584        invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1585        DECR_CQCOUNT(&r_xprt->rx_ep);
1586
1587        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1588        if (rc)
1589                dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1590                        " status %i\n", __func__, rc);
1591        return rc;
1592}
1593
1594static int
1595rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1596                        int *nsegs, int writing, struct rpcrdma_ia *ia)
1597{
1598        struct rpcrdma_mr_seg *seg1 = seg;
1599        u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1600        int len, pageoff, i, rc;
1601
1602        pageoff = offset_in_page(seg1->mr_offset);
1603        seg1->mr_offset -= pageoff;     /* start of page */
1604        seg1->mr_len += pageoff;
1605        len = -pageoff;
1606        if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1607                *nsegs = RPCRDMA_MAX_DATA_SEGS;
1608        for (i = 0; i < *nsegs;) {
1609                rpcrdma_map_one(ia, seg, writing);
1610                physaddrs[i] = seg->mr_dma;
1611                len += seg->mr_len;
1612                ++seg;
1613                ++i;
1614                /* Check for holes */
1615                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1616                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1617                        break;
1618        }
1619        rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1620                                physaddrs, i, seg1->mr_dma);
1621        if (rc) {
1622                dprintk("RPC:       %s: failed ib_map_phys_fmr "
1623                        "%u@0x%llx+%i (%d)... status %i\n", __func__,
1624                        len, (unsigned long long)seg1->mr_dma,
1625                        pageoff, i, rc);
1626                while (i--)
1627                        rpcrdma_unmap_one(ia, --seg);
1628        } else {
1629                seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1630                seg1->mr_base = seg1->mr_dma + pageoff;
1631                seg1->mr_nsegs = i;
1632                seg1->mr_len = len;
1633        }
1634        *nsegs = i;
1635        return rc;
1636}
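
/*
 * Worked example of the "check for holes" test used above (and in the
 * FRMR and PHYSICAL paths): segments are only coalesced into a single
 * registration while they tile whole pages.  Three page-aligned 4096-byte
 * segments coalesce into one mapping; but if the segment just added ends
 * mid-page, or the next one starts mid-page, the loop stops and the
 * remaining segments are left for the caller's next registration call.
 */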
1637
1638static int
1639rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1640                        struct rpcrdma_ia *ia)
1641{
1642        struct rpcrdma_mr_seg *seg1 = seg;
1643        LIST_HEAD(l);
1644        int rc;
1645
1646        list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1647        rc = ib_unmap_fmr(&l);
1648        while (seg1->mr_nsegs--)
1649                rpcrdma_unmap_one(ia, seg++);
1650        if (rc)
1651                dprintk("RPC:       %s: failed ib_unmap_fmr,"
1652                        " status %i\n", __func__, rc);
1653        return rc;
1654}
1655
1656static int
1657rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1658                        int *nsegs, int writing, struct rpcrdma_ia *ia,
1659                        struct rpcrdma_xprt *r_xprt)
1660{
1661        int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1662                                  IB_ACCESS_REMOTE_READ);
1663        struct ib_mw_bind param;
1664        int rc;
1665
1666        *nsegs = 1;
1667        rpcrdma_map_one(ia, seg, writing);
1668        param.mr = ia->ri_bind_mem;
1669        param.wr_id = 0ULL;     /* no send cookie */
1670        param.addr = seg->mr_dma;
1671        param.length = seg->mr_len;
1672        param.send_flags = 0;
1673        param.mw_access_flags = mem_priv;
1674
1675        DECR_CQCOUNT(&r_xprt->rx_ep);
1676        rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1677        if (rc) {
1678                dprintk("RPC:       %s: failed ib_bind_mw "
1679                        "%u@0x%llx status %i\n",
1680                        __func__, seg->mr_len,
1681                        (unsigned long long)seg->mr_dma, rc);
1682                rpcrdma_unmap_one(ia, seg);
1683        } else {
1684                seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1685                seg->mr_base = param.addr;
1686                seg->mr_nsegs = 1;
1687        }
1688        return rc;
1689}
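
/*
 * Note on the bind above: ib_bind_mw() posts the bind as a work request on
 * the connection's QP, which is why it is charged against the send
 * completion budget with DECR_CQCOUNT().  A memory window binds exactly
 * one contiguous, already-mapped region, hence *nsegs is forced to 1.
 */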
1690
1691static int
1692rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1693                        struct rpcrdma_ia *ia,
1694                        struct rpcrdma_xprt *r_xprt, void **r)
1695{
1696        struct ib_mw_bind param;
1697        LIST_HEAD(l);
1698        int rc;
1699
1700        BUG_ON(seg->mr_nsegs != 1);
1701        param.mr = ia->ri_bind_mem;
1702        param.addr = 0ULL;      /* unbind */
1703        param.length = 0;
1704        param.mw_access_flags = 0;
1705        if (*r) {
1706                param.wr_id = (u64) (unsigned long) *r;
1707                param.send_flags = IB_SEND_SIGNALED;
1708                INIT_CQCOUNT(&r_xprt->rx_ep);
1709        } else {
1710                param.wr_id = 0ULL;
1711                param.send_flags = 0;
1712                DECR_CQCOUNT(&r_xprt->rx_ep);
1713        }
1714        rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1715        rpcrdma_unmap_one(ia, seg);
1716        if (rc)
1717                dprintk("RPC:       %s: failed ib_(un)bind_mw,"
1718                        " status %i\n", __func__, rc);
1719        else
1720                *r = NULL;      /* will upcall on completion */
1721        return rc;
1722}
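
/*
 * Note on the unbind above: when the caller passes a reply pointer in *r,
 * the unbind is posted IB_SEND_SIGNALED with that pointer as its wr_id,
 * the reply callback is deferred to the send completion, and *r is cleared
 * here so rpcrdma_deregister_external() does not invoke it early.  With no
 * reply pointer the unbind goes out unsignaled like an ordinary bind.
 */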
1723
1724static int
1725rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1726                        int *nsegs, int writing, struct rpcrdma_ia *ia)
1727{
1728        int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1729                                  IB_ACCESS_REMOTE_READ);
1730        struct rpcrdma_mr_seg *seg1 = seg;
1731        struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1732        int len, i, rc = 0;
1733
1734        if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1735                *nsegs = RPCRDMA_MAX_DATA_SEGS;
1736        for (len = 0, i = 0; i < *nsegs;) {
1737                rpcrdma_map_one(ia, seg, writing);
1738                ipb[i].addr = seg->mr_dma;
1739                ipb[i].size = seg->mr_len;
1740                len += seg->mr_len;
1741                ++seg;
1742                ++i;
1743                /* Check for holes */
1744                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1745                    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1746                        break;
1747        }
1748        seg1->mr_base = seg1->mr_dma;
1749        seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1750                                ipb, i, mem_priv, &seg1->mr_base);
1751        if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1752                rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1753                dprintk("RPC:       %s: failed ib_reg_phys_mr "
1754                        "%u@0x%llx (%d)... status %i\n",
1755                        __func__, len,
1756                        (unsigned long long)seg1->mr_dma, i, rc);
1757                while (i--)
1758                        rpcrdma_unmap_one(ia, --seg);
1759        } else {
1760                seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1761                seg1->mr_nsegs = i;
1762                seg1->mr_len = len;
1763        }
1764        *nsegs = i;
1765        return rc;
1766}
1767
1768static int
1769rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1770                        struct rpcrdma_ia *ia)
1771{
1772        struct rpcrdma_mr_seg *seg1 = seg;
1773        int rc;
1774
1775        rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1776        seg1->mr_chunk.rl_mr = NULL;
1777        while (seg1->mr_nsegs--)
1778                rpcrdma_unmap_one(ia, seg++);
1779        if (rc)
1780                dprintk("RPC:       %s: failed ib_dereg_mr,"
1781                        " status %i\n", __func__, rc);
1782        return rc;
1783}
1784
1785int
1786rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1787                        int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1788{
1789        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1790        int rc = 0;
1791
1792        switch (ia->ri_memreg_strategy) {
1793
1794#if RPCRDMA_PERSISTENT_REGISTRATION
1795        case RPCRDMA_ALLPHYSICAL:
1796                rpcrdma_map_one(ia, seg, writing);
1797                seg->mr_rkey = ia->ri_bind_mem->rkey;
1798                seg->mr_base = seg->mr_dma;
1799                seg->mr_nsegs = 1;
1800                nsegs = 1;
1801                break;
1802#endif
1803
1804        /* Registration using fast register memory regions (FRMR) */
1805        case RPCRDMA_FRMR:
1806                rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1807                break;
1808
1809        /* Registration using fast memory regions (FMR) */
1810        case RPCRDMA_MTHCAFMR:
1811                rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1812                break;
1813
1814        /* Registration using memory windows */
1815        case RPCRDMA_MEMWINDOWS_ASYNC:
1816        case RPCRDMA_MEMWINDOWS:
1817                rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1818                break;
1819
1820        /* Default registration each time */
1821        default:
1822                rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1823                break;
1824        }
1825        if (rc)
1826                return -1;
1827
1828        return nsegs;
1829}
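
/*
 * Illustrative sketch only: rpcrdma_register_external() may map fewer
 * segments than requested (it stops at page-alignment holes, and the
 * MEMWINDOWS path maps a single segment per call), so a chunk builder
 * invokes it in a loop.  This helper is hypothetical; the real loop lives
 * in the chunk-marshaling code in rpc_rdma.c.
 */
static int __maybe_unused
rpcrdma_register_all_sketch(struct rpcrdma_mr_seg *seg, int nsegs,
                            int writing, struct rpcrdma_xprt *r_xprt)
{
        int n, done = 0;

        while (nsegs > 0) {
                n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
                if (n <= 0)
                        return -1;      /* registration failed */
                seg += n;
                nsegs -= n;
                done += n;
        }
        return done;
}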
1830
1831int
1832rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1833                struct rpcrdma_xprt *r_xprt, void *r)
1834{
1835        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1836        int nsegs = seg->mr_nsegs, rc;
1837
1838        switch (ia->ri_memreg_strategy) {
1839
1840#if RPCRDMA_PERSISTENT_REGISTRATION
1841        case RPCRDMA_ALLPHYSICAL:
1842                BUG_ON(nsegs != 1);
1843                rpcrdma_unmap_one(ia, seg);
1844                rc = 0;
1845                break;
1846#endif
1847
1848        case RPCRDMA_FRMR:
1849                rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1850                break;
1851
1852        case RPCRDMA_MTHCAFMR:
1853                rc = rpcrdma_deregister_fmr_external(seg, ia);
1854                break;
1855
1856        case RPCRDMA_MEMWINDOWS_ASYNC:
1857        case RPCRDMA_MEMWINDOWS:
1858                rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1859                break;
1860
1861        default:
1862                rc = rpcrdma_deregister_default_external(seg, ia);
1863                break;
1864        }
1865        if (r) {
1866                struct rpcrdma_rep *rep = r;
1867                void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1868                rep->rr_func = NULL;
1869                func(rep);      /* dereg done, callback now */
1870        }
1871        return nsegs;
1872}
1873
1874/*
1875 * Prepost any receive buffer, then post send.
1876 *
1877 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1878 */
1879int
1880rpcrdma_ep_post(struct rpcrdma_ia *ia,
1881                struct rpcrdma_ep *ep,
1882                struct rpcrdma_req *req)
1883{
1884        struct ib_send_wr send_wr, *send_wr_fail;
1885        struct rpcrdma_rep *rep = req->rl_reply;
1886        int rc;
1887
1888        if (rep) {
1889                rc = rpcrdma_ep_post_recv(ia, ep, rep);
1890                if (rc)
1891                        goto out;
1892                req->rl_reply = NULL;
1893        }
1894
1895        send_wr.next = NULL;
1896        send_wr.wr_id = 0ULL;   /* no send cookie */
1897        send_wr.sg_list = req->rl_send_iov;
1898        send_wr.num_sge = req->rl_niovs;
1899        send_wr.opcode = IB_WR_SEND;
1900        if (send_wr.num_sge == 4)       /* the zero pad (sge[2]) is constant and needs no sync */
1901                ib_dma_sync_single_for_device(ia->ri_id->device,
1902                        req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1903                        DMA_TO_DEVICE);
1904        ib_dma_sync_single_for_device(ia->ri_id->device,
1905                req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1906                DMA_TO_DEVICE);
1907        ib_dma_sync_single_for_device(ia->ri_id->device,
1908                req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1909                DMA_TO_DEVICE);
1910
1911        if (DECR_CQCOUNT(ep) > 0)
1912                send_wr.send_flags = 0;
1913        else { /* Provider must take a send completion every now and then */
1914                INIT_CQCOUNT(ep);
1915                send_wr.send_flags = IB_SEND_SIGNALED;
1916        }
1917
1918        rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1919        if (rc)
1920                dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
1921                        rc);
1922out:
1923        return rc;
1924}
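
/*
 * Note on the completion throttling above: sends normally go out
 * unsignaled; only when DECR_CQCOUNT() runs the budget down does the
 * counter get re-armed with INIT_CQCOUNT() and that one send posted
 * IB_SEND_SIGNALED, so the provider reaps a send completion periodically
 * without paying for an interrupt on every RPC.
 */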
1925
1926/*
1927 * (Re)post a receive buffer.
1928 */
1929int
1930rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1931                     struct rpcrdma_ep *ep,
1932                     struct rpcrdma_rep *rep)
1933{
1934        struct ib_recv_wr recv_wr, *recv_wr_fail;
1935        int rc;
1936
1937        recv_wr.next = NULL;
1938        recv_wr.wr_id = (u64) (unsigned long) rep;
1939        recv_wr.sg_list = &rep->rr_iov;
1940        recv_wr.num_sge = 1;
1941
1942        ib_dma_sync_single_for_cpu(ia->ri_id->device,
1943                rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1944
1945        DECR_CQCOUNT(ep);
1946        rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1947
1948        if (rc)
1949                dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
1950                        rc);
1951        return rc;
1952}
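
/*
 * Note on the receive posting above: the rep pointer itself is stored in
 * wr_id, which is how the receive completion path recovers the
 * rpcrdma_rep to hand to the reply tasklet once the server's reply lands
 * in rr_iov.
 */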
1953