linux/net/sunrpc/xprtrdma/verbs.c
   1/*
   2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the BSD-type
   8 * license below:
   9 *
  10 * Redistribution and use in source and binary forms, with or without
  11 * modification, are permitted provided that the following conditions
  12 * are met:
  13 *
  14 *      Redistributions of source code must retain the above copyright
  15 *      notice, this list of conditions and the following disclaimer.
  16 *
  17 *      Redistributions in binary form must reproduce the above
  18 *      copyright notice, this list of conditions and the following
  19 *      disclaimer in the documentation and/or other materials provided
  20 *      with the distribution.
  21 *
  22 *      Neither the name of the Network Appliance, Inc. nor the names of
  23 *      its contributors may be used to endorse or promote products
  24 *      derived from this software without specific prior written
  25 *      permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38 */
  39
  40/*
  41 * verbs.c
  42 *
  43 * Encapsulates the major functions managing:
  44 *  o adapters
  45 *  o endpoints
  46 *  o connections
  47 *  o buffer memory
  48 */
  49
  50#include <linux/pci.h>  /* for Tavor hack below */
  51#include <linux/slab.h>
  52
  53#include "xprt_rdma.h"
  54
  55/*
  56 * Globals/Macros
  57 */
  58
  59#ifdef RPC_DEBUG
  60# define RPCDBG_FACILITY        RPCDBG_TRANS
  61#endif
  62
  63/*
  64 * internal functions
  65 */
  66
  67/*
  68 * handle replies in tasklet context, using a single, global list
  69 * rdma tasklet function -- just turn around and call the func
  70 * for all replies on the list
  71 */
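     /* Note: tasklets run in softirq context, so the rr_func reply
      * handlers invoked from rpcrdma_run_tasklet() must not sleep. */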
  72
  73static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
  74static LIST_HEAD(rpcrdma_tasklets_g);
  75
  76static void
  77rpcrdma_run_tasklet(unsigned long data)
  78{
  79        struct rpcrdma_rep *rep;
  80        void (*func)(struct rpcrdma_rep *);
  81        unsigned long flags;
  82
   83        data = data;    /* the tasklet data argument is unused */
  84        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
  85        while (!list_empty(&rpcrdma_tasklets_g)) {
  86                rep = list_entry(rpcrdma_tasklets_g.next,
  87                                 struct rpcrdma_rep, rr_list);
  88                list_del(&rep->rr_list);
  89                func = rep->rr_func;
  90                rep->rr_func = NULL;
  91                spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
  92
  93                if (func)
  94                        func(rep);
  95                else
  96                        rpcrdma_recv_buffer_put(rep);
  97
  98                spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
  99        }
 100        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
 101}
 102
 103static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
 104
 105static inline void
 106rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
 107{
 108        unsigned long flags;
 109
 110        spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
 111        list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
 112        spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
 113        tasklet_schedule(&rpcrdma_tasklet_g);
 114}
 115
 116static void
 117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
 118{
 119        struct rpcrdma_ep *ep = context;
 120
 121        dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
 122                __func__, event->event, event->device->name, context);
 123        if (ep->rep_connected == 1) {
 124                ep->rep_connected = -EIO;
 125                ep->rep_func(ep);
 126                wake_up_all(&ep->rep_connect_wait);
 127        }
 128}
 129
 130static void
 131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
 132{
 133        struct rpcrdma_ep *ep = context;
 134
 135        dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
 136                __func__, event->event, event->device->name, context);
 137        if (ep->rep_connected == 1) {
 138                ep->rep_connected = -EIO;
 139                ep->rep_func(ep);
 140                wake_up_all(&ep->rep_connect_wait);
 141        }
 142}
 143
 144static inline
 145void rpcrdma_event_process(struct ib_wc *wc)
 146{
 147        struct rpcrdma_rep *rep =
 148                        (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
 149
 150        dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
 151                __func__, rep, wc->status, wc->opcode, wc->byte_len);
 152
 153        if (!rep) /* send or bind completion that we don't care about */
 154                return;
 155
 156        if (IB_WC_SUCCESS != wc->status) {
 157                dprintk("RPC:       %s: %s WC status %X, connection lost\n",
 158                        __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
 159                         wc->status);
 160                rep->rr_len = ~0U;
 161                rpcrdma_schedule_tasklet(rep);
 162                return;
 163        }
 164
 165        switch (wc->opcode) {
 166        case IB_WC_RECV:
 167                rep->rr_len = wc->byte_len;
 168                ib_dma_sync_single_for_cpu(
 169                        rdmab_to_ia(rep->rr_buffer)->ri_id->device,
 170                        rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
  171                /* Keep (only) the most recent credits, after checking validity */
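                     /* The fixed RPC/RDMA header words (xid, vers, credit,
                      * type) occupy the first 16 bytes, so rm_credit can be
                      * read once at least 16 bytes have arrived. */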
 172                if (rep->rr_len >= 16) {
 173                        struct rpcrdma_msg *p =
 174                                        (struct rpcrdma_msg *) rep->rr_base;
 175                        unsigned int credits = ntohl(p->rm_credit);
 176                        if (credits == 0) {
 177                                dprintk("RPC:       %s: server"
 178                                        " dropped credits to 0!\n", __func__);
 179                                /* don't deadlock */
 180                                credits = 1;
 181                        } else if (credits > rep->rr_buffer->rb_max_requests) {
 182                                dprintk("RPC:       %s: server"
 183                                        " over-crediting: %d (%d)\n",
 184                                        __func__, credits,
 185                                        rep->rr_buffer->rb_max_requests);
 186                                credits = rep->rr_buffer->rb_max_requests;
 187                        }
 188                        atomic_set(&rep->rr_buffer->rb_credits, credits);
 189                }
 190                /* fall through */
 191        case IB_WC_BIND_MW:
 192                rpcrdma_schedule_tasklet(rep);
 193                break;
 194        default:
 195                dprintk("RPC:       %s: unexpected WC event %X\n",
 196                        __func__, wc->opcode);
 197                break;
 198        }
 199}
 200
 201static inline int
 202rpcrdma_cq_poll(struct ib_cq *cq)
 203{
 204        struct ib_wc wc;
 205        int rc;
 206
 207        for (;;) {
 208                rc = ib_poll_cq(cq, 1, &wc);
 209                if (rc < 0) {
 210                        dprintk("RPC:       %s: ib_poll_cq failed %i\n",
 211                                __func__, rc);
 212                        return rc;
 213                }
 214                if (rc == 0)
 215                        break;
 216
 217                rpcrdma_event_process(&wc);
 218        }
 219
 220        return 0;
 221}
 222
 223/*
 224 * rpcrdma_cq_event_upcall
 225 *
 226 * This upcall handles recv, send, bind and unbind events.
 227 * It is reentrant but processes single events in order to maintain
 228 * ordering of receives to keep server credits.
 229 *
 230 * It is the responsibility of the scheduled tasklet to return
 231 * recv buffers to the pool. NOTE: this affects synchronization of
 232 * connection shutdown. That is, the structures required for
 233 * the completion of the reply handler must remain intact until
 234 * all memory has been reclaimed.
 235 *
 236 * Note that send events are suppressed and do not result in an upcall.
 237 */
 238static void
 239rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
 240{
 241        int rc;
 242
 243        rc = rpcrdma_cq_poll(cq);
 244        if (rc)
 245                return;
 246
 247        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 248        if (rc) {
 249                dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
 250                        __func__, rc);
 251                return;
 252        }
 253
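             /* Poll once more after re-arming: a completion that raced with
              * ib_req_notify_cq() above would otherwise go unnoticed until
              * the next CQ event. */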
 254        rpcrdma_cq_poll(cq);
 255}
 256
 257#ifdef RPC_DEBUG
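     /* Connection event strings, indexed below by event->event; the table
      * tracks the first entries (values 0-11) of enum rdma_cm_event. */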
 258static const char * const conn[] = {
 259        "address resolved",
 260        "address error",
 261        "route resolved",
 262        "route error",
 263        "connect request",
 264        "connect response",
 265        "connect error",
 266        "unreachable",
 267        "rejected",
 268        "established",
 269        "disconnected",
 270        "device removal"
 271};
 272#endif
 273
 274static int
 275rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 276{
 277        struct rpcrdma_xprt *xprt = id->context;
 278        struct rpcrdma_ia *ia = &xprt->rx_ia;
 279        struct rpcrdma_ep *ep = &xprt->rx_ep;
 280#ifdef RPC_DEBUG
 281        struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
 282#endif
 283        struct ib_qp_attr attr;
 284        struct ib_qp_init_attr iattr;
 285        int connstate = 0;
 286
 287        switch (event->event) {
 288        case RDMA_CM_EVENT_ADDR_RESOLVED:
 289        case RDMA_CM_EVENT_ROUTE_RESOLVED:
 290                ia->ri_async_rc = 0;
 291                complete(&ia->ri_done);
 292                break;
 293        case RDMA_CM_EVENT_ADDR_ERROR:
 294                ia->ri_async_rc = -EHOSTUNREACH;
 295                dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
 296                        __func__, ep);
 297                complete(&ia->ri_done);
 298                break;
 299        case RDMA_CM_EVENT_ROUTE_ERROR:
 300                ia->ri_async_rc = -ENETUNREACH;
 301                dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
 302                        __func__, ep);
 303                complete(&ia->ri_done);
 304                break;
 305        case RDMA_CM_EVENT_ESTABLISHED:
 306                connstate = 1;
 307                ib_query_qp(ia->ri_id->qp, &attr,
 308                        IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
 309                        &iattr);
 310                dprintk("RPC:       %s: %d responder resources"
 311                        " (%d initiator)\n",
 312                        __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
 313                goto connected;
 314        case RDMA_CM_EVENT_CONNECT_ERROR:
 315                connstate = -ENOTCONN;
 316                goto connected;
 317        case RDMA_CM_EVENT_UNREACHABLE:
 318                connstate = -ENETDOWN;
 319                goto connected;
 320        case RDMA_CM_EVENT_REJECTED:
 321                connstate = -ECONNREFUSED;
 322                goto connected;
 323        case RDMA_CM_EVENT_DISCONNECTED:
 324                connstate = -ECONNABORTED;
 325                goto connected;
 326        case RDMA_CM_EVENT_DEVICE_REMOVAL:
 327                connstate = -ENODEV;
 328connected:
 329                dprintk("RPC:       %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
 330                        __func__,
 331                        (event->event <= 11) ? conn[event->event] :
 332                                                "unknown connection error",
 333                        &addr->sin_addr.s_addr,
 334                        ntohs(addr->sin_port),
 335                        ep, event->event);
 336                atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
 337                dprintk("RPC:       %s: %sconnected\n",
 338                                        __func__, connstate > 0 ? "" : "dis");
 339                ep->rep_connected = connstate;
 340                ep->rep_func(ep);
 341                wake_up_all(&ep->rep_connect_wait);
 342                break;
 343        default:
 344                dprintk("RPC:       %s: unexpected CM event %d\n",
 345                        __func__, event->event);
 346                break;
 347        }
 348
 349#ifdef RPC_DEBUG
 350        if (connstate == 1) {
 351                int ird = attr.max_dest_rd_atomic;
 352                int tird = ep->rep_remote_cma.responder_resources;
 353                printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
 354                        "on %s, memreg %d slots %d ird %d%s\n",
 355                        &addr->sin_addr.s_addr,
 356                        ntohs(addr->sin_port),
 357                        ia->ri_id->device->name,
 358                        ia->ri_memreg_strategy,
 359                        xprt->rx_buf.rb_max_requests,
 360                        ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
 361        } else if (connstate < 0) {
 362                printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
 363                        &addr->sin_addr.s_addr,
 364                        ntohs(addr->sin_port),
 365                        connstate);
 366        }
 367#endif
 368
 369        return 0;
 370}
 371
 372static struct rdma_cm_id *
 373rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 374                        struct rpcrdma_ia *ia, struct sockaddr *addr)
 375{
 376        struct rdma_cm_id *id;
 377        int rc;
 378
 379        init_completion(&ia->ri_done);
 380
 381        id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
 382        if (IS_ERR(id)) {
 383                rc = PTR_ERR(id);
 384                dprintk("RPC:       %s: rdma_create_id() failed %i\n",
 385                        __func__, rc);
 386                return id;
 387        }
 388
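             /* Preset -ETIMEDOUT (here and before rdma_resolve_route() below)
              * so that an expired or interrupted wait is reported as a timeout
              * unless the CM upcall stores a different result first. */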
 389        ia->ri_async_rc = -ETIMEDOUT;
 390        rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
 391        if (rc) {
 392                dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
 393                        __func__, rc);
 394                goto out;
 395        }
 396        wait_for_completion_interruptible_timeout(&ia->ri_done,
 397                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 398        rc = ia->ri_async_rc;
 399        if (rc)
 400                goto out;
 401
 402        ia->ri_async_rc = -ETIMEDOUT;
 403        rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
 404        if (rc) {
 405                dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
 406                        __func__, rc);
 407                goto out;
 408        }
 409        wait_for_completion_interruptible_timeout(&ia->ri_done,
 410                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 411        rc = ia->ri_async_rc;
 412        if (rc)
 413                goto out;
 414
 415        return id;
 416
 417out:
 418        rdma_destroy_id(id);
 419        return ERR_PTR(rc);
 420}
 421
 422/*
 423 * Drain any cq, prior to teardown.
 424 */
 425static void
 426rpcrdma_clean_cq(struct ib_cq *cq)
 427{
 428        struct ib_wc wc;
 429        int count = 0;
 430
 431        while (1 == ib_poll_cq(cq, 1, &wc))
 432                ++count;
 433
 434        if (count)
 435                dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
 436                        __func__, count, wc.opcode);
 437}
 438
 439/*
 440 * Exported functions.
 441 */
 442
 443/*
 444 * Open and initialize an Interface Adapter.
 445 *  o initializes fields of struct rpcrdma_ia, including
 446 *    interface and provider attributes and protection zone.
 447 */
 448int
 449rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 450{
 451        int rc, mem_priv;
 452        struct ib_device_attr devattr;
 453        struct rpcrdma_ia *ia = &xprt->rx_ia;
 454
 455        ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
 456        if (IS_ERR(ia->ri_id)) {
 457                rc = PTR_ERR(ia->ri_id);
 458                goto out1;
 459        }
 460
 461        ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
 462        if (IS_ERR(ia->ri_pd)) {
 463                rc = PTR_ERR(ia->ri_pd);
 464                dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
 465                        __func__, rc);
 466                goto out2;
 467        }
 468
 469        /*
 470         * Query the device to determine if the requested memory
 471         * registration strategy is supported. If it isn't, set the
 472         * strategy to a globally supported model.
 473         */
 474        rc = ib_query_device(ia->ri_id->device, &devattr);
 475        if (rc) {
 476                dprintk("RPC:       %s: ib_query_device failed %d\n",
 477                        __func__, rc);
 478                goto out2;
 479        }
 480
 481        if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
 482                ia->ri_have_dma_lkey = 1;
 483                ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
 484        }
 485
 486        switch (memreg) {
 487        case RPCRDMA_MEMWINDOWS:
 488        case RPCRDMA_MEMWINDOWS_ASYNC:
 489                if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
 490                        dprintk("RPC:       %s: MEMWINDOWS registration "
 491                                "specified but not supported by adapter, "
 492                                "using slower RPCRDMA_REGISTER\n",
 493                                __func__);
 494                        memreg = RPCRDMA_REGISTER;
 495                }
 496                break;
 497        case RPCRDMA_MTHCAFMR:
 498                if (!ia->ri_id->device->alloc_fmr) {
 499#if RPCRDMA_PERSISTENT_REGISTRATION
 500                        dprintk("RPC:       %s: MTHCAFMR registration "
 501                                "specified but not supported by adapter, "
 502                                "using riskier RPCRDMA_ALLPHYSICAL\n",
 503                                __func__);
 504                        memreg = RPCRDMA_ALLPHYSICAL;
 505#else
 506                        dprintk("RPC:       %s: MTHCAFMR registration "
 507                                "specified but not supported by adapter, "
 508                                "using slower RPCRDMA_REGISTER\n",
 509                                __func__);
 510                        memreg = RPCRDMA_REGISTER;
 511#endif
 512                }
 513                break;
 514        case RPCRDMA_FRMR:
 515                /* Requires both frmr reg and local dma lkey */
 516                if ((devattr.device_cap_flags &
 517                     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
 518                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
 519#if RPCRDMA_PERSISTENT_REGISTRATION
 520                        dprintk("RPC:       %s: FRMR registration "
 521                                "specified but not supported by adapter, "
 522                                "using riskier RPCRDMA_ALLPHYSICAL\n",
 523                                __func__);
 524                        memreg = RPCRDMA_ALLPHYSICAL;
 525#else
 526                        dprintk("RPC:       %s: FRMR registration "
 527                                "specified but not supported by adapter, "
 528                                "using slower RPCRDMA_REGISTER\n",
 529                                __func__);
 530                        memreg = RPCRDMA_REGISTER;
 531#endif
 532                }
 533                break;
 534        }
 535
 536        /*
 537         * Optionally obtain an underlying physical identity mapping in
 538         * order to do a memory window-based bind. This base registration
 539         * is protected from remote access - that is enabled only by binding
 540         * for the specific bytes targeted during each RPC operation, and
 541         * revoked after the corresponding completion similar to a storage
 542         * adapter.
 543         */
 544        switch (memreg) {
 545        case RPCRDMA_BOUNCEBUFFERS:
 546        case RPCRDMA_REGISTER:
 547        case RPCRDMA_FRMR:
 548                break;
 549#if RPCRDMA_PERSISTENT_REGISTRATION
 550        case RPCRDMA_ALLPHYSICAL:
 551                mem_priv = IB_ACCESS_LOCAL_WRITE |
 552                                IB_ACCESS_REMOTE_WRITE |
 553                                IB_ACCESS_REMOTE_READ;
 554                goto register_setup;
 555#endif
 556        case RPCRDMA_MEMWINDOWS_ASYNC:
 557        case RPCRDMA_MEMWINDOWS:
 558                mem_priv = IB_ACCESS_LOCAL_WRITE |
 559                                IB_ACCESS_MW_BIND;
 560                goto register_setup;
 561        case RPCRDMA_MTHCAFMR:
 562                if (ia->ri_have_dma_lkey)
 563                        break;
 564                mem_priv = IB_ACCESS_LOCAL_WRITE;
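             /* register_setup is reached by goto from the ALLPHYSICAL and
              * MEMWINDOWS cases above, and by fallthrough from MTHCAFMR when
              * no local DMA lkey is available. */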
 565        register_setup:
 566                ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
 567                if (IS_ERR(ia->ri_bind_mem)) {
 568                        printk(KERN_ALERT "%s: ib_get_dma_mr for "
 569                                "phys register failed with %lX\n\t"
 570                                "Will continue with degraded performance\n",
 571                                __func__, PTR_ERR(ia->ri_bind_mem));
 572                        memreg = RPCRDMA_REGISTER;
 573                        ia->ri_bind_mem = NULL;
 574                }
 575                break;
 576        default:
 577                printk(KERN_ERR "%s: invalid memory registration mode %d\n",
 578                                __func__, memreg);
 579                rc = -EINVAL;
 580                goto out2;
 581        }
 582        dprintk("RPC:       %s: memory registration strategy is %d\n",
 583                __func__, memreg);
 584
 585        /* Else will do memory reg/dereg for each chunk */
 586        ia->ri_memreg_strategy = memreg;
 587
 588        return 0;
 589out2:
 590        rdma_destroy_id(ia->ri_id);
 591        ia->ri_id = NULL;
 592out1:
 593        return rc;
 594}
 595
 596/*
 597 * Clean up/close an IA.
 598 *   o if event handles and PD have been initialized, free them.
 599 *   o close the IA
 600 */
 601void
 602rpcrdma_ia_close(struct rpcrdma_ia *ia)
 603{
 604        int rc;
 605
 606        dprintk("RPC:       %s: entering\n", __func__);
 607        if (ia->ri_bind_mem != NULL) {
 608                rc = ib_dereg_mr(ia->ri_bind_mem);
 609                dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
 610                        __func__, rc);
 611        }
 612        if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
 613                if (ia->ri_id->qp)
 614                        rdma_destroy_qp(ia->ri_id);
 615                rdma_destroy_id(ia->ri_id);
 616                ia->ri_id = NULL;
 617        }
 618        if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
 619                rc = ib_dealloc_pd(ia->ri_pd);
 620                dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
 621                        __func__, rc);
 622        }
 623}
 624
 625/*
 626 * Create unconnected endpoint.
 627 */
 628int
 629rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 630                                struct rpcrdma_create_data_internal *cdata)
 631{
 632        struct ib_device_attr devattr;
 633        int rc, err;
 634
 635        rc = ib_query_device(ia->ri_id->device, &devattr);
 636        if (rc) {
 637                dprintk("RPC:       %s: ib_query_device failed %d\n",
 638                        __func__, rc);
 639                return rc;
 640        }
 641
 642        /* check provider's send/recv wr limits */
 643        if (cdata->max_requests > devattr.max_qp_wr)
 644                cdata->max_requests = devattr.max_qp_wr;
 645
 646        ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
 647        ep->rep_attr.qp_context = ep;
 648        /* send_cq and recv_cq initialized below */
 649        ep->rep_attr.srq = NULL;
 650        ep->rep_attr.cap.max_send_wr = cdata->max_requests;
 651        switch (ia->ri_memreg_strategy) {
 652        case RPCRDMA_FRMR:
 653                /* Add room for frmr register and invalidate WRs.
 654                 * 1. FRMR reg WR for head
 655                 * 2. FRMR invalidate WR for head
 656                 * 3. FRMR reg WR for pagelist
 657                 * 4. FRMR invalidate WR for pagelist
 658                 * 5. FRMR reg WR for tail
 659                 * 6. FRMR invalidate WR for tail
 660                 * 7. The RDMA_SEND WR
 661                 */
 662                ep->rep_attr.cap.max_send_wr *= 7;
 663                if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
 664                        cdata->max_requests = devattr.max_qp_wr / 7;
 665                        if (!cdata->max_requests)
 666                                return -EINVAL;
 667                        ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
 668                }
 669                break;
 670        case RPCRDMA_MEMWINDOWS_ASYNC:
 671        case RPCRDMA_MEMWINDOWS:
 672                /* Add room for mw_binds+unbinds - overkill! */
 673                ep->rep_attr.cap.max_send_wr++;
 674                ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
 675                if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
 676                        return -EINVAL;
 677                break;
 678        default:
 679                break;
 680        }
 681        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 682        ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
 683        ep->rep_attr.cap.max_recv_sge = 1;
 684        ep->rep_attr.cap.max_inline_data = 0;
 685        ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 686        ep->rep_attr.qp_type = IB_QPT_RC;
 687        ep->rep_attr.port_num = ~0;
 688
 689        dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
 690                "iovs: send %d recv %d\n",
 691                __func__,
 692                ep->rep_attr.cap.max_send_wr,
 693                ep->rep_attr.cap.max_recv_wr,
 694                ep->rep_attr.cap.max_send_sge,
 695                ep->rep_attr.cap.max_recv_sge);
 696
 697        /* set trigger for requesting send completion */
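             /* Sends are normally posted unsignaled; a completion is requested
              * roughly every rep_cqinit posts (see INIT_CQCOUNT/DECR_CQCOUNT in
              * xprt_rdma.h) so the send queue can be reclaimed without taking
              * an upcall for every send. */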
 698        ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
 699        switch (ia->ri_memreg_strategy) {
 700        case RPCRDMA_MEMWINDOWS_ASYNC:
 701        case RPCRDMA_MEMWINDOWS:
 702                ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
 703                break;
 704        default:
 705                break;
 706        }
 707        if (ep->rep_cqinit <= 2)
 708                ep->rep_cqinit = 0;
 709        INIT_CQCOUNT(ep);
 710        ep->rep_ia = ia;
 711        init_waitqueue_head(&ep->rep_connect_wait);
 712
 713        /*
 714         * Create a single cq for receive dto and mw_bind (only ever
 715         * care about unbind, really). Send completions are suppressed.
 716         * Use single threaded tasklet upcalls to maintain ordering.
 717         */
 718        ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
 719                                  rpcrdma_cq_async_error_upcall, NULL,
 720                                  ep->rep_attr.cap.max_recv_wr +
 721                                  ep->rep_attr.cap.max_send_wr + 1, 0);
 722        if (IS_ERR(ep->rep_cq)) {
 723                rc = PTR_ERR(ep->rep_cq);
 724                dprintk("RPC:       %s: ib_create_cq failed: %i\n",
 725                        __func__, rc);
 726                goto out1;
 727        }
 728
 729        rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
 730        if (rc) {
 731                dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
 732                        __func__, rc);
 733                goto out2;
 734        }
 735
 736        ep->rep_attr.send_cq = ep->rep_cq;
 737        ep->rep_attr.recv_cq = ep->rep_cq;
 738
 739        /* Initialize cma parameters */
 740
 741        /* RPC/RDMA does not use private data */
 742        ep->rep_remote_cma.private_data = NULL;
 743        ep->rep_remote_cma.private_data_len = 0;
 744
 745        /* Client offers RDMA Read but does not initiate */
 746        ep->rep_remote_cma.initiator_depth = 0;
 747        if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
 748                ep->rep_remote_cma.responder_resources = 0;
 749        else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
 750                ep->rep_remote_cma.responder_resources = 32;
 751        else
 752                ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
 753
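             /* retry_count is a 3-bit field in the CM REQ, so 7 is the largest
              * retry count that can be requested. */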
 754        ep->rep_remote_cma.retry_count = 7;
 755        ep->rep_remote_cma.flow_control = 0;
 756        ep->rep_remote_cma.rnr_retry_count = 0;
 757
 758        return 0;
 759
 760out2:
 761        err = ib_destroy_cq(ep->rep_cq);
 762        if (err)
 763                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 764                        __func__, err);
 765out1:
 766        return rc;
 767}
 768
 769/*
 770 * rpcrdma_ep_destroy
 771 *
 772 * Disconnect and destroy endpoint. After this, the only
 773 * valid operations on the ep are to free it (if dynamically
 774 * allocated) or re-create it.
 775 *
 776 * The caller's error handling must be sure to not leak the endpoint
 777 * if this function fails.
 778 */
 779int
 780rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 781{
 782        int rc;
 783
 784        dprintk("RPC:       %s: entering, connected is %d\n",
 785                __func__, ep->rep_connected);
 786
 787        if (ia->ri_id->qp) {
 788                rc = rpcrdma_ep_disconnect(ep, ia);
 789                if (rc)
 790                        dprintk("RPC:       %s: rpcrdma_ep_disconnect"
 791                                " returned %i\n", __func__, rc);
 792                rdma_destroy_qp(ia->ri_id);
 793                ia->ri_id->qp = NULL;
 794        }
 795
 796        /* padding - could be done in rpcrdma_buffer_destroy... */
 797        if (ep->rep_pad_mr) {
 798                rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
 799                ep->rep_pad_mr = NULL;
 800        }
 801
 802        rpcrdma_clean_cq(ep->rep_cq);
 803        rc = ib_destroy_cq(ep->rep_cq);
 804        if (rc)
 805                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
 806                        __func__, rc);
 807
 808        return rc;
 809}
 810
 811/*
 812 * Connect unconnected endpoint.
 813 */
 814int
 815rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 816{
 817        struct rdma_cm_id *id;
 818        int rc = 0;
 819        int retry_count = 0;
 820
 821        if (ep->rep_connected != 0) {
 822                struct rpcrdma_xprt *xprt;
 823retry:
 824                rc = rpcrdma_ep_disconnect(ep, ia);
 825                if (rc && rc != -ENOTCONN)
 826                        dprintk("RPC:       %s: rpcrdma_ep_disconnect"
 827                                " status %i\n", __func__, rc);
 828                rpcrdma_clean_cq(ep->rep_cq);
 829
 830                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
 831                id = rpcrdma_create_id(xprt, ia,
 832                                (struct sockaddr *)&xprt->rx_data.addr);
 833                if (IS_ERR(id)) {
 834                        rc = PTR_ERR(id);
 835                        goto out;
 836                }
 837                /* TEMP TEMP TEMP - fail if new device:
 838                 * Deregister/remarshal *all* requests!
 839                 * Close and recreate adapter, pd, etc!
 840                 * Re-determine all attributes still sane!
 841                 * More stuff I haven't thought of!
 842                 * Rrrgh!
 843                 */
 844                if (ia->ri_id->device != id->device) {
 845                        printk("RPC:       %s: can't reconnect on "
 846                                "different device!\n", __func__);
 847                        rdma_destroy_id(id);
 848                        rc = -ENETDOWN;
 849                        goto out;
 850                }
 851                /* END TEMP */
 852                rdma_destroy_qp(ia->ri_id);
 853                rdma_destroy_id(ia->ri_id);
 854                ia->ri_id = id;
 855        }
 856
 857        rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
 858        if (rc) {
 859                dprintk("RPC:       %s: rdma_create_qp failed %i\n",
 860                        __func__, rc);
 861                goto out;
 862        }
 863
  864        /* XXX Tavor device performs badly with 2K MTU! */
  865        if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
  866                struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
  867                if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
  868                    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
  869                     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
  870                        struct ib_qp_attr attr = {
  871                                .path_mtu = IB_MTU_1024
  872                        };
  873                        rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
  874                }
  875        }
 876
 877        ep->rep_connected = 0;
 878
 879        rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
 880        if (rc) {
 881                dprintk("RPC:       %s: rdma_connect() failed with %i\n",
 882                                __func__, rc);
 883                goto out;
 884        }
 885
 886        wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
 887
 888        /*
 889         * Check state. A non-peer reject indicates no listener
 890         * (ECONNREFUSED), which may be a transient state. All
 891         * others indicate a transport condition which has already
  892         * been given best-effort handling.
 893         */
 894        if (ep->rep_connected == -ECONNREFUSED &&
 895            ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
 896                dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
 897                goto retry;
 898        }
 899        if (ep->rep_connected <= 0) {
 900                /* Sometimes, the only way to reliably connect to remote
  901                 * CMs is to use the same nonzero values for ORD and IRD. */
 902                if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
 903                    (ep->rep_remote_cma.responder_resources == 0 ||
 904                     ep->rep_remote_cma.initiator_depth !=
 905                                ep->rep_remote_cma.responder_resources)) {
 906                        if (ep->rep_remote_cma.responder_resources == 0)
 907                                ep->rep_remote_cma.responder_resources = 1;
 908                        ep->rep_remote_cma.initiator_depth =
 909                                ep->rep_remote_cma.responder_resources;
 910                        goto retry;
 911                }
 912                rc = ep->rep_connected;
 913        } else {
 914                dprintk("RPC:       %s: connected\n", __func__);
 915        }
 916
 917out:
 918        if (rc)
 919                ep->rep_connected = rc;
 920        return rc;
 921}
 922
 923/*
 924 * rpcrdma_ep_disconnect
 925 *
 926 * This is separate from destroy to facilitate the ability
 927 * to reconnect without recreating the endpoint.
 928 *
 929 * This call is not reentrant, and must not be made in parallel
 930 * on the same endpoint.
 931 */
 932int
 933rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 934{
 935        int rc;
 936
 937        rpcrdma_clean_cq(ep->rep_cq);
 938        rc = rdma_disconnect(ia->ri_id);
 939        if (!rc) {
 940                /* returns without wait if not connected */
 941                wait_event_interruptible(ep->rep_connect_wait,
 942                                                        ep->rep_connected != 1);
 943                dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
 944                        (ep->rep_connected == 1) ? "still " : "dis");
 945        } else {
 946                dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
 947                ep->rep_connected = rc;
 948        }
 949        return rc;
 950}
 951
 952/*
 953 * Initialize buffer memory
 954 */
 955int
 956rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
 957        struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
 958{
 959        char *p;
 960        size_t len;
 961        int i, rc;
 962        struct rpcrdma_mw *r;
 963
 964        buf->rb_max_requests = cdata->max_requests;
 965        spin_lock_init(&buf->rb_lock);
 966        atomic_set(&buf->rb_credits, 1);
 967
 968        /* Need to allocate:
 969         *   1.  arrays for send and recv pointers
 970         *   2.  arrays of struct rpcrdma_req to fill in pointers
 971         *   3.  array of struct rpcrdma_rep for replies
 972         *   4.  padding, if any
 973         *   5.  mw's, fmr's or frmr's, if any
 974         * Send/recv buffers in req/rep need to be registered
 975         */
 976
 977        len = buf->rb_max_requests *
 978                (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
 979        len += cdata->padding;
 980        switch (ia->ri_memreg_strategy) {
 981        case RPCRDMA_FRMR:
 982                len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
 983                                sizeof(struct rpcrdma_mw);
 984                break;
 985        case RPCRDMA_MTHCAFMR:
 986                /* TBD we are perhaps overallocating here */
 987                len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
 988                                sizeof(struct rpcrdma_mw);
 989                break;
 990        case RPCRDMA_MEMWINDOWS_ASYNC:
 991        case RPCRDMA_MEMWINDOWS:
 992                len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
 993                                sizeof(struct rpcrdma_mw);
 994                break;
 995        default:
 996                break;
 997        }
 998
 999        /* allocate 1, 4 and 5 in one shot */
1000        p = kzalloc(len, GFP_KERNEL);
1001        if (p == NULL) {
1002                dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1003                        __func__, len);
1004                rc = -ENOMEM;
1005                goto out;
1006        }
1007        buf->rb_pool = p;       /* for freeing it later */
1008
1009        buf->rb_send_bufs = (struct rpcrdma_req **) p;
1010        p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1011        buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1012        p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1013
1014        /*
1015         * Register the zeroed pad buffer, if any.
1016         */
1017        if (cdata->padding) {
1018                rc = rpcrdma_register_internal(ia, p, cdata->padding,
1019                                            &ep->rep_pad_mr, &ep->rep_pad);
1020                if (rc)
1021                        goto out;
1022        }
1023        p += cdata->padding;
1024
1025        /*
1026         * Allocate the fmr's, or mw's for mw_bind chunk registration.
1027         * We "cycle" the mw's in order to minimize rkey reuse,
1028         * and also reduce unbind-to-bind collision.
1029         */
1030        INIT_LIST_HEAD(&buf->rb_mws);
1031        r = (struct rpcrdma_mw *)p;
1032        switch (ia->ri_memreg_strategy) {
1033        case RPCRDMA_FRMR:
1034                for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1035                        r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1036                                                         RPCRDMA_MAX_SEGS);
1037                        if (IS_ERR(r->r.frmr.fr_mr)) {
1038                                rc = PTR_ERR(r->r.frmr.fr_mr);
1039                                dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
1040                                        " failed %i\n", __func__, rc);
1041                                goto out;
1042                        }
1043                        r->r.frmr.fr_pgl =
1044                                ib_alloc_fast_reg_page_list(ia->ri_id->device,
1045                                                            RPCRDMA_MAX_SEGS);
1046                        if (IS_ERR(r->r.frmr.fr_pgl)) {
1047                                rc = PTR_ERR(r->r.frmr.fr_pgl);
1048                                dprintk("RPC:       %s: "
1049                                        "ib_alloc_fast_reg_page_list "
1050                                        "failed %i\n", __func__, rc);
1051                                goto out;
1052                        }
1053                        list_add(&r->mw_list, &buf->rb_mws);
1054                        ++r;
1055                }
1056                break;
1057        case RPCRDMA_MTHCAFMR:
1058                /* TBD we are perhaps overallocating here */
1059                for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1060                        static struct ib_fmr_attr fa =
1061                                { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1062                        r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1063                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1064                                &fa);
1065                        if (IS_ERR(r->r.fmr)) {
1066                                rc = PTR_ERR(r->r.fmr);
1067                                dprintk("RPC:       %s: ib_alloc_fmr"
1068                                        " failed %i\n", __func__, rc);
1069                                goto out;
1070                        }
1071                        list_add(&r->mw_list, &buf->rb_mws);
1072                        ++r;
1073                }
1074                break;
1075        case RPCRDMA_MEMWINDOWS_ASYNC:
1076        case RPCRDMA_MEMWINDOWS:
1077                /* Allocate one extra request's worth, for full cycling */
1078                for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1079                        r->r.mw = ib_alloc_mw(ia->ri_pd);
1080                        if (IS_ERR(r->r.mw)) {
1081                                rc = PTR_ERR(r->r.mw);
1082                                dprintk("RPC:       %s: ib_alloc_mw"
1083                                        " failed %i\n", __func__, rc);
1084                                goto out;
1085                        }
1086                        list_add(&r->mw_list, &buf->rb_mws);
1087                        ++r;
1088                }
1089                break;
1090        default:
1091                break;
1092        }
1093
1094        /*
1095         * Allocate/init the request/reply buffers. Doing this
1096         * using kmalloc for now -- one for each buf.
1097         */
1098        for (i = 0; i < buf->rb_max_requests; i++) {
1099                struct rpcrdma_req *req;
1100                struct rpcrdma_rep *rep;
1101
1102                len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1103                /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1104                /* Typical ~2400b, so rounding up saves work later */
1105                if (len < 4096)
1106                        len = 4096;
1107                req = kmalloc(len, GFP_KERNEL);
1108                if (req == NULL) {
1109                        dprintk("RPC:       %s: request buffer %d alloc"
1110                                " failed\n", __func__, i);
1111                        rc = -ENOMEM;
1112                        goto out;
1113                }
1114                memset(req, 0, sizeof(struct rpcrdma_req));
1115                buf->rb_send_bufs[i] = req;
1116                buf->rb_send_bufs[i]->rl_buffer = buf;
1117
1118                rc = rpcrdma_register_internal(ia, req->rl_base,
1119                                len - offsetof(struct rpcrdma_req, rl_base),
1120                                &buf->rb_send_bufs[i]->rl_handle,
1121                                &buf->rb_send_bufs[i]->rl_iov);
1122                if (rc)
1123                        goto out;
1124
1125                buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1126
1127                len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1128                rep = kmalloc(len, GFP_KERNEL);
1129                if (rep == NULL) {
1130                        dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1131                                __func__, i);
1132                        rc = -ENOMEM;
1133                        goto out;
1134                }
1135                memset(rep, 0, sizeof(struct rpcrdma_rep));
1136                buf->rb_recv_bufs[i] = rep;
1137                buf->rb_recv_bufs[i]->rr_buffer = buf;
1138                init_waitqueue_head(&rep->rr_unbind);
1139
1140                rc = rpcrdma_register_internal(ia, rep->rr_base,
1141                                len - offsetof(struct rpcrdma_rep, rr_base),
1142                                &buf->rb_recv_bufs[i]->rr_handle,
1143                                &buf->rb_recv_bufs[i]->rr_iov);
1144                if (rc)
1145                        goto out;
1146
1147        }
1148        dprintk("RPC:       %s: max_requests %d\n",
1149                __func__, buf->rb_max_requests);
1150        /* done */
1151        return 0;
1152out:
1153        rpcrdma_buffer_destroy(buf);
1154        return rc;
1155}
1156
1157/*
1158 * Unregister and destroy buffer memory. Need to deal with
1159 * partial initialization, so it's callable from failed create.
1160 * Must be called before destroying endpoint, as registrations
1161 * reference it.
1162 */
1163void
1164rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1165{
1166        int rc, i;
1167        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1168        struct rpcrdma_mw *r;
1169
1170        /* clean up in reverse order from create
1171         *   1.  recv mr memory (mr free, then kfree)
1172         *   1a. bind mw memory
1173         *   2.  send mr memory (mr free, then kfree)
1174         *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1175         *   4.  arrays
1176         */
1177        dprintk("RPC:       %s: entering\n", __func__);
1178
1179        for (i = 0; i < buf->rb_max_requests; i++) {
1180                if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1181                        rpcrdma_deregister_internal(ia,
1182                                        buf->rb_recv_bufs[i]->rr_handle,
1183                                        &buf->rb_recv_bufs[i]->rr_iov);
1184                        kfree(buf->rb_recv_bufs[i]);
1185                }
1186                if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1187                        while (!list_empty(&buf->rb_mws)) {
1188                                r = list_entry(buf->rb_mws.next,
1189                                        struct rpcrdma_mw, mw_list);
1190                                list_del(&r->mw_list);
1191                                switch (ia->ri_memreg_strategy) {
1192                                case RPCRDMA_FRMR:
1193                                        rc = ib_dereg_mr(r->r.frmr.fr_mr);
1194                                        if (rc)
1195                                                dprintk("RPC:       %s:"
1196                                                        " ib_dereg_mr"
1197                                                        " failed %i\n",
1198                                                        __func__, rc);
1199                                        ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1200                                        break;
1201                                case RPCRDMA_MTHCAFMR:
1202                                        rc = ib_dealloc_fmr(r->r.fmr);
1203                                        if (rc)
1204                                                dprintk("RPC:       %s:"
1205                                                        " ib_dealloc_fmr"
1206                                                        " failed %i\n",
1207                                                        __func__, rc);
1208                                        break;
1209                                case RPCRDMA_MEMWINDOWS_ASYNC:
1210                                case RPCRDMA_MEMWINDOWS:
1211                                        rc = ib_dealloc_mw(r->r.mw);
1212                                        if (rc)
1213                                                dprintk("RPC:       %s:"
1214                                                        " ib_dealloc_mw"
1215                                                        " failed %i\n",
1216                                                        __func__, rc);
1217                                        break;
1218                                default:
1219                                        break;
1220                                }
1221                        }
1222                        rpcrdma_deregister_internal(ia,
1223                                        buf->rb_send_bufs[i]->rl_handle,
1224                                        &buf->rb_send_bufs[i]->rl_iov);
1225                        kfree(buf->rb_send_bufs[i]);
1226                }
1227        }
1228
1229        kfree(buf->rb_pool);
1230}
1231
1232/*
1233 * Get a set of request/reply buffers.
1234 *
1235 * Reply buffer (if needed) is attached to send buffer upon return.
1236 * Rule:
1237 *    rb_send_index and rb_recv_index MUST always be pointing to the
1238 *    *next* available buffer (non-NULL). They are incremented after
1239 *    removing buffers, and decremented *before* returning them.
1240 */
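     /* Example: with both indices at 0, rpcrdma_buffer_get() hands out
      * send slot 0, attaches recv slot 0, and advances both indices;
      * rpcrdma_buffer_put() pre-decrements them and restores the slots. */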
1241struct rpcrdma_req *
1242rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1243{
1244        struct rpcrdma_req *req;
1245        unsigned long flags;
1246        int i;
1247        struct rpcrdma_mw *r;
1248
1249        spin_lock_irqsave(&buffers->rb_lock, flags);
1250        if (buffers->rb_send_index == buffers->rb_max_requests) {
1251                spin_unlock_irqrestore(&buffers->rb_lock, flags);
1252                dprintk("RPC:       %s: out of request buffers\n", __func__);
1253                return ((struct rpcrdma_req *)NULL);
1254        }
1255
1256        req = buffers->rb_send_bufs[buffers->rb_send_index];
1257        if (buffers->rb_send_index < buffers->rb_recv_index) {
1258                dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1259                        __func__,
1260                        buffers->rb_recv_index - buffers->rb_send_index);
1261                req->rl_reply = NULL;
1262        } else {
1263                req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1264                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1265        }
1266        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1267        if (!list_empty(&buffers->rb_mws)) {
1268                i = RPCRDMA_MAX_SEGS - 1;
1269                do {
1270                        r = list_entry(buffers->rb_mws.next,
1271                                        struct rpcrdma_mw, mw_list);
1272                        list_del(&r->mw_list);
1273                        req->rl_segments[i].mr_chunk.rl_mw = r;
1274                } while (--i >= 0);
1275        }
1276        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1277        return req;
1278}
1279
1280/*
1281 * Put request/reply buffers back into pool.
1282 * Pre-decrement counter/array index.
1283 */
1284void
1285rpcrdma_buffer_put(struct rpcrdma_req *req)
1286{
1287        struct rpcrdma_buffer *buffers = req->rl_buffer;
1288        struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1289        int i;
1290        unsigned long flags;
1291
1292        BUG_ON(req->rl_nchunks != 0);
1293        spin_lock_irqsave(&buffers->rb_lock, flags);
1294        buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1295        req->rl_niovs = 0;
1296        if (req->rl_reply) {
1297                buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1298                init_waitqueue_head(&req->rl_reply->rr_unbind);
1299                req->rl_reply->rr_func = NULL;
1300                req->rl_reply = NULL;
1301        }
1302        switch (ia->ri_memreg_strategy) {
1303        case RPCRDMA_FRMR:
1304        case RPCRDMA_MTHCAFMR:
1305        case RPCRDMA_MEMWINDOWS_ASYNC:
1306        case RPCRDMA_MEMWINDOWS:
1307                /*
1308                 * Cycle mw's back in reverse order, and "spin" them.
1309                 * This delays and scrambles reuse as much as possible.
1310                 */
1311                i = 1;
1312                do {
1313                        struct rpcrdma_mw **mw;
1314                        mw = &req->rl_segments[i].mr_chunk.rl_mw;
1315                        list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1316                        *mw = NULL;
1317                } while (++i < RPCRDMA_MAX_SEGS);
1318                list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1319                                        &buffers->rb_mws);
1320                req->rl_segments[0].mr_chunk.rl_mw = NULL;
1321                break;
1322        default:
1323                break;
1324        }
1325        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1326}
1327
1328/*
1329 * Recover reply buffers from pool.
1330 * This happens when recovering from error conditions.
1331 * Post-increment counter/array index.
1332 */
1333void
1334rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1335{
1336        struct rpcrdma_buffer *buffers = req->rl_buffer;
1337        unsigned long flags;
1338
1339        if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1340                buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1341        spin_lock_irqsave(&buffers->rb_lock, flags);
1342        if (buffers->rb_recv_index < buffers->rb_max_requests) {
1343                req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1344                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1345        }
1346        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1347}
1348
1349/*
1350 * Put reply buffers back into pool when not attached to
1351 * request. This happens in error conditions, and when
1352 * aborting unbinds. Pre-decrement counter/array index.
1353 */
1354void
1355rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1356{
1357        struct rpcrdma_buffer *buffers = rep->rr_buffer;
1358        unsigned long flags;
1359
1360        rep->rr_func = NULL;
1361        spin_lock_irqsave(&buffers->rb_lock, flags);
1362        buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1363        spin_unlock_irqrestore(&buffers->rb_lock, flags);
1364}
1365
1366/*
1367 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1368 */
1369
1370int
1371rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1372                                struct ib_mr **mrp, struct ib_sge *iov)
1373{
1374        struct ib_phys_buf ipb;
1375        struct ib_mr *mr;
1376        int rc;
1377
1378        /*
1379         * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1380         */
1381        iov->addr = ib_dma_map_single(ia->ri_id->device,
1382                        va, len, DMA_BIDIRECTIONAL);
1383        iov->length = len;
1384
1385        if (ia->ri_have_dma_lkey) {
1386                *mrp = NULL;
1387                iov->lkey = ia->ri_dma_lkey;
1388                return 0;
1389        } else if (ia->ri_bind_mem != NULL) {
1390                *mrp = NULL;
1391                iov->lkey = ia->ri_bind_mem->lkey;
1392                return 0;
1393        }
1394
1395        ipb.addr = iov->addr;
1396        ipb.size = iov->length;
1397        mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1398                        IB_ACCESS_LOCAL_WRITE, &iov->addr);
1399
1400        dprintk("RPC:       %s: phys convert: 0x%llx "
1401                        "registered 0x%llx length %d\n",
1402                        __func__, (unsigned long long)ipb.addr,
1403                        (unsigned long long)iov->addr, len);
1404
1405        if (IS_ERR(mr)) {
1406                *mrp = NULL;
1407                rc = PTR_ERR(mr);
1408                dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1409        } else {
1410                *mrp = mr;
1411                iov->lkey = mr->lkey;
1412                rc = 0;
1413        }
1414
1415        return rc;
1416}
1417
1418int
1419rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1420                                struct ib_mr *mr, struct ib_sge *iov)
1421{
1422        int rc;
1423
1424        ib_dma_unmap_single(ia->ri_id->device,
1425                        iov->addr, iov->length, DMA_BIDIRECTIONAL);
1426
1427        if (mr == NULL)
1428                return 0;
1429
1430        rc = ib_dereg_mr(mr);
1431        if (rc)
1432                dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1433        return rc;
1434}
1435
1436/*
1437 * Wrappers for chunk registration, shared by read/write chunk code.
1438 */
1439
1440static void
1441rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1442{
1443        seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1444        seg->mr_dmalen = seg->mr_len;
1445        if (seg->mr_page)
1446                seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1447                                seg->mr_page, offset_in_page(seg->mr_offset),
1448                                seg->mr_dmalen, seg->mr_dir);
1449        else
1450                seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1451                                seg->mr_offset,
1452                                seg->mr_dmalen, seg->mr_dir);
1453}
1454
1455static void
1456rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1457{
1458        if (seg->mr_page)
1459                ib_dma_unmap_page(ia->ri_id->device,
1460                                seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1461        else
1462                ib_dma_unmap_single(ia->ri_id->device,
1463                                seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1464}
1465
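/*
 * Register an external chunk with a fast-register work request (FRMR).
 * Maps up to RPCRDMA_MAX_DATA_SEGS segments, stopping early at any
 * page-alignment hole, bumps the MR's key, and posts an unsignaled
 * IB_WR_FAST_REG_MR on the connection's QP.  On success the first
 * segment carries the rkey, base and total length for the chunk.
 */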
1466static int
1467rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1468                        int *nsegs, int writing, struct rpcrdma_ia *ia,
1469                        struct rpcrdma_xprt *r_xprt)
1470{
1471        struct rpcrdma_mr_seg *seg1 = seg;
1472        struct ib_send_wr frmr_wr, *bad_wr;
1473        u8 key;
1474        int len, pageoff;
1475        int i, rc;
1476
1477        pageoff = offset_in_page(seg1->mr_offset);
1478        seg1->mr_offset -= pageoff;     /* start of page */
1479        seg1->mr_len += pageoff;
1480        len = -pageoff;
1481        if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1482                *nsegs = RPCRDMA_MAX_DATA_SEGS;
1483        for (i = 0; i < *nsegs;) {
1484                rpcrdma_map_one(ia, seg, writing);
1485                seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1486                len += seg->mr_len;
1487                ++seg;
1488                ++i;
1489                /* Check for holes */
1490                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1491                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1492                        break;
1493        }
1494        dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
1495                __func__, seg1->mr_chunk.rl_mw, i);
1496
1497        /* Bump the key */
1498        key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1499        ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1500
1501        /* Prepare FRMR WR */
1502        memset(&frmr_wr, 0, sizeof frmr_wr);
1503        frmr_wr.opcode = IB_WR_FAST_REG_MR;
1504        frmr_wr.send_flags = 0;                 /* unsignaled */
1505        frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1506        frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1507        frmr_wr.wr.fast_reg.page_list_len = i;
1508        frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1509        frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1510        frmr_wr.wr.fast_reg.access_flags = (writing ?
1511                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1512                                IB_ACCESS_REMOTE_READ);
1513        frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1514        DECR_CQCOUNT(&r_xprt->rx_ep);
1515
1516        rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1517
1518        if (rc) {
1519                dprintk("RPC:       %s: failed ib_post_send for register,"
1520                        " status %i\n", __func__, rc);
1521                while (i--)
1522                        rpcrdma_unmap_one(ia, --seg);
1523        } else {
1524                seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1525                seg1->mr_base = seg1->mr_dma + pageoff;
1526                seg1->mr_nsegs = i;
1527                seg1->mr_len = len;
1528        }
1529        *nsegs = i;
1530        return rc;
1531}
1532
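/*
 * Invalidate an FRMR-registered chunk: DMA-unmap each constituent
 * segment, then post an unsignaled IB_WR_LOCAL_INV for the MR's rkey.
 */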
1533static int
1534rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1535                        struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1536{
1537        struct rpcrdma_mr_seg *seg1 = seg;
1538        struct ib_send_wr invalidate_wr, *bad_wr;
1539        int rc;
1540
1541        while (seg1->mr_nsegs--)
1542                rpcrdma_unmap_one(ia, seg++);
1543
1544        memset(&invalidate_wr, 0, sizeof invalidate_wr);
1545        invalidate_wr.opcode = IB_WR_LOCAL_INV;
1546        invalidate_wr.send_flags = 0;                   /* unsignaled */
1547        invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1548        DECR_CQCOUNT(&r_xprt->rx_ep);
1549
1550        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1551        if (rc)
1552                dprintk("RPC:       %s: failed ib_post_send for invalidate,"
1553                        " status %i\n", __func__, rc);
1554        return rc;
1555}
1556
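/*
 * Register an external chunk via an FMR: gather the DMA addresses of up
 * to RPCRDMA_MAX_DATA_SEGS segments (stopping early at page-alignment
 * holes) and map them with ib_map_phys_fmr().
 */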
1557static int
1558rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1559                        int *nsegs, int writing, struct rpcrdma_ia *ia)
1560{
1561        struct rpcrdma_mr_seg *seg1 = seg;
1562        u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1563        int len, pageoff, i, rc;
1564
1565        pageoff = offset_in_page(seg1->mr_offset);
1566        seg1->mr_offset -= pageoff;     /* start of page */
1567        seg1->mr_len += pageoff;
1568        len = -pageoff;
1569        if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1570                *nsegs = RPCRDMA_MAX_DATA_SEGS;
1571        for (i = 0; i < *nsegs;) {
1572                rpcrdma_map_one(ia, seg, writing);
1573                physaddrs[i] = seg->mr_dma;
1574                len += seg->mr_len;
1575                ++seg;
1576                ++i;
1577                /* Check for holes */
1578                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1579                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1580                        break;
1581        }
1582        rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1583                                physaddrs, i, seg1->mr_dma);
1584        if (rc) {
1585                dprintk("RPC:       %s: failed ib_map_phys_fmr "
1586                        "%u@0x%llx+%i (%d)... status %i\n", __func__,
1587                        len, (unsigned long long)seg1->mr_dma,
1588                        pageoff, i, rc);
1589                while (i--)
1590                        rpcrdma_unmap_one(ia, --seg);
1591        } else {
1592                seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1593                seg1->mr_base = seg1->mr_dma + pageoff;
1594                seg1->mr_nsegs = i;
1595                seg1->mr_len = len;
1596        }
1597        *nsegs = i;
1598        return rc;
1599}
1600
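/*
 * Tear down an FMR-registered chunk: unmap the FMR with ib_unmap_fmr(),
 * then DMA-unmap each constituent segment.
 */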
1601static int
1602rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1603                        struct rpcrdma_ia *ia)
1604{
1605        struct rpcrdma_mr_seg *seg1 = seg;
1606        LIST_HEAD(l);
1607        int rc;
1608
1609        list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1610        rc = ib_unmap_fmr(&l);
1611        while (seg1->mr_nsegs--)
1612                rpcrdma_unmap_one(ia, seg++);
1613        if (rc)
1614                dprintk("RPC:       %s: failed ib_unmap_fmr,"
1615                        " status %i\n", __func__, rc);
1616        return rc;
1617}
1618
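/*
 * Register an external chunk by binding a single segment to a memory
 * window with ib_bind_mw().  Only one segment per chunk is supported
 * with this strategy.
 */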
1619static int
1620rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1621                        int *nsegs, int writing, struct rpcrdma_ia *ia,
1622                        struct rpcrdma_xprt *r_xprt)
1623{
1624        int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1625                                  IB_ACCESS_REMOTE_READ);
1626        struct ib_mw_bind param;
1627        int rc;
1628
1629        *nsegs = 1;
1630        rpcrdma_map_one(ia, seg, writing);
1631        param.mr = ia->ri_bind_mem;
1632        param.wr_id = 0ULL;     /* no send cookie */
1633        param.addr = seg->mr_dma;
1634        param.length = seg->mr_len;
1635        param.send_flags = 0;
1636        param.mw_access_flags = mem_priv;
1637
1638        DECR_CQCOUNT(&r_xprt->rx_ep);
1639        rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1640        if (rc) {
1641                dprintk("RPC:       %s: failed ib_bind_mw "
1642                        "%u@0x%llx status %i\n",
1643                        __func__, seg->mr_len,
1644                        (unsigned long long)seg->mr_dma, rc);
1645                rpcrdma_unmap_one(ia, seg);
1646        } else {
1647                seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1648                seg->mr_base = param.addr;
1649                seg->mr_nsegs = 1;
1650        }
1651        return rc;
1652}
1653
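/*
 * Unbind a memory window (a zero-length ib_bind_mw()).  When a reply
 * context is supplied via *r the unbind is posted signaled, and the
 * reply's callback is deferred to the send completion; otherwise the
 * unbind is unsignaled.
 */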
1654static int
1655rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1656                        struct rpcrdma_ia *ia,
1657                        struct rpcrdma_xprt *r_xprt, void **r)
1658{
1659        struct ib_mw_bind param;
1660        LIST_HEAD(l);
1661        int rc;
1662
1663        BUG_ON(seg->mr_nsegs != 1);
1664        param.mr = ia->ri_bind_mem;
1665        param.addr = 0ULL;      /* unbind */
1666        param.length = 0;
1667        param.mw_access_flags = 0;
1668        if (*r) {
1669                param.wr_id = (u64) (unsigned long) *r;
1670                param.send_flags = IB_SEND_SIGNALED;
1671                INIT_CQCOUNT(&r_xprt->rx_ep);
1672        } else {
1673                param.wr_id = 0ULL;
1674                param.send_flags = 0;
1675                DECR_CQCOUNT(&r_xprt->rx_ep);
1676        }
1677        rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1678        rpcrdma_unmap_one(ia, seg);
1679        if (rc)
1680                dprintk("RPC:       %s: failed ib_(un)bind_mw,"
1681                        " status %i\n", __func__, rc);
1682        else
1683                *r = NULL;      /* will upcall on completion */
1684        return rc;
1685}
1686
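/*
 * Default registration strategy: build a physical buffer list for the
 * segments and register it with ib_reg_phys_mr() on each call.
 */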
1687static int
1688rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1689                        int *nsegs, int writing, struct rpcrdma_ia *ia)
1690{
1691        int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1692                                  IB_ACCESS_REMOTE_READ);
1693        struct rpcrdma_mr_seg *seg1 = seg;
1694        struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1695        int len, i, rc = 0;
1696
1697        if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1698                *nsegs = RPCRDMA_MAX_DATA_SEGS;
1699        for (len = 0, i = 0; i < *nsegs;) {
1700                rpcrdma_map_one(ia, seg, writing);
1701                ipb[i].addr = seg->mr_dma;
1702                ipb[i].size = seg->mr_len;
1703                len += seg->mr_len;
1704                ++seg;
1705                ++i;
1706                /* Check for holes */
1707                if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1708                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1709                        break;
1710        }
1711        seg1->mr_base = seg1->mr_dma;
1712        seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1713                                ipb, i, mem_priv, &seg1->mr_base);
1714        if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1715                rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1716                dprintk("RPC:       %s: failed ib_reg_phys_mr "
1717                        "%u@0x%llx (%d)... status %i\n",
1718                        __func__, len,
1719                        (unsigned long long)seg1->mr_dma, i, rc);
1720                while (i--)
1721                        rpcrdma_unmap_one(ia, --seg);
1722        } else {
1723                seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1724                seg1->mr_nsegs = i;
1725                seg1->mr_len = len;
1726        }
1727        *nsegs = i;
1728        return rc;
1729}
1730
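/*
 * Undo a default-strategy registration: deregister the MR, then
 * DMA-unmap each constituent segment.
 */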
1731static int
1732rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1733                        struct rpcrdma_ia *ia)
1734{
1735        struct rpcrdma_mr_seg *seg1 = seg;
1736        int rc;
1737
1738        rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1739        seg1->mr_chunk.rl_mr = NULL;
1740        while (seg1->mr_nsegs--)
1741                rpcrdma_unmap_one(ia, seg++);
1742        if (rc)
1743                dprintk("RPC:       %s: failed ib_dereg_mr,"
1744                        " status %i\n", __func__, rc);
1745        return rc;
1746}
1747
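/*
 * Register a chunk of memory for RDMA, dispatching on the interface's
 * memory registration strategy.  Returns the number of segments
 * registered, or -1 on failure.
 */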
1748int
1749rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1750                        int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1751{
1752        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1753        int rc = 0;
1754
1755        switch (ia->ri_memreg_strategy) {
1756
1757#if RPCRDMA_PERSISTENT_REGISTRATION
1758        case RPCRDMA_ALLPHYSICAL:
1759                rpcrdma_map_one(ia, seg, writing);
1760                seg->mr_rkey = ia->ri_bind_mem->rkey;
1761                seg->mr_base = seg->mr_dma;
1762                seg->mr_nsegs = 1;
1763                nsegs = 1;
1764                break;
1765#endif
1766
1767        /* Registration using fast-register memory regions (FRMR) */
1768        case RPCRDMA_FRMR:
1769                rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1770                break;
1771
1772        /* Registration using fast memory regions (FMR) */
1773        case RPCRDMA_MTHCAFMR:
1774                rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1775                break;
1776
1777        /* Registration using memory windows */
1778        case RPCRDMA_MEMWINDOWS_ASYNC:
1779        case RPCRDMA_MEMWINDOWS:
1780                rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1781                break;
1782
1783        /* Default registration each time */
1784        default:
1785                rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1786                break;
1787        }
1788        if (rc)
1789                return -1;
1790
1791        return nsegs;
1792}
1793
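/*
 * Deregister (invalidate or unbind) a previously registered chunk,
 * dispatching on the memory registration strategy.  If a reply context
 * was passed in r and was not handed off to a signaled unbind, its
 * rr_func callback is invoked here.  Returns the original segment count.
 */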
1794int
1795rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1796                struct rpcrdma_xprt *r_xprt, void *r)
1797{
1798        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1799        int nsegs = seg->mr_nsegs, rc;
1800
1801        switch (ia->ri_memreg_strategy) {
1802
1803#if RPCRDMA_PERSISTENT_REGISTRATION
1804        case RPCRDMA_ALLPHYSICAL:
1805                BUG_ON(nsegs != 1);
1806                rpcrdma_unmap_one(ia, seg);
1807                rc = 0;
1808                break;
1809#endif
1810
1811        case RPCRDMA_FRMR:
1812                rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1813                break;
1814
1815        case RPCRDMA_MTHCAFMR:
1816                rc = rpcrdma_deregister_fmr_external(seg, ia);
1817                break;
1818
1819        case RPCRDMA_MEMWINDOWS_ASYNC:
1820        case RPCRDMA_MEMWINDOWS:
1821                rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1822                break;
1823
1824        default:
1825                rc = rpcrdma_deregister_default_external(seg, ia);
1826                break;
1827        }
1828        if (r) {
1829                struct rpcrdma_rep *rep = r;
1830                void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1831                rep->rr_func = NULL;
1832                func(rep);      /* dereg done, callback now */
1833        }
1834        return nsegs;
1835}
1836
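/*
 * Illustrative caller sketch (an assumption for clarity -- the real
 * chunk-marshalling and teardown callers are not in this file):
 *
 *	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *	if (n < 0)
 *		return -EIO;
 *	advertise seg->mr_rkey, seg->mr_base and seg->mr_len in the
 *	RPC/RDMA chunk list, and once the reply has been processed:
 *	rpcrdma_deregister_external(seg, r_xprt, NULL);
 */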
1837/*
1838 * Prepost any receive buffer, then post send.
1839 *
1840 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1841 */
1842int
1843rpcrdma_ep_post(struct rpcrdma_ia *ia,
1844                struct rpcrdma_ep *ep,
1845                struct rpcrdma_req *req)
1846{
1847        struct ib_send_wr send_wr, *send_wr_fail;
1848        struct rpcrdma_rep *rep = req->rl_reply;
1849        int rc;
1850
1851        if (rep) {
1852                rc = rpcrdma_ep_post_recv(ia, ep, rep);
1853                if (rc)
1854                        goto out;
1855                req->rl_reply = NULL;
1856        }
1857
1858        send_wr.next = NULL;
1859        send_wr.wr_id = 0ULL;   /* no send cookie */
1860        send_wr.sg_list = req->rl_send_iov;
1861        send_wr.num_sge = req->rl_niovs;
1862        send_wr.opcode = IB_WR_SEND;
1863        if (send_wr.num_sge == 4)       /* no need to sync any pad (constant) */
1864                ib_dma_sync_single_for_device(ia->ri_id->device,
1865                        req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1866                        DMA_TO_DEVICE);
1867        ib_dma_sync_single_for_device(ia->ri_id->device,
1868                req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1869                DMA_TO_DEVICE);
1870        ib_dma_sync_single_for_device(ia->ri_id->device,
1871                req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1872                DMA_TO_DEVICE);
1873
1874        if (DECR_CQCOUNT(ep) > 0)
1875                send_wr.send_flags = 0;
1876        else { /* Provider must take a send completion every now and then */
1877                INIT_CQCOUNT(ep);
1878                send_wr.send_flags = IB_SEND_SIGNALED;
1879        }
1880
1881        rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1882        if (rc)
1883                dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
1884                        rc);
1885out:
1886        return rc;
1887}
1888
1889/*
1890 * (Re)post a receive buffer.
1891 */
1892int
1893rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1894                     struct rpcrdma_ep *ep,
1895                     struct rpcrdma_rep *rep)
1896{
1897        struct ib_recv_wr recv_wr, *recv_wr_fail;
1898        int rc;
1899
1900        recv_wr.next = NULL;
1901        recv_wr.wr_id = (u64) (unsigned long) rep;
1902        recv_wr.sg_list = &rep->rr_iov;
1903        recv_wr.num_sge = 1;
1904
1905        ib_dma_sync_single_for_cpu(ia->ri_id->device,
1906                rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1907
1908        DECR_CQCOUNT(ep);
1909        rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1910
1911        if (rc)
1912                dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
1913                        rc);
1914        return rc;
1915}
1916