linux/net/9p/trans_rdma.c
/*
 * net/9p/trans_rdma.c
 *
 * RDMA transport layer based on the trans_fd.c implementation.
 *
 *  Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
 *  Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
 *  Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
 *  Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
 *  Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2
 *  as published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to:
 *  Free Software Foundation
 *  51 Franklin Street, Fifth Floor
 *  Boston, MA  02111-1301  USA
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/in.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/ipv6.h>
#include <linux/kthread.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/un.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/parser.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#define P9_PORT                 5640
#define P9_RDMA_SQ_DEPTH        32
#define P9_RDMA_RQ_DEPTH        32
#define P9_RDMA_SEND_SGE        4
#define P9_RDMA_RECV_SGE        4
#define P9_RDMA_IRD             0
#define P9_RDMA_ORD             0
#define P9_RDMA_TIMEOUT         30000           /* 30 seconds */
#define P9_RDMA_MAXSIZE         (1024*1024)     /* 1MB */

/**
 * struct p9_trans_rdma - RDMA transport instance
 *
 * @state: tracks the transport state machine for connection setup and tear down
 * @cm_id: The RDMA CM ID
 * @pd: Protection Domain pointer
 * @qp: Queue Pair pointer
 * @cq: Completion Queue pointer
 * @timeout: Number of msecs to wait for connection management events
 * @privport: Whether a privileged port may be used
 * @port: The port to use
 * @sq_depth: The depth of the Send Queue
 * @sq_sem: Semaphore for the SQ
 * @rq_depth: The depth of the Receive Queue.
 * @rq_sem: Semaphore for the RQ
 * @excess_rc: Amount of posted Receive Contexts without a pending request.
 *             See rdma_request()
 * @addr: The remote peer's address
 * @req_lock: Protects the active request list
 * @cm_done: Completion event for connection management tracking
 */
struct p9_trans_rdma {
        enum {
                P9_RDMA_INIT,
                P9_RDMA_ADDR_RESOLVED,
                P9_RDMA_ROUTE_RESOLVED,
                P9_RDMA_CONNECTED,
                P9_RDMA_FLUSHING,
                P9_RDMA_CLOSING,
                P9_RDMA_CLOSED,
        } state;
        struct rdma_cm_id *cm_id;
        struct ib_pd *pd;
        struct ib_qp *qp;
        struct ib_cq *cq;
        long timeout;
        bool privport;
        u16 port;
        int sq_depth;
        struct semaphore sq_sem;
        int rq_depth;
        struct semaphore rq_sem;
        atomic_t excess_rc;
        struct sockaddr_in addr;
        spinlock_t req_lock;

        struct completion cm_done;
};
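
/* Summary added for clarity, derived from p9_cm_event_handler() and the
 * error paths below: a healthy connection walks the state machine forward,
 *
 *   P9_RDMA_INIT -> P9_RDMA_ADDR_RESOLVED -> P9_RDMA_ROUTE_RESOLVED
 *                -> P9_RDMA_CONNECTED
 *
 * while P9_RDMA_FLUSHING is entered on a completion error in recv_done(),
 * P9_RDMA_CLOSING on a local posting failure in rdma_request(), and
 * P9_RDMA_CLOSED on an RDMA_CM_EVENT_DISCONNECTED event.
 */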

/**
 * p9_rdma_context - Keeps track of in-process WR
 *
 * @cqe: completion queue entry
 * @busa: Bus address to unmap when the WR completes
 * @req: Keeps track of requests (send)
 * @rc: Keeps track of replies (receive)
 */
struct p9_rdma_req;
struct p9_rdma_context {
        struct ib_cqe cqe;
        dma_addr_t busa;
        union {
                struct p9_req_t *req;
                struct p9_fcall rc;
        };
};

/**
 * p9_rdma_opts - Collection of mount options
 * @port: port of connection
 * @privport: Whether a privileged port may be used
 * @sq_depth: The requested depth of the SQ. This really doesn't need
 * to be any deeper than the number of threads used in the client
 * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
 * @timeout: Time to wait in msecs for CM events
 */
struct p9_rdma_opts {
        short port;
        bool privport;
        int sq_depth;
        int rq_depth;
        long timeout;
};

/*
 * Option Parsing (code inspired by NFS code)
 */
enum {
        /* Options that take integer arguments */
        Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
        /* Options that take no argument */
        Opt_privport,
        Opt_err,
};

static match_table_t tokens = {
        {Opt_port, "port=%u"},
        {Opt_sq_depth, "sq=%u"},
        {Opt_rq_depth, "rq=%u"},
        {Opt_timeout, "timeout=%u"},
        {Opt_privport, "privport"},
        {Opt_err, NULL},
};

static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
{
        struct p9_trans_rdma *rdma = clnt->trans;

        if (rdma->port != P9_PORT)
                seq_printf(m, ",port=%u", rdma->port);
        if (rdma->sq_depth != P9_RDMA_SQ_DEPTH)
                seq_printf(m, ",sq=%u", rdma->sq_depth);
        if (rdma->rq_depth != P9_RDMA_RQ_DEPTH)
                seq_printf(m, ",rq=%u", rdma->rq_depth);
        if (rdma->timeout != P9_RDMA_TIMEOUT)
                seq_printf(m, ",timeout=%lu", rdma->timeout);
        if (rdma->privport)
                seq_puts(m, ",privport");
        return 0;
}

/**
 * parse_opts - parse mount options into rdma options structure
 * @params: options string passed from mount
 * @opts: rdma transport-specific structure to parse options into
 *
 * Returns 0 upon success, -ERRNO upon failure
 */
static int parse_opts(char *params, struct p9_rdma_opts *opts)
{
        char *p;
        substring_t args[MAX_OPT_ARGS];
        int option;
        char *options, *tmp_options;

        opts->port = P9_PORT;
        opts->sq_depth = P9_RDMA_SQ_DEPTH;
        opts->rq_depth = P9_RDMA_RQ_DEPTH;
        opts->timeout = P9_RDMA_TIMEOUT;
        opts->privport = false;

        if (!params)
                return 0;

        tmp_options = kstrdup(params, GFP_KERNEL);
        if (!tmp_options) {
                p9_debug(P9_DEBUG_ERROR,
                         "failed to allocate copy of option string\n");
                return -ENOMEM;
        }
        options = tmp_options;

        while ((p = strsep(&options, ",")) != NULL) {
                int token;
                int r;

                if (!*p)
                        continue;
                token = match_token(p, tokens, args);
                if ((token != Opt_err) && (token != Opt_privport)) {
                        r = match_int(&args[0], &option);
                        if (r < 0) {
                                p9_debug(P9_DEBUG_ERROR,
                                         "integer field, but no integer?\n");
                                continue;
                        }
                }
                switch (token) {
                case Opt_port:
                        opts->port = option;
                        break;
                case Opt_sq_depth:
                        opts->sq_depth = option;
                        break;
                case Opt_rq_depth:
                        opts->rq_depth = option;
                        break;
                case Opt_timeout:
                        opts->timeout = option;
                        break;
                case Opt_privport:
                        opts->privport = true;
                        break;
                default:
                        continue;
                }
        }
        /* RQ must be at least as large as the SQ */
        opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
        kfree(tmp_options);
        return 0;
}
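
/* Worked example (illustrative, using the defaults defined above): for
 * params = "port=5641,sq=16,privport", parse_opts() leaves timeout at
 * P9_RDMA_TIMEOUT and rq_depth at P9_RDMA_RQ_DEPTH, sets port = 5641,
 * sq_depth = 16 and privport = true, and the final clamp keeps
 * rq_depth = max(32, 16) = 32, so a reply buffer can still be posted
 * for every outstanding request.
 */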

static int
p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
        struct p9_client *c = id->context;
        struct p9_trans_rdma *rdma = c->trans;

        switch (event->event) {
        case RDMA_CM_EVENT_ADDR_RESOLVED:
                BUG_ON(rdma->state != P9_RDMA_INIT);
                rdma->state = P9_RDMA_ADDR_RESOLVED;
                break;

        case RDMA_CM_EVENT_ROUTE_RESOLVED:
                BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
                rdma->state = P9_RDMA_ROUTE_RESOLVED;
                break;

        case RDMA_CM_EVENT_ESTABLISHED:
                BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
                rdma->state = P9_RDMA_CONNECTED;
                break;

        case RDMA_CM_EVENT_DISCONNECTED:
                if (rdma)
                        rdma->state = P9_RDMA_CLOSED;
                c->status = Disconnected;
                break;

        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                break;

        case RDMA_CM_EVENT_ADDR_CHANGE:
        case RDMA_CM_EVENT_ROUTE_ERROR:
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
        case RDMA_CM_EVENT_MULTICAST_JOIN:
        case RDMA_CM_EVENT_MULTICAST_ERROR:
        case RDMA_CM_EVENT_REJECTED:
        case RDMA_CM_EVENT_CONNECT_REQUEST:
        case RDMA_CM_EVENT_CONNECT_RESPONSE:
        case RDMA_CM_EVENT_CONNECT_ERROR:
        case RDMA_CM_EVENT_ADDR_ERROR:
        case RDMA_CM_EVENT_UNREACHABLE:
                c->status = Disconnected;
                rdma_disconnect(rdma->cm_id);
                break;
        default:
                BUG();
        }
        complete(&rdma->cm_done);
        return 0;
}

static void
recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct p9_client *client = cq->cq_context;
        struct p9_trans_rdma *rdma = client->trans;
        struct p9_rdma_context *c =
                container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
        struct p9_req_t *req;
        int err = 0;
        int16_t tag;

        req = NULL;
        ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
                            DMA_FROM_DEVICE);

        if (wc->status != IB_WC_SUCCESS)
                goto err_out;

        c->rc.size = wc->byte_len;
        err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1);
        if (err)
                goto err_out;

        req = p9_tag_lookup(client, tag);
        if (!req)
                goto err_out;

        /* Check that we have not yet received a reply for this request. */
        if (unlikely(req->rc.sdata)) {
                pr_err("Duplicate reply for request %d\n", tag);
                goto err_out;
        }

        req->rc.size = c->rc.size;
        req->rc.sdata = c->rc.sdata;
        p9_client_cb(client, req, REQ_STATUS_RCVD);

 out:
        up(&rdma->rq_sem);
        kfree(c);
        return;

 err_out:
        p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n",
                 req, err, wc->status);
        rdma->state = P9_RDMA_FLUSHING;
        client->status = Disconnected;
        goto out;
}

static void
send_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct p9_client *client = cq->cq_context;
        struct p9_trans_rdma *rdma = client->trans;
        struct p9_rdma_context *c =
                container_of(wc->wr_cqe, struct p9_rdma_context, cqe);

        ib_dma_unmap_single(rdma->cm_id->device,
                            c->busa, c->req->tc.size,
                            DMA_TO_DEVICE);
        up(&rdma->sq_sem);
        p9_req_put(c->req);
        kfree(c);
}

static void qp_event_handler(struct ib_event *event, void *context)
{
        p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n",
                 event->event, context);
}

static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
{
        if (!rdma)
                return;

        if (rdma->qp && !IS_ERR(rdma->qp))
                ib_destroy_qp(rdma->qp);

        if (rdma->pd && !IS_ERR(rdma->pd))
                ib_dealloc_pd(rdma->pd);

        if (rdma->cq && !IS_ERR(rdma->cq))
                ib_free_cq(rdma->cq);

        if (rdma->cm_id && !IS_ERR(rdma->cm_id))
                rdma_destroy_id(rdma->cm_id);

        kfree(rdma);
}

static int
post_recv(struct p9_client *client, struct p9_rdma_context *c)
{
        struct p9_trans_rdma *rdma = client->trans;
        struct ib_recv_wr wr;
        struct ib_sge sge;

        c->busa = ib_dma_map_single(rdma->cm_id->device,
                                    c->rc.sdata, client->msize,
                                    DMA_FROM_DEVICE);
        if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
                goto error;

        c->cqe.done = recv_done;

        sge.addr = c->busa;
        sge.length = client->msize;
        sge.lkey = rdma->pd->local_dma_lkey;

        wr.next = NULL;
        wr.wr_cqe = &c->cqe;
        wr.sg_list = &sge;
        wr.num_sge = 1;
        return ib_post_recv(rdma->qp, &wr, NULL);

 error:
        p9_debug(P9_DEBUG_ERROR, "EIO\n");
        return -EIO;
}

static int rdma_request(struct p9_client *client, struct p9_req_t *req)
{
        struct p9_trans_rdma *rdma = client->trans;
        struct ib_send_wr wr;
        struct ib_sge sge;
        int err = 0;
        unsigned long flags;
        struct p9_rdma_context *c = NULL;
        struct p9_rdma_context *rpl_context = NULL;

        /* When an error occurs between posting the recv and the send,
         * there will be a receive context posted without a pending request.
         * Since there is no way to "un-post" it, we remember it and skip
         * post_recv() for the next request.
         * So here, see if we are that `next request' and need to absorb an
         * excess rc. If yes, then drop and free our own, and do not
         * post_recv().
         */
        if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
                if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
                        /* Got one: a non-negative result means we claimed
                         * the excess receive context for ourselves.
                         */
                        p9_fcall_fini(&req->rc);
                        req->rc.sdata = NULL;
                        goto dont_need_post_recv;
                } else {
                        /* We raced and lost: another request absorbed it
                         * first, so restore the counter.
                         */
                        atomic_inc(&rdma->excess_rc);
                }
        }

        /* Allocate an fcall for the reply */
        rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
        if (!rpl_context) {
                err = -ENOMEM;
                goto recv_error;
        }
        rpl_context->rc.sdata = req->rc.sdata;

        /*
         * Post a receive buffer for this request. We need to ensure
         * there is a reply buffer available for every outstanding
         * request. A flushed request can result in no reply for an
         * outstanding request, so we must keep a count to avoid
         * overflowing the RQ.
         */
        if (down_interruptible(&rdma->rq_sem)) {
                err = -EINTR;
                goto recv_error;
        }

        err = post_recv(client, rpl_context);
        if (err) {
                p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err);
                goto recv_error;
        }
        /* remove posted receive buffer from request structure */
        req->rc.sdata = NULL;

dont_need_post_recv:
        /* Post the request */
        c = kmalloc(sizeof *c, GFP_NOFS);
        if (!c) {
                err = -ENOMEM;
                goto send_error;
        }
        c->req = req;

        c->busa = ib_dma_map_single(rdma->cm_id->device,
                                    c->req->tc.sdata, c->req->tc.size,
                                    DMA_TO_DEVICE);
        if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
                err = -EIO;
                goto send_error;
        }

        c->cqe.done = send_done;

        sge.addr = c->busa;
        sge.length = c->req->tc.size;
        sge.lkey = rdma->pd->local_dma_lkey;

        wr.next = NULL;
        wr.wr_cqe = &c->cqe;
        wr.opcode = IB_WR_SEND;
        wr.send_flags = IB_SEND_SIGNALED;
        wr.sg_list = &sge;
        wr.num_sge = 1;

        if (down_interruptible(&rdma->sq_sem)) {
                err = -EINTR;
                goto send_error;
        }

        /* Mark request as `sent' *before* we actually send it,
         * because doing it after could erase the REQ_STATUS_RCVD
         * status in case of a very fast reply.
         */
        req->status = REQ_STATUS_SENT;
        err = ib_post_send(rdma->qp, &wr, NULL);
        if (err)
                goto send_error;

        /* Success */
        return 0;

 /* Handle errors that happened during or while preparing the send: */
 send_error:
        req->status = REQ_STATUS_ERROR;
        kfree(c);
        p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);

        /* Ach.
         *  We did post_recv(), but not the send. We have one post_recv()
         *  in excess.
         */
        atomic_inc(&rdma->excess_rc);
        return err;

 /* Handle errors that happened during or while preparing post_recv(): */
 recv_error:
        kfree(rpl_context);
        spin_lock_irqsave(&rdma->req_lock, flags);
        if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) {
                rdma->state = P9_RDMA_CLOSING;
                spin_unlock_irqrestore(&rdma->req_lock, flags);
                rdma_disconnect(rdma->cm_id);
        } else
                spin_unlock_irqrestore(&rdma->req_lock, flags);
        return err;
}

static void rdma_close(struct p9_client *client)
{
        struct p9_trans_rdma *rdma;

        if (!client)
                return;

        rdma = client->trans;
        if (!rdma)
                return;

        client->status = Disconnected;
        rdma_disconnect(rdma->cm_id);
        rdma_destroy_trans(rdma);
}

/**
 * alloc_rdma - Allocate and initialize the rdma transport structure
 * @opts: Mount options structure
 */
static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
{
        struct p9_trans_rdma *rdma;

        rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
        if (!rdma)
                return NULL;

        rdma->port = opts->port;
        rdma->privport = opts->privport;
        rdma->sq_depth = opts->sq_depth;
        rdma->rq_depth = opts->rq_depth;
        rdma->timeout = opts->timeout;
        spin_lock_init(&rdma->req_lock);
        init_completion(&rdma->cm_done);
        sema_init(&rdma->sq_sem, rdma->sq_depth);
        sema_init(&rdma->rq_sem, rdma->rq_depth);
        atomic_set(&rdma->excess_rc, 0);

        return rdma;
}
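
/* Note (added for clarity): the two semaphores initialized above act as
 * credit counters for the queue depths. rdma_request() takes rq_sem
 * before post_recv() and sq_sem before ib_post_send(), and the credits
 * are returned by up() in recv_done() and send_done(). This bounds the
 * number of outstanding WRs to the depths the QP is created with.
 */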

static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
{
        /* Nothing to do here.
         * We will take care of it (if we have to) in rdma_cancelled()
         */
        return 1;
}

/* A request has been fully flushed without a reply.
 * That means we have posted one buffer in excess.
 */
static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
{
        struct p9_trans_rdma *rdma = client->trans;

        atomic_inc(&rdma->excess_rc);
        return 0;
}

static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
{
        struct sockaddr_in cl = {
                .sin_family = AF_INET,
                .sin_addr.s_addr = htonl(INADDR_ANY),
        };
        int port, err = -EINVAL;

        for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
                cl.sin_port = htons((ushort)port);
                err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
                if (err != -EADDRINUSE)
                        break;
        }
        return err;
}

/**
 * rdma_create_trans - Transport method for creating a transport instance
 * @client: client instance
 * @addr: IP address string
 * @args: Mount options string
 */
static int
rdma_create_trans(struct p9_client *client, const char *addr, char *args)
{
        int err;
        struct p9_rdma_opts opts;
        struct p9_trans_rdma *rdma;
        struct rdma_conn_param conn_param;
        struct ib_qp_init_attr qp_attr;

        if (addr == NULL)
                return -EINVAL;

        /* Parse the transport specific mount options */
        err = parse_opts(args, &opts);
        if (err < 0)
                return err;

        /* Create and initialize the RDMA transport structure */
        rdma = alloc_rdma(&opts);
        if (!rdma)
                return -ENOMEM;

        /* Create the RDMA CM ID */
        rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
                                     RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(rdma->cm_id))
                goto error;

        /* Associate the client with the transport */
        client->trans = rdma;

        /* Bind to a privileged port if we need to */
        if (opts.privport) {
                err = p9_rdma_bind_privport(rdma);
                if (err < 0) {
                        pr_err("%s (%d): problem binding to privport: %d\n",
                               __func__, task_pid_nr(current), -err);
                        goto error;
                }
        }

        /* Resolve the server's address */
        rdma->addr.sin_family = AF_INET;
        rdma->addr.sin_addr.s_addr = in_aton(addr);
        rdma->addr.sin_port = htons(opts.port);
        err = rdma_resolve_addr(rdma->cm_id, NULL,
                                (struct sockaddr *)&rdma->addr,
                                rdma->timeout);
        if (err)
                goto error;
        err = wait_for_completion_interruptible(&rdma->cm_done);
        if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
                goto error;

        /* Resolve the route to the server */
        err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
        if (err)
                goto error;
        err = wait_for_completion_interruptible(&rdma->cm_done);
        if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
                goto error;

        /* Create the Completion Queue */
        rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
                               opts.sq_depth + opts.rq_depth + 1,
                               0, IB_POLL_SOFTIRQ);
        if (IS_ERR(rdma->cq))
                goto error;

        /* Create the Protection Domain */
        rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
        if (IS_ERR(rdma->pd))
                goto error;

        /* Create the Queue Pair */
        memset(&qp_attr, 0, sizeof qp_attr);
        qp_attr.event_handler = qp_event_handler;
        qp_attr.qp_context = client;
        qp_attr.cap.max_send_wr = opts.sq_depth;
        qp_attr.cap.max_recv_wr = opts.rq_depth;
        qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
        qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
        qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        qp_attr.qp_type = IB_QPT_RC;
        qp_attr.send_cq = rdma->cq;
        qp_attr.recv_cq = rdma->cq;
        err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
        if (err)
                goto error;
        rdma->qp = rdma->cm_id->qp;

        /* Request a connection */
        memset(&conn_param, 0, sizeof(conn_param));
        conn_param.private_data = NULL;
        conn_param.private_data_len = 0;
        conn_param.responder_resources = P9_RDMA_IRD;
        conn_param.initiator_depth = P9_RDMA_ORD;
        err = rdma_connect(rdma->cm_id, &conn_param);
        if (err)
                goto error;
        err = wait_for_completion_interruptible(&rdma->cm_done);
        if (err || (rdma->state != P9_RDMA_CONNECTED))
                goto error;

        client->status = Connected;

        return 0;

error:
        rdma_destroy_trans(rdma);
        return -ENOTCONN;
}

static struct p9_trans_module p9_rdma_trans = {
        .name = "rdma",
        .maxsize = P9_RDMA_MAXSIZE,
        .def = 0,
        .owner = THIS_MODULE,
        .create = rdma_create_trans,
        .close = rdma_close,
        .request = rdma_request,
        .cancel = rdma_cancel,
        .cancelled = rdma_cancelled,
        .show_options = p9_rdma_show_options,
};
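
/* Illustrative usage (not part of this file; the server address is an
 * example, and the option names follow the tokens table above):
 *
 *   mount -t 9p -o trans=rdma,port=5640,sq=32,rq=32 192.168.1.1 /mnt/9p
 */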

/**
 * p9_trans_rdma_init - Register the 9P RDMA transport driver
 */
static int __init p9_trans_rdma_init(void)
{
        v9fs_register_trans(&p9_rdma_trans);
        return 0;
}

static void __exit p9_trans_rdma_exit(void)
{
        v9fs_unregister_trans(&p9_rdma_trans);
}

module_init(p9_trans_rdma_init);
module_exit(p9_trans_rdma_exit);

MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
MODULE_DESCRIPTION("RDMA Transport for 9P");
MODULE_LICENSE("Dual BSD/GPL");