linux/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 *
  36 * lnet/klnds/o2iblnd/o2iblnd.c
  37 *
  38 * Author: Eric Barton <eric@bartonsoftware.com>
  39 */
  40
  41#include "o2iblnd.h"
  42#include <asm/div64.h>
  43
  44lnd_t the_o2iblnd = {
  45        .lnd_type       = O2IBLND,
  46        .lnd_startup    = kiblnd_startup,
  47        .lnd_shutdown   = kiblnd_shutdown,
  48        .lnd_ctl        = kiblnd_ctl,
  49        .lnd_query      = kiblnd_query,
  50        .lnd_send       = kiblnd_send,
  51        .lnd_recv       = kiblnd_recv,
  52};
  53
  54kib_data_t            kiblnd_data;
  55
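/* Simple rotate-and-add checksum over the raw message bytes. */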
  56__u32
  57kiblnd_cksum (void *ptr, int nob)
  58{
  59        char  *c  = ptr;
  60        __u32  sum = 0;
  61
  62        while (nob-- > 0)
  63                sum = ((sum << 1) | (sum >> 31)) + *c++;
  64
  65        /* ensure I don't return 0 (== no checksum) */
  66        return (sum == 0) ? 1 : sum;
  67}
  68
  69static char *
  70kiblnd_msgtype2str(int type)
  71{
  72        switch (type) {
  73        case IBLND_MSG_CONNREQ:
  74                return "CONNREQ";
  75
  76        case IBLND_MSG_CONNACK:
  77                return "CONNACK";
  78
  79        case IBLND_MSG_NOOP:
  80                return "NOOP";
  81
  82        case IBLND_MSG_IMMEDIATE:
  83                return "IMMEDIATE";
  84
  85        case IBLND_MSG_PUT_REQ:
  86                return "PUT_REQ";
  87
  88        case IBLND_MSG_PUT_NAK:
  89                return "PUT_NAK";
  90
  91        case IBLND_MSG_PUT_ACK:
  92                return "PUT_ACK";
  93
  94        case IBLND_MSG_PUT_DONE:
  95                return "PUT_DONE";
  96
  97        case IBLND_MSG_GET_REQ:
  98                return "GET_REQ";
  99
 100        case IBLND_MSG_GET_DONE:
 101                return "GET_DONE";
 102
 103        default:
 104                return "???";
 105        }
 106}
 107
 108static int
 109kiblnd_msgtype2size(int type)
 110{
 111        const int hdr_size = offsetof(kib_msg_t, ibm_u);
 112
 113        switch (type) {
 114        case IBLND_MSG_CONNREQ:
 115        case IBLND_MSG_CONNACK:
 116                return hdr_size + sizeof(kib_connparams_t);
 117
 118        case IBLND_MSG_NOOP:
 119                return hdr_size;
 120
 121        case IBLND_MSG_IMMEDIATE:
 122                return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);
 123
 124        case IBLND_MSG_PUT_REQ:
 125                return hdr_size + sizeof(kib_putreq_msg_t);
 126
 127        case IBLND_MSG_PUT_ACK:
 128                return hdr_size + sizeof(kib_putack_msg_t);
 129
 130        case IBLND_MSG_GET_REQ:
 131                return hdr_size + sizeof(kib_get_msg_t);
 132
 133        case IBLND_MSG_PUT_NAK:
 134        case IBLND_MSG_PUT_DONE:
 135        case IBLND_MSG_GET_DONE:
 136                return hdr_size + sizeof(kib_completion_msg_t);
 137        default:
 138                return -1;
 139        }
 140}
 141
 142static int
 143kiblnd_unpack_rd(kib_msg_t *msg, int flip)
 144{
 145        kib_rdma_desc_t   *rd;
 146        int             nob;
 147        int             n;
 148        int             i;
 149
 150        LASSERT (msg->ibm_type == IBLND_MSG_GET_REQ ||
 151                 msg->ibm_type == IBLND_MSG_PUT_ACK);
 152
 153        rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
 154                              &msg->ibm_u.get.ibgm_rd :
 155                              &msg->ibm_u.putack.ibpam_rd;
 156
 157        if (flip) {
 158                __swab32s(&rd->rd_key);
 159                __swab32s(&rd->rd_nfrags);
 160        }
 161
 162        n = rd->rd_nfrags;
 163
 164        if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
 165                CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
 166                       n, IBLND_MAX_RDMA_FRAGS);
 167                return 1;
 168        }
 169
 170        nob = offsetof (kib_msg_t, ibm_u) +
 171              kiblnd_rd_msg_size(rd, msg->ibm_type, n);
 172
 173        if (msg->ibm_nob < nob) {
 174                CERROR("Short %s: %d(%d)\n",
 175                       kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
 176                return 1;
 177        }
 178
 179        if (!flip)
 180                return 0;
 181
 182        for (i = 0; i < n; i++) {
 183                __swab32s(&rd->rd_frags[i].rf_nob);
 184                __swab64s(&rd->rd_frags[i].rf_addr);
 185        }
 186
 187        return 0;
 188}
 189
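/* Fill in the common message header and, when checksumming is enabled,
 * checksum the whole message just before it goes on the wire. */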
 190void
 191kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, int version,
 192                 int credits, lnet_nid_t dstnid, __u64 dststamp)
 193{
 194        kib_net_t *net = ni->ni_data;
 195
 196        /* CAVEAT EMPTOR! all message fields not set here should have been
 197         * initialised previously. */
 198        msg->ibm_magic    = IBLND_MSG_MAGIC;
 199        msg->ibm_version  = version;
 200        /*   ibm_type */
 201        msg->ibm_credits  = credits;
 202        /*   ibm_nob */
 203        msg->ibm_cksum    = 0;
 204        msg->ibm_srcnid   = ni->ni_nid;
 205        msg->ibm_srcstamp = net->ibn_incarnation;
 206        msg->ibm_dstnid   = dstnid;
 207        msg->ibm_dststamp = dststamp;
 208
 209        if (*kiblnd_tunables.kib_cksum) {
 210                /* NB ibm_cksum zero while computing cksum */
 211                msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
 212        }
 213}
 214
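/* Validate the magic, version, length and checksum of an incoming message,
 * then byte-swap the header and any per-type payload fields when the sender's
 * endianness differs (detected by a byte-swapped magic). */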
 215int
 216kiblnd_unpack_msg(kib_msg_t *msg, int nob)
 217{
 218        const int hdr_size = offsetof(kib_msg_t, ibm_u);
 219        __u32     msg_cksum;
 220        __u16     version;
 221        int       msg_nob;
 222        int       flip;
 223
 224        /* 6 bytes are enough to have received magic + version */
 225        if (nob < 6) {
 226                CERROR("Short message: %d\n", nob);
 227                return -EPROTO;
 228        }
 229
 230        if (msg->ibm_magic == IBLND_MSG_MAGIC) {
 231                flip = 0;
 232        } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
 233                flip = 1;
 234        } else {
 235                CERROR("Bad magic: %08x\n", msg->ibm_magic);
 236                return -EPROTO;
 237        }
 238
 239        version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
 240        if (version != IBLND_MSG_VERSION &&
 241            version != IBLND_MSG_VERSION_1) {
 242                CERROR("Bad version: %x\n", version);
 243                return -EPROTO;
 244        }
 245
 246        if (nob < hdr_size) {
 247                CERROR("Short message: %d\n", nob);
 248                return -EPROTO;
 249        }
 250
 251        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
 252        if (msg_nob > nob) {
 253                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
 254                return -EPROTO;
 255        }
 256
 257        /* checksum must be computed with ibm_cksum zero and BEFORE anything
 258         * gets flipped */
 259        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
 260        msg->ibm_cksum = 0;
 261        if (msg_cksum != 0 &&
 262            msg_cksum != kiblnd_cksum(msg, msg_nob)) {
 263                CERROR("Bad checksum\n");
 264                return -EPROTO;
 265        }
 266
 267        msg->ibm_cksum = msg_cksum;
 268
 269        if (flip) {
 270                /* leave magic unflipped as a clue to peer endianness */
 271                msg->ibm_version = version;
 272                CLASSERT (sizeof(msg->ibm_type) == 1);
 273                CLASSERT (sizeof(msg->ibm_credits) == 1);
 274                msg->ibm_nob     = msg_nob;
 275                __swab64s(&msg->ibm_srcnid);
 276                __swab64s(&msg->ibm_srcstamp);
 277                __swab64s(&msg->ibm_dstnid);
 278                __swab64s(&msg->ibm_dststamp);
 279        }
 280
 281        if (msg->ibm_srcnid == LNET_NID_ANY) {
 282                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
 283                return -EPROTO;
 284        }
 285
 286        if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
 287                CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
 288                       msg_nob, kiblnd_msgtype2size(msg->ibm_type));
 289                return -EPROTO;
 290        }
 291
 292        switch (msg->ibm_type) {
 293        default:
 294                CERROR("Unknown message type %x\n", msg->ibm_type);
 295                return -EPROTO;
 296
 297        case IBLND_MSG_NOOP:
 298        case IBLND_MSG_IMMEDIATE:
 299        case IBLND_MSG_PUT_REQ:
 300                break;
 301
 302        case IBLND_MSG_PUT_ACK:
 303        case IBLND_MSG_GET_REQ:
 304                if (kiblnd_unpack_rd(msg, flip))
 305                        return -EPROTO;
 306                break;
 307
 308        case IBLND_MSG_PUT_NAK:
 309        case IBLND_MSG_PUT_DONE:
 310        case IBLND_MSG_GET_DONE:
 311                if (flip)
 312                        __swab32s(&msg->ibm_u.completion.ibcm_status);
 313                break;
 314
 315        case IBLND_MSG_CONNREQ:
 316        case IBLND_MSG_CONNACK:
 317                if (flip) {
 318                        __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
 319                        __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
 320                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
 321                }
 322                break;
 323        }
 324        return 0;
 325}
 326
 327int
 328kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
 329{
 330        kib_peer_t      *peer;
 331        kib_net_t       *net = ni->ni_data;
 332        int             cpt = lnet_cpt_of_nid(nid);
 333        unsigned long   flags;
 334
 335        LASSERT(net != NULL);
 336        LASSERT(nid != LNET_NID_ANY);
 337
 338        LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
 339        if (peer == NULL) {
 340                CERROR("Cannot allocate peer\n");
 341                return -ENOMEM;
 342        }
 343
 344        memset(peer, 0, sizeof(*peer));  /* zero flags etc */
 345
 346        peer->ibp_ni = ni;
 347        peer->ibp_nid = nid;
 348        peer->ibp_error = 0;
 349        peer->ibp_last_alive = 0;
 350        atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */
 351
 352        INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
 353        INIT_LIST_HEAD(&peer->ibp_conns);
 354        INIT_LIST_HEAD(&peer->ibp_tx_queue);
 355
 356        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 357
 358        /* always called with a ref on ni, which prevents ni being shutdown */
 359        LASSERT (net->ibn_shutdown == 0);
 360
 361        /* npeers only grows with the global lock held */
 362        atomic_inc(&net->ibn_npeers);
 363
 364        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 365
 366        *peerp = peer;
 367        return 0;
 368}
 369
 370void
 371kiblnd_destroy_peer (kib_peer_t *peer)
 372{
 373        kib_net_t *net = peer->ibp_ni->ni_data;
 374
 375        LASSERT (net != NULL);
 376        LASSERT (atomic_read(&peer->ibp_refcount) == 0);
 377        LASSERT (!kiblnd_peer_active(peer));
 378        LASSERT (peer->ibp_connecting == 0);
 379        LASSERT (peer->ibp_accepting == 0);
 380        LASSERT (list_empty(&peer->ibp_conns));
 381        LASSERT (list_empty(&peer->ibp_tx_queue));
 382
 383        LIBCFS_FREE(peer, sizeof(*peer));
 384
 385        /* NB a peer's connections keep a reference on their peer until
 386         * they are destroyed, so we can be assured that _all_ state to do
 387         * with this peer has been cleaned up when its refcount drops to
 388         * zero. */
 389        atomic_dec(&net->ibn_npeers);
 390}
 391
 392kib_peer_t *
 393kiblnd_find_peer_locked (lnet_nid_t nid)
 394{
 395        /* the caller is responsible for accounting the additional reference
 396         * that this creates */
 397        struct list_head       *peer_list = kiblnd_nid2peerlist(nid);
 398        struct list_head       *tmp;
 399        kib_peer_t       *peer;
 400
 401        list_for_each (tmp, peer_list) {
 402
 403                peer = list_entry(tmp, kib_peer_t, ibp_list);
 404
 405                LASSERT (peer->ibp_connecting > 0 || /* creating conns */
 406                         peer->ibp_accepting > 0 ||
 407                         !list_empty(&peer->ibp_conns));  /* active conn */
 408
 409                if (peer->ibp_nid != nid)
 410                        continue;
 411
 412                CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
 413                       peer, libcfs_nid2str(nid),
 414                       atomic_read(&peer->ibp_refcount),
 415                       peer->ibp_version);
 416                return peer;
 417        }
 418        return NULL;
 419}
 420
 421void
 422kiblnd_unlink_peer_locked (kib_peer_t *peer)
 423{
 424        LASSERT (list_empty(&peer->ibp_conns));
 425
 426        LASSERT (kiblnd_peer_active(peer));
 427        list_del_init(&peer->ibp_list);
 428        /* lose peerlist's ref */
 429        kiblnd_peer_decref(peer);
 430}
 431
 432int
 433kiblnd_get_peer_info (lnet_ni_t *ni, int index,
 434                      lnet_nid_t *nidp, int *count)
 435{
 436        kib_peer_t          *peer;
 437        struct list_head            *ptmp;
 438        int                 i;
 439        unsigned long     flags;
 440
 441        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 442
 443        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
 444
 445                list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
 446
 447                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
 448                        LASSERT (peer->ibp_connecting > 0 ||
 449                                 peer->ibp_accepting > 0 ||
 450                                 !list_empty(&peer->ibp_conns));
 451
 452                        if (peer->ibp_ni != ni)
 453                                continue;
 454
 455                        if (index-- > 0)
 456                                continue;
 457
 458                        *nidp = peer->ibp_nid;
 459                        *count = atomic_read(&peer->ibp_refcount);
 460
 461                        read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
 462                                               flags);
 463                        return 0;
 464                }
 465        }
 466
 467        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 468        return -ENOENT;
 469}
 470
 471void
 472kiblnd_del_peer_locked (kib_peer_t *peer)
 473{
 474        struct list_head           *ctmp;
 475        struct list_head           *cnxt;
 476        kib_conn_t         *conn;
 477
 478        if (list_empty(&peer->ibp_conns)) {
 479                kiblnd_unlink_peer_locked(peer);
 480        } else {
 481                list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
 482                        conn = list_entry(ctmp, kib_conn_t, ibc_list);
 483
 484                        kiblnd_close_conn_locked(conn, 0);
 485                }
 486                /* NB closing peer's last conn unlinked it. */
 487        }
 488        /* NB peer now unlinked; might even be freed if the peer table had the
 489         * last ref on it. */
 490}
 491
 492int
 493kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
 494{
 495        LIST_HEAD        (zombies);
 496        struct list_head            *ptmp;
 497        struct list_head            *pnxt;
 498        kib_peer_t          *peer;
 499        int                 lo;
 500        int                 hi;
 501        int                 i;
 502        unsigned long     flags;
 503        int                 rc = -ENOENT;
 504
 505        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 506
 507        if (nid != LNET_NID_ANY) {
 508                lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
 509        } else {
 510                lo = 0;
 511                hi = kiblnd_data.kib_peer_hash_size - 1;
 512        }
 513
 514        for (i = lo; i <= hi; i++) {
 515                list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
 516                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
 517                        LASSERT (peer->ibp_connecting > 0 ||
 518                                 peer->ibp_accepting > 0 ||
 519                                 !list_empty(&peer->ibp_conns));
 520
 521                        if (peer->ibp_ni != ni)
 522                                continue;
 523
 524                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
 525                                continue;
 526
 527                        if (!list_empty(&peer->ibp_tx_queue)) {
 528                                LASSERT (list_empty(&peer->ibp_conns));
 529
 530                                list_splice_init(&peer->ibp_tx_queue,
 531                                                     &zombies);
 532                        }
 533
 534                        kiblnd_del_peer_locked(peer);
 535                        rc = 0;  /* matched something */
 536                }
 537        }
 538
 539        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 540
 541        kiblnd_txlist_done(ni, &zombies, -EIO);
 542
 543        return rc;
 544}
 545
 546kib_conn_t *
 547kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
 548{
 549        kib_peer_t          *peer;
 550        struct list_head            *ptmp;
 551        kib_conn_t          *conn;
 552        struct list_head            *ctmp;
 553        int                 i;
 554        unsigned long     flags;
 555
 556        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 557
 558        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
 559                list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
 560
 561                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
 562                        LASSERT (peer->ibp_connecting > 0 ||
 563                                 peer->ibp_accepting > 0 ||
 564                                 !list_empty(&peer->ibp_conns));
 565
 566                        if (peer->ibp_ni != ni)
 567                                continue;
 568
 569                        list_for_each (ctmp, &peer->ibp_conns) {
 570                                if (index-- > 0)
 571                                        continue;
 572
 573                                conn = list_entry(ctmp, kib_conn_t,
 574                                                      ibc_list);
 575                                kiblnd_conn_addref(conn);
 576                                read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
 577                                                       flags);
 578                                return conn;
 579                        }
 580                }
 581        }
 582
 583        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 584        return NULL;
 585}
 586
 587void
 588kiblnd_debug_rx (kib_rx_t *rx)
 589{
 590        CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
 591               rx, rx->rx_status, rx->rx_msg->ibm_type,
 592               rx->rx_msg->ibm_credits);
 593}
 594
 595void
 596kiblnd_debug_tx (kib_tx_t *tx)
 597{
 598        CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
 599               "cookie "LPX64" msg %s%s type %x cred %d\n",
 600               tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
 601               tx->tx_status, tx->tx_deadline, tx->tx_cookie,
 602               tx->tx_lntmsg[0] == NULL ? "-" : "!",
 603               tx->tx_lntmsg[1] == NULL ? "-" : "!",
 604               tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
 605}
 606
 607void
 608kiblnd_debug_conn (kib_conn_t *conn)
 609{
 610        struct list_head        *tmp;
 611        int             i;
 612
 613        spin_lock(&conn->ibc_lock);
 614
 615        CDEBUG(D_CONSOLE, "conn[%d] %p [version %x] -> %s: \n",
 616               atomic_read(&conn->ibc_refcount), conn,
 617               conn->ibc_version, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 618        CDEBUG(D_CONSOLE, "   state %d nposted %d/%d cred %d o_cred %d r_cred %d\n",
 619               conn->ibc_state, conn->ibc_noops_posted,
 620               conn->ibc_nsends_posted, conn->ibc_credits,
 621               conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
 622        CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);
 623
 624        CDEBUG(D_CONSOLE, "   early_rxs:\n");
 625        list_for_each(tmp, &conn->ibc_early_rxs)
 626                kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
 627
 628        CDEBUG(D_CONSOLE, "   tx_noops:\n");
 629        list_for_each(tmp, &conn->ibc_tx_noops)
 630                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
 631
 632        CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
 633        list_for_each(tmp, &conn->ibc_tx_queue_nocred)
 634                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
 635
 636        CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
 637        list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
 638                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
 639
 640        CDEBUG(D_CONSOLE, "   tx_queue:\n");
 641        list_for_each(tmp, &conn->ibc_tx_queue)
 642                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
 643
 644        CDEBUG(D_CONSOLE, "   active_txs:\n");
 645        list_for_each(tmp, &conn->ibc_active_txs)
 646                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
 647
 648        CDEBUG(D_CONSOLE, "   rxs:\n");
 649        for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++)
 650                kiblnd_debug_rx(&conn->ibc_rxs[i]);
 651
 652        spin_unlock(&conn->ibc_lock);
 653}
 654
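/* Map an MTU in bytes onto the IB_MTU_* enum: 0 means "use the default",
 * -1 flags an unsupported value. */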
 655int
 656kiblnd_translate_mtu(int value)
 657{
 658        switch (value) {
 659        default:
 660                return -1;
 661        case 0:
 662                return 0;
 663        case 256:
 664                return IB_MTU_256;
 665        case 512:
 666                return IB_MTU_512;
 667        case 1024:
 668                return IB_MTU_1024;
 669        case 2048:
 670                return IB_MTU_2048;
 671        case 4096:
 672                return IB_MTU_4096;
 673        }
 674}
 675
 676static void
 677kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
 678{
 679        int        mtu;
 680
 681        /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
 682        if (cmid->route.path_rec == NULL)
 683                return;
 684
 685        mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
 686        LASSERT (mtu >= 0);
 687        if (mtu != 0)
 688                cmid->route.path_rec->mtu = mtu;
 689}
 690
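/* Choose a completion vector for this connection's CQ by hashing the peer
 * NID onto a CPU within the connection's CPT. */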
 691static int
 692kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
 693{
 694        cpumask_t       *mask;
 695        int             vectors;
 696        int             off;
 697        int             i;
 698        lnet_nid_t      nid = conn->ibc_peer->ibp_nid;
 699
 700        vectors = conn->ibc_cmid->device->num_comp_vectors;
 701        if (vectors <= 1)
 702                return 0;
 703
 704        mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
 705        if (mask == NULL)
 706                return 0;
 707
 708        /* hash NID to CPU id in this partition... */
 709        off = do_div(nid, cpus_weight(*mask));
 710        for_each_cpu_mask(i, *mask) {
 711                if (off-- == 0)
 712                        return i % vectors;
 713        }
 714
 715        LBUG();
 716        return 1;
 717}
 718
 719kib_conn_t *
 720kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
 721                   int state, int version)
 722{
 723        /* CAVEAT EMPTOR:
 724         * If the new conn is created successfully it takes over the caller's
 725         * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
 726         * is destroyed.  On failure, the caller's ref on 'peer' remains and
 727         * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
 728         * to destroy 'cmid' here since I'm called from the CM which still has
 729         * its ref on 'cmid'). */
 730        rwlock_t                *glock = &kiblnd_data.kib_global_lock;
 731        kib_net_t             *net = peer->ibp_ni->ni_data;
 732        kib_dev_t             *dev;
 733        struct ib_qp_init_attr *init_qp_attr;
 734        struct kib_sched_info   *sched;
 735        kib_conn_t              *conn;
 736        struct ib_cq            *cq;
 737        unsigned long           flags;
 738        int                     cpt;
 739        int                     rc;
 740        int                     i;
 741
 742        LASSERT(net != NULL);
 743        LASSERT(!in_interrupt());
 744
 745        dev = net->ibn_dev;
 746
 747        cpt = lnet_cpt_of_nid(peer->ibp_nid);
 748        sched = kiblnd_data.kib_scheds[cpt];
 749
 750        LASSERT(sched->ibs_nthreads > 0);
 751
 752        LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
 753                         sizeof(*init_qp_attr));
 754        if (init_qp_attr == NULL) {
 755                CERROR("Can't allocate qp_attr for %s\n",
 756                       libcfs_nid2str(peer->ibp_nid));
 757                goto failed_0;
 758        }
 759
 760        LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
 761        if (conn == NULL) {
 762                CERROR("Can't allocate connection for %s\n",
 763                       libcfs_nid2str(peer->ibp_nid));
 764                goto failed_1;
 765        }
 766
 767        conn->ibc_state = IBLND_CONN_INIT;
 768        conn->ibc_version = version;
 769        conn->ibc_peer = peer;            /* I take the caller's ref */
 770        cmid->context = conn;              /* for future CM callbacks */
 771        conn->ibc_cmid = cmid;
 772
 773        INIT_LIST_HEAD(&conn->ibc_early_rxs);
 774        INIT_LIST_HEAD(&conn->ibc_tx_noops);
 775        INIT_LIST_HEAD(&conn->ibc_tx_queue);
 776        INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
 777        INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
 778        INIT_LIST_HEAD(&conn->ibc_active_txs);
 779        spin_lock_init(&conn->ibc_lock);
 780
 781        LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
 782                         sizeof(*conn->ibc_connvars));
 783        if (conn->ibc_connvars == NULL) {
 784                CERROR("Can't allocate in-progress connection state\n");
 785                goto failed_2;
 786        }
 787
 788        write_lock_irqsave(glock, flags);
 789        if (dev->ibd_failover) {
 790                write_unlock_irqrestore(glock, flags);
 791                CERROR("%s: failover in progress\n", dev->ibd_ifname);
 792                goto failed_2;
 793        }
 794
 795        if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
 796                /* wakeup failover thread and teardown connection */
 797                if (kiblnd_dev_can_failover(dev)) {
 798                        list_add_tail(&dev->ibd_fail_list,
 799                                      &kiblnd_data.kib_failed_devs);
 800                        wake_up(&kiblnd_data.kib_failover_waitq);
 801                }
 802
 803                write_unlock_irqrestore(glock, flags);
 804                CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
 805                       cmid->device->name, dev->ibd_ifname);
 806                goto failed_2;
 807        }
 808
 809        kiblnd_hdev_addref_locked(dev->ibd_hdev);
 810        conn->ibc_hdev = dev->ibd_hdev;
 811
 812        kiblnd_setup_mtu_locked(cmid);
 813
 814        write_unlock_irqrestore(glock, flags);
 815
 816        LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
 817                         IBLND_RX_MSGS(version) * sizeof(kib_rx_t));
 818        if (conn->ibc_rxs == NULL) {
 819                CERROR("Cannot allocate RX buffers\n");
 820                goto failed_2;
 821        }
 822
 823        rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
 824                                IBLND_RX_MSG_PAGES(version));
 825        if (rc != 0)
 826                goto failed_2;
 827
 828        kiblnd_map_rx_descs(conn);
 829
 830        cq = ib_create_cq(cmid->device,
 831                          kiblnd_cq_completion, kiblnd_cq_event, conn,
 832                          IBLND_CQ_ENTRIES(version),
 833                          kiblnd_get_completion_vector(conn, cpt));
 834        if (IS_ERR(cq)) {
 835                CERROR("Can't create CQ: %ld, cqe: %d\n",
 836                       PTR_ERR(cq), IBLND_CQ_ENTRIES(version));
 837                goto failed_2;
 838        }
 839
 840        conn->ibc_cq = cq;
 841
 842        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 843        if (rc != 0) {
 844                CERROR("Can't request completion notification: %d\n", rc);
 845                goto failed_2;
 846        }
 847
 848        init_qp_attr->event_handler = kiblnd_qp_event;
 849        init_qp_attr->qp_context = conn;
 850        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(version);
 851        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(version);
 852        init_qp_attr->cap.max_send_sge = 1;
 853        init_qp_attr->cap.max_recv_sge = 1;
 854        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
 855        init_qp_attr->qp_type = IB_QPT_RC;
 856        init_qp_attr->send_cq = cq;
 857        init_qp_attr->recv_cq = cq;
 858
 859        conn->ibc_sched = sched;
 860
 861        rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
 862        if (rc != 0) {
 863                CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
 864                       rc, init_qp_attr->cap.max_send_wr,
 865                       init_qp_attr->cap.max_recv_wr);
 866                goto failed_2;
 867        }
 868
 869        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
 870
 871        /* 1 ref for caller and each rxmsg */
 872        atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(version));
 873        conn->ibc_nrx = IBLND_RX_MSGS(version);
 874
 875        /* post receives */
 876        for (i = 0; i < IBLND_RX_MSGS(version); i++) {
 877                rc = kiblnd_post_rx(&conn->ibc_rxs[i],
 878                                    IBLND_POSTRX_NO_CREDIT);
 879                if (rc != 0) {
 880                        CERROR("Can't post rxmsg: %d\n", rc);
 881
 882                        /* Make posted receives complete */
 883                        kiblnd_abort_receives(conn);
 884
 885                        /* correct # of posted buffers
 886                         * NB locking needed now I'm racing with completion */
 887                        spin_lock_irqsave(&sched->ibs_lock, flags);
 888                        conn->ibc_nrx -= IBLND_RX_MSGS(version) - i;
 889                        spin_unlock_irqrestore(&sched->ibs_lock, flags);
 890
 891                        /* cmid will be destroyed by CM(ofed) after cm_callback
 892                         * returned, so we can't refer to it anymore
 893                         * (by kiblnd_connd()->kiblnd_destroy_conn) */
 894                        rdma_destroy_qp(conn->ibc_cmid);
 895                        conn->ibc_cmid = NULL;
 896
 897                        /* Drop my own and unused rxbuffer refcounts */
 898                        while (i++ <= IBLND_RX_MSGS(version))
 899                                kiblnd_conn_decref(conn);
 900
 901                        return NULL;
 902                }
 903        }
 904
 905        /* Init successful! */
 906        LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
 907                 state == IBLND_CONN_PASSIVE_WAIT);
 908        conn->ibc_state = state;
 909
 910        /* 1 more conn */
 911        atomic_inc(&net->ibn_nconns);
 912        return conn;
 913
 914 failed_2:
 915        kiblnd_destroy_conn(conn);
 916 failed_1:
 917        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
 918 failed_0:
 919        return NULL;
 920}
 921
 922void
 923kiblnd_destroy_conn (kib_conn_t *conn)
 924{
 925        struct rdma_cm_id *cmid = conn->ibc_cmid;
 926        kib_peer_t      *peer = conn->ibc_peer;
 927        int             rc;
 928
 929        LASSERT (!in_interrupt());
 930        LASSERT (atomic_read(&conn->ibc_refcount) == 0);
 931        LASSERT (list_empty(&conn->ibc_early_rxs));
 932        LASSERT (list_empty(&conn->ibc_tx_noops));
 933        LASSERT (list_empty(&conn->ibc_tx_queue));
 934        LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
 935        LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
 936        LASSERT (list_empty(&conn->ibc_active_txs));
 937        LASSERT (conn->ibc_noops_posted == 0);
 938        LASSERT (conn->ibc_nsends_posted == 0);
 939
 940        switch (conn->ibc_state) {
 941        default:
 942                /* conn must be completely disengaged from the network */
 943                LBUG();
 944
 945        case IBLND_CONN_DISCONNECTED:
 946                /* connvars should have been freed already */
 947                LASSERT (conn->ibc_connvars == NULL);
 948                break;
 949
 950        case IBLND_CONN_INIT:
 951                break;
 952        }
 953
 954        /* conn->ibc_cmid might be destroyed by CM already */
 955        if (cmid != NULL && cmid->qp != NULL)
 956                rdma_destroy_qp(cmid);
 957
 958        if (conn->ibc_cq != NULL) {
 959                rc = ib_destroy_cq(conn->ibc_cq);
 960                if (rc != 0)
 961                        CWARN("Error destroying CQ: %d\n", rc);
 962        }
 963
 964        if (conn->ibc_rx_pages != NULL)
 965                kiblnd_unmap_rx_descs(conn);
 966
 967        if (conn->ibc_rxs != NULL) {
 968                LIBCFS_FREE(conn->ibc_rxs,
 969                            IBLND_RX_MSGS(conn->ibc_version) * sizeof(kib_rx_t));
 970        }
 971
 972        if (conn->ibc_connvars != NULL)
 973                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
 974
 975        if (conn->ibc_hdev != NULL)
 976                kiblnd_hdev_decref(conn->ibc_hdev);
 977
 978        /* See CAVEAT EMPTOR above in kiblnd_create_conn */
 979        if (conn->ibc_state != IBLND_CONN_INIT) {
 980                kib_net_t *net = peer->ibp_ni->ni_data;
 981
 982                kiblnd_peer_decref(peer);
 983                rdma_destroy_id(cmid);
 984                atomic_dec(&net->ibn_nconns);
 985        }
 986
 987        LIBCFS_FREE(conn, sizeof(*conn));
 988}
 989
 990int
 991kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
 992{
 993        kib_conn_t           *conn;
 994        struct list_head             *ctmp;
 995        struct list_head             *cnxt;
 996        int                  count = 0;
 997
 998        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
 999                conn = list_entry(ctmp, kib_conn_t, ibc_list);
1000
1001                CDEBUG(D_NET, "Closing conn -> %s, "
1002                              "version: %x, reason: %d\n",
1003                       libcfs_nid2str(peer->ibp_nid),
1004                       conn->ibc_version, why);
1005
1006                kiblnd_close_conn_locked(conn, why);
1007                count++;
1008        }
1009
1010        return count;
1011}
1012
1013int
1014kiblnd_close_stale_conns_locked (kib_peer_t *peer,
1015                                 int version, __u64 incarnation)
1016{
1017        kib_conn_t           *conn;
1018        struct list_head             *ctmp;
1019        struct list_head             *cnxt;
1020        int                  count = 0;
1021
1022        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
1023                conn = list_entry(ctmp, kib_conn_t, ibc_list);
1024
1025                if (conn->ibc_version     == version &&
1026                    conn->ibc_incarnation == incarnation)
1027                        continue;
1028
1029                CDEBUG(D_NET, "Closing stale conn -> %s version: %x, "
1030                              "incarnation:"LPX64"(%x, "LPX64")\n",
1031                       libcfs_nid2str(peer->ibp_nid),
1032                       conn->ibc_version, conn->ibc_incarnation,
1033                       version, incarnation);
1034
1035                kiblnd_close_conn_locked(conn, -ESTALE);
1036                count++;
1037        }
1038
1039        return count;
1040}
1041
1042int
1043kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
1044{
1045        kib_peer_t           *peer;
1046        struct list_head             *ptmp;
1047        struct list_head             *pnxt;
1048        int                  lo;
1049        int                  hi;
1050        int                  i;
1051        unsigned long      flags;
1052        int                  count = 0;
1053
1054        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1055
1056        if (nid != LNET_NID_ANY)
1057                lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
1058        else {
1059                lo = 0;
1060                hi = kiblnd_data.kib_peer_hash_size - 1;
1061        }
1062
1063        for (i = lo; i <= hi; i++) {
1064                list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
1065
1066                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
1067                        LASSERT (peer->ibp_connecting > 0 ||
1068                                 peer->ibp_accepting > 0 ||
1069                                 !list_empty(&peer->ibp_conns));
1070
1071                        if (peer->ibp_ni != ni)
1072                                continue;
1073
1074                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
1075                                continue;
1076
1077                        count += kiblnd_close_peer_conns_locked(peer, 0);
1078                }
1079        }
1080
1081        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1082
1083        /* wildcards always succeed */
1084        if (nid == LNET_NID_ANY)
1085                return 0;
1086
1087        return (count == 0) ? -ENOENT : 0;
1088}
1089
1090int
1091kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
1092{
1093        struct libcfs_ioctl_data *data = arg;
1094        int                    rc = -EINVAL;
1095
1096        switch(cmd) {
1097        case IOC_LIBCFS_GET_PEER: {
1098                lnet_nid_t   nid = 0;
1099                int       count = 0;
1100
1101                rc = kiblnd_get_peer_info(ni, data->ioc_count,
1102                                          &nid, &count);
1103                data->ioc_nid    = nid;
1104                data->ioc_count  = count;
1105                break;
1106        }
1107
1108        case IOC_LIBCFS_DEL_PEER: {
1109                rc = kiblnd_del_peer(ni, data->ioc_nid);
1110                break;
1111        }
1112        case IOC_LIBCFS_GET_CONN: {
1113                kib_conn_t *conn;
1114
1115                rc = 0;
1116                conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
1117                if (conn == NULL) {
1118                        rc = -ENOENT;
1119                        break;
1120                }
1121
1122                LASSERT (conn->ibc_cmid != NULL);
1123                data->ioc_nid = conn->ibc_peer->ibp_nid;
1124                if (conn->ibc_cmid->route.path_rec == NULL)
1125                        data->ioc_u32[0] = 0; /* iWarp has no path MTU */
1126                else
1127                        data->ioc_u32[0] =
1128                        ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
1129                kiblnd_conn_decref(conn);
1130                break;
1131        }
1132        case IOC_LIBCFS_CLOSE_CONNECTION: {
1133                rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
1134                break;
1135        }
1136
1137        default:
1138                break;
1139        }
1140
1141        return rc;
1142}
1143
1144void
1145kiblnd_query (lnet_ni_t *ni, lnet_nid_t nid, cfs_time_t *when)
1146{
1147        cfs_time_t      last_alive = 0;
1148        cfs_time_t      now = cfs_time_current();
1149        rwlock_t        *glock = &kiblnd_data.kib_global_lock;
1150        kib_peer_t      *peer;
1151        unsigned long   flags;
1152
1153        read_lock_irqsave(glock, flags);
1154
1155        peer = kiblnd_find_peer_locked(nid);
1156        if (peer != NULL) {
1157                LASSERT (peer->ibp_connecting > 0 || /* creating conns */
1158                         peer->ibp_accepting > 0 ||
1159                         !list_empty(&peer->ibp_conns));  /* active conn */
1160                last_alive = peer->ibp_last_alive;
1161        }
1162
1163        read_unlock_irqrestore(glock, flags);
1164
1165        if (last_alive != 0)
1166                *when = last_alive;
1167
1168        /* peer is not persistent in hash, trigger peer creation
1169         * and connection establishment with a NULL tx */
1170        if (peer == NULL)
1171                kiblnd_launch_tx(ni, NULL, nid);
1172
1173        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
1174               libcfs_nid2str(nid), peer,
1175               last_alive ? cfs_duration_sec(now - last_alive) : -1);
1176        return;
1177}
1178
1179void
1180kiblnd_free_pages(kib_pages_t *p)
1181{
1182        int     npages = p->ibp_npages;
1183        int     i;
1184
1185        for (i = 0; i < npages; i++) {
1186                if (p->ibp_pages[i] != NULL)
1187                        __free_page(p->ibp_pages[i]);
1188        }
1189
1190        LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
1191}
1192
1193int
1194kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
1195{
1196        kib_pages_t     *p;
1197        int             i;
1198
1199        LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
1200                         offsetof(kib_pages_t, ibp_pages[npages]));
1201        if (p == NULL) {
1202                CERROR("Can't allocate descriptor for %d pages\n", npages);
1203                return -ENOMEM;
1204        }
1205
1206        memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
1207        p->ibp_npages = npages;
1208
1209        for (i = 0; i < npages; i++) {
1210                p->ibp_pages[i] = alloc_pages_node(
1211                                    cfs_cpt_spread_node(lnet_cpt_table(), cpt),
1212                                    __GFP_IO, 0);
1213                if (p->ibp_pages[i] == NULL) {
1214                        CERROR("Can't allocate page %d of %d\n", i, npages);
1215                        kiblnd_free_pages(p);
1216                        return -ENOMEM;
1217                }
1218        }
1219
1220        *pp = p;
1221        return 0;
1222}
1223
1224void
1225kiblnd_unmap_rx_descs(kib_conn_t *conn)
1226{
1227        kib_rx_t *rx;
1228        int       i;
1229
1230        LASSERT (conn->ibc_rxs != NULL);
1231        LASSERT (conn->ibc_hdev != NULL);
1232
1233        for (i = 0; i < IBLND_RX_MSGS(conn->ibc_version); i++) {
1234                rx = &conn->ibc_rxs[i];
1235
1236                LASSERT (rx->rx_nob >= 0); /* not posted */
1237
1238                kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
1239                                        KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
1240                                                          rx->rx_msgaddr),
1241                                        IBLND_MSG_SIZE, DMA_FROM_DEVICE);
1242        }
1243
1244        kiblnd_free_pages(conn->ibc_rx_pages);
1245
1246        conn->ibc_rx_pages = NULL;
1247}
1248
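/* Carve IBLND_MSG_SIZE receive buffers out of the pre-allocated RX pages and
 * DMA-map each one for the HCA. */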
1249void
1250kiblnd_map_rx_descs(kib_conn_t *conn)
1251{
1252        kib_rx_t       *rx;
1253        struct page    *pg;
1254        int          pg_off;
1255        int          ipg;
1256        int          i;
1257
1258        for (pg_off = ipg = i = 0;
1259             i < IBLND_RX_MSGS(conn->ibc_version); i++) {
1260                pg = conn->ibc_rx_pages->ibp_pages[ipg];
1261                rx = &conn->ibc_rxs[i];
1262
1263                rx->rx_conn = conn;
1264                rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);
1265
1266                rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
1267                                                       rx->rx_msg, IBLND_MSG_SIZE,
1268                                                       DMA_FROM_DEVICE);
1269                LASSERT (!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
1270                                                   rx->rx_msgaddr));
1271                KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);
1272
1273                CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n",
1274                       i, rx->rx_msg, rx->rx_msgaddr,
1275                       lnet_page2phys(pg) + pg_off);
1276
1277                pg_off += IBLND_MSG_SIZE;
1278                LASSERT (pg_off <= PAGE_SIZE);
1279
1280                if (pg_off == PAGE_SIZE) {
1281                        pg_off = 0;
1282                        ipg++;
1283                        LASSERT (ipg <= IBLND_RX_MSG_PAGES(conn->ibc_version));
1284                }
1285        }
1286}
1287
1288static void
1289kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
1290{
1291        kib_hca_dev_t  *hdev = tpo->tpo_hdev;
1292        kib_tx_t       *tx;
1293        int          i;
1294
1295        LASSERT (tpo->tpo_pool.po_allocated == 0);
1296
1297        if (hdev == NULL)
1298                return;
1299
1300        for (i = 0; i < tpo->tpo_pool.po_size; i++) {
1301                tx = &tpo->tpo_tx_descs[i];
1302                kiblnd_dma_unmap_single(hdev->ibh_ibdev,
1303                                        KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
1304                                                          tx->tx_msgaddr),
1305                                        IBLND_MSG_SIZE, DMA_TO_DEVICE);
1306        }
1307
1308        kiblnd_hdev_decref(hdev);
1309        tpo->tpo_hdev = NULL;
1310}
1311
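/* Wait for any failover in progress to finish (dropping the global lock
 * while sleeping), then return the device's current HCA with a reference
 * held. */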
1312static kib_hca_dev_t *
1313kiblnd_current_hdev(kib_dev_t *dev)
1314{
1315        kib_hca_dev_t *hdev;
1316        unsigned long  flags;
1317        int         i = 0;
1318
1319        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1320        while (dev->ibd_failover) {
1321                read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1322                if (i++ % 50 == 0)
1323                        CDEBUG(D_NET, "%s: Wait for failover\n",
1324                               dev->ibd_ifname);
1325                schedule_timeout(cfs_time_seconds(1) / 100);
1326
1327                read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
1328        }
1329
1330        kiblnd_hdev_addref_locked(dev->ibd_hdev);
1331        hdev = dev->ibd_hdev;
1332
1333        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
1334
1335        return hdev;
1336}
1337
1338static void
1339kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
1340{
1341        kib_pages_t    *txpgs = tpo->tpo_tx_pages;
1342        kib_pool_t     *pool  = &tpo->tpo_pool;
1343        kib_net_t      *net   = pool->po_owner->ps_net;
1344        kib_dev_t      *dev;
1345        struct page    *page;
1346        kib_tx_t       *tx;
1347        int          page_offset;
1348        int          ipage;
1349        int          i;
1350
1351        LASSERT (net != NULL);
1352
1353        dev = net->ibn_dev;
1354
1355        /* pre-mapped messages are not bigger than 1 page */
1356        CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
1357
1358        /* No fancy arithmetic when we do the buffer calculations */
1359        CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
1360
1361        tpo->tpo_hdev = kiblnd_current_hdev(dev);
1362
1363        for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
1364                page = txpgs->ibp_pages[ipage];
1365                tx = &tpo->tpo_tx_descs[i];
1366
1367                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
1368                                           page_offset);
1369
1370                tx->tx_msgaddr = kiblnd_dma_map_single(
1371                        tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
1372                        IBLND_MSG_SIZE, DMA_TO_DEVICE);
1373                LASSERT (!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
1374                                                   tx->tx_msgaddr));
1375                KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
1376
1377                list_add(&tx->tx_list, &pool->po_free_list);
1378
1379                page_offset += IBLND_MSG_SIZE;
1380                LASSERT (page_offset <= PAGE_SIZE);
1381
1382                if (page_offset == PAGE_SIZE) {
1383                        page_offset = 0;
1384                        ipage++;
1385                        LASSERT (ipage <= txpgs->ibp_npages);
1386                }
1387        }
1388}
1389
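/* Return the pre-registered MR covering [addr, addr + size), or NULL if the
 * range is not covered by a single MR. */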
1390struct ib_mr *
1391kiblnd_find_dma_mr(kib_hca_dev_t *hdev, __u64 addr, __u64 size)
1392{
1393        __u64   index;
1394
1395        LASSERT (hdev->ibh_mrs[0] != NULL);
1396
1397        if (hdev->ibh_nmrs == 1)
1398                return hdev->ibh_mrs[0];
1399
1400        index = addr >> hdev->ibh_mr_shift;
1401
1402        if (index <  hdev->ibh_nmrs &&
1403            index == ((addr + size - 1) >> hdev->ibh_mr_shift))
1404                return hdev->ibh_mrs[index];
1405
1406        return NULL;
1407}
1408
1409struct ib_mr *
1410kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd)
1411{
1412        struct ib_mr *prev_mr;
1413        struct ib_mr *mr;
1414        int        i;
1415
1416        LASSERT (hdev->ibh_mrs[0] != NULL);
1417
1418        if (*kiblnd_tunables.kib_map_on_demand > 0 &&
1419            *kiblnd_tunables.kib_map_on_demand <= rd->rd_nfrags)
1420                return NULL;
1421
1422        if (hdev->ibh_nmrs == 1)
1423                return hdev->ibh_mrs[0];
1424
1425        for (i = 0, mr = prev_mr = NULL;
1426             i < rd->rd_nfrags; i++) {
1427                mr = kiblnd_find_dma_mr(hdev,
1428                                        rd->rd_frags[i].rf_addr,
1429                                        rd->rd_frags[i].rf_nob);
1430                if (prev_mr == NULL)
1431                        prev_mr = mr;
1432
1433                if (mr == NULL || prev_mr != mr) {
 1434                        /* Can't be covered by a single MR */
1435                        mr = NULL;
1436                        break;
1437                }
1438        }
1439
1440        return mr;
1441}
1442
1443void
1444kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
1445{
1446        LASSERT (pool->fpo_map_count == 0);
1447
1448        if (pool->fpo_fmr_pool != NULL)
1449                ib_destroy_fmr_pool(pool->fpo_fmr_pool);
1450
1451        if (pool->fpo_hdev != NULL)
1452                kiblnd_hdev_decref(pool->fpo_hdev);
1453
1454        LIBCFS_FREE(pool, sizeof(kib_fmr_pool_t));
1455}
1456
1457void
1458kiblnd_destroy_fmr_pool_list(struct list_head *head)
1459{
1460        kib_fmr_pool_t *pool;
1461
1462        while (!list_empty(head)) {
1463                pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
1464                list_del(&pool->fpo_list);
1465                kiblnd_destroy_fmr_pool(pool);
1466        }
1467}
1468
1469static int kiblnd_fmr_pool_size(int ncpts)
1470{
1471        int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
1472
1473        return max(IBLND_FMR_POOL, size);
1474}
1475
1476static int kiblnd_fmr_flush_trigger(int ncpts)
1477{
1478        int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
1479
1480        return max(IBLND_FMR_POOL_FLUSH, size);
1481}
1482
1483int
1484kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t **pp_fpo)
1485{
1486        /* FMR pool for RDMA */
1487        kib_dev_t              *dev = fps->fps_net->ibn_dev;
1488        kib_fmr_pool_t    *fpo;
1489        struct ib_fmr_pool_param param = {
1490                .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
1491                .page_shift     = PAGE_SHIFT,
1492                .access     = (IB_ACCESS_LOCAL_WRITE |
1493                                      IB_ACCESS_REMOTE_WRITE),
1494                .pool_size         = fps->fps_pool_size,
1495                .dirty_watermark   = fps->fps_flush_trigger,
1496                .flush_function    = NULL,
1497                .flush_arg       = NULL,
1498                .cache       = !!*kiblnd_tunables.kib_fmr_cache};
1499        int rc;
1500
1501        LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
1502        if (fpo == NULL)
1503                return -ENOMEM;
1504
1505        fpo->fpo_hdev = kiblnd_current_hdev(dev);
1506
1507        fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
1508        if (IS_ERR(fpo->fpo_fmr_pool)) {
1509                rc = PTR_ERR(fpo->fpo_fmr_pool);
1510                CERROR("Failed to create FMR pool: %d\n", rc);
1511
1512                kiblnd_hdev_decref(fpo->fpo_hdev);
1513                LIBCFS_FREE(fpo, sizeof(kib_fmr_pool_t));
1514                return rc;
1515        }
1516
1517        fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1518        fpo->fpo_owner    = fps;
1519        *pp_fpo = fpo;
1520
1521        return 0;
1522}
1523
1524static void
1525kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps, struct list_head *zombies)
1526{
 1527        if (fps->fps_net == NULL) /* initialized? */
1528                return;
1529
1530        spin_lock(&fps->fps_lock);
1531
1532        while (!list_empty(&fps->fps_pool_list)) {
1533                kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
1534                                                 kib_fmr_pool_t, fpo_list);
1535                fpo->fpo_failed = 1;
1536                list_del(&fpo->fpo_list);
1537                if (fpo->fpo_map_count == 0)
1538                        list_add(&fpo->fpo_list, zombies);
1539                else
1540                        list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
1541        }
1542
1543        spin_unlock(&fps->fps_lock);
1544}
1545
1546static void
1547kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
1548{
1549        if (fps->fps_net != NULL) { /* initialized? */
1550                kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
1551                kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
1552        }
1553}
1554
1555static int
1556kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, kib_net_t *net,
1557                        int pool_size, int flush_trigger)
1558{
1559        kib_fmr_pool_t *fpo;
1560        int          rc;
1561
1562        memset(fps, 0, sizeof(kib_fmr_poolset_t));
1563
1564        fps->fps_net = net;
1565        fps->fps_cpt = cpt;
1566        fps->fps_pool_size = pool_size;
1567        fps->fps_flush_trigger = flush_trigger;
1568        spin_lock_init(&fps->fps_lock);
1569        INIT_LIST_HEAD(&fps->fps_pool_list);
1570        INIT_LIST_HEAD(&fps->fps_failed_pool_list);
1571
1572        rc = kiblnd_create_fmr_pool(fps, &fpo);
1573        if (rc == 0)
1574                list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1575
1576        return rc;
1577}
1578
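/* A pool may be reaped once it has no mapped FMRs and has either failed or
 * passed its deadline. */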
1579static int
1580kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, cfs_time_t now)
1581{
1582        if (fpo->fpo_map_count != 0) /* still in use */
1583                return 0;
1584        if (fpo->fpo_failed)
1585                return 1;
1586        return cfs_time_aftereq(now, fpo->fpo_deadline);
1587}
1588
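    /*
     * Release an FMR mapping back to its pool.  If the transfer failed
     * (status != 0) the whole FMR pool is flushed so stale mappings are
     * not reused.  Idle, non-persistent pools whose deadline has passed
     * are reaped here as well.
     */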
1589void
1590kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
1591{
1592        LIST_HEAD     (zombies);
1593        kib_fmr_pool_t    *fpo = fmr->fmr_pool;
1594        kib_fmr_poolset_t *fps = fpo->fpo_owner;
1595        cfs_time_t       now = cfs_time_current();
1596        kib_fmr_pool_t    *tmp;
1597        int             rc;
1598
1599        rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
1600        LASSERT (rc == 0);
1601
1602        if (status != 0) {
1603                rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
1604                LASSERT (rc == 0);
1605        }
1606
1607        fmr->fmr_pool = NULL;
1608        fmr->fmr_pfmr = NULL;
1609
1610        spin_lock(&fps->fps_lock);
1611        fpo->fpo_map_count--;   /* decref the pool */
1612
1613        list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
1614                /* the first pool is persistent */
1615                if (fps->fps_pool_list.next == &fpo->fpo_list)
1616                        continue;
1617
1618                if (kiblnd_fmr_pool_is_idle(fpo, now)) {
1619                        list_move(&fpo->fpo_list, &zombies);
1620                        fps->fps_version++;
1621                }
1622        }
1623        spin_unlock(&fps->fps_lock);
1624
1625        if (!list_empty(&zombies))
1626                kiblnd_destroy_fmr_pool_list(&zombies);
1627}
1628
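    /*
     * Map a page array with an FMR from the pool set.  -EAGAIN from
     * ib_fmr_pool_map_phys() means this pool has no free FMR: restart if
     * another pool was added meanwhile, otherwise try the next pool.  If
     * every pool is exhausted, grow the pool set, unless another thread
     * is already doing so or a recent attempt failed (fps_next_retry
     * throttles new attempts).
     */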
1629int
1630kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
1631                    __u64 iov, kib_fmr_t *fmr)
1632{
1633        struct ib_pool_fmr *pfmr;
1634        kib_fmr_pool_t     *fpo;
1635        __u64          version;
1636        int              rc;
1637
1638 again:
1639        spin_lock(&fps->fps_lock);
1640        version = fps->fps_version;
1641        list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
1642                fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1643                fpo->fpo_map_count++;
1644                spin_unlock(&fps->fps_lock);
1645
1646                pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
1647                                            pages, npages, iov);
1648                if (likely(!IS_ERR(pfmr))) {
1649                        fmr->fmr_pool = fpo;
1650                        fmr->fmr_pfmr = pfmr;
1651                        return 0;
1652                }
1653
1654                spin_lock(&fps->fps_lock);
1655                fpo->fpo_map_count--;
1656                if (PTR_ERR(pfmr) != -EAGAIN) {
1657                        spin_unlock(&fps->fps_lock);
1658                        return PTR_ERR(pfmr);
1659                }
1660
1661                /* EAGAIN: retry if the pool list changed, else try the next pool */
1662                if (version != fps->fps_version) {
1663                        spin_unlock(&fps->fps_lock);
1664                        goto again;
1665                }
1666        }
1667
1668        if (fps->fps_increasing) {
1669                spin_unlock(&fps->fps_lock);
1670                CDEBUG(D_NET, "Another thread is allocating new "
1671                       "FMR pool, waiting for it to complete\n");
1672                schedule();
1673                goto again;
1674
1675        }
1676
1677        if (cfs_time_before(cfs_time_current(), fps->fps_next_retry)) {
1678                /* someone failed recently */
1679                spin_unlock(&fps->fps_lock);
1680                return -EAGAIN;
1681        }
1682
1683        fps->fps_increasing = 1;
1684        spin_unlock(&fps->fps_lock);
1685
1686        CDEBUG(D_NET, "Allocate new FMR pool\n");
1687        rc = kiblnd_create_fmr_pool(fps, &fpo);
1688        spin_lock(&fps->fps_lock);
1689        fps->fps_increasing = 0;
1690        if (rc == 0) {
1691                fps->fps_version++;
1692                list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1693        } else {
1694                fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1695        }
1696        spin_unlock(&fps->fps_lock);
1697
1698        goto again;
1699}
1700
1701static void
1702kiblnd_fini_pool(kib_pool_t *pool)
1703{
1704        LASSERT (list_empty(&pool->po_free_list));
1705        LASSERT (pool->po_allocated == 0);
1706
1707        CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
1708}
1709
1710static void
1711kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
1712{
1713        CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
1714
1715        memset(pool, 0, sizeof(kib_pool_t));
1716        INIT_LIST_HEAD(&pool->po_free_list);
1717        pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1718        pool->po_owner    = ps;
1719        pool->po_size     = size;
1720}
1721
1722void
1723kiblnd_destroy_pool_list(struct list_head *head)
1724{
1725        kib_pool_t *pool;
1726
1727        while (!list_empty(head)) {
1728                pool = list_entry(head->next, kib_pool_t, po_list);
1729                list_del(&pool->po_list);
1730
1731                LASSERT (pool->po_owner != NULL);
1732                pool->po_owner->ps_pool_destroy(pool);
1733        }
1734}
1735
1736static void
1737kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
1738{
1739        if (ps->ps_net == NULL) /* initialized? */
1740                return;
1741
1742        spin_lock(&ps->ps_lock);
1743        while (!list_empty(&ps->ps_pool_list)) {
1744                kib_pool_t *po = list_entry(ps->ps_pool_list.next,
1745                                            kib_pool_t, po_list);
1746                po->po_failed = 1;
1747                list_del(&po->po_list);
1748                if (po->po_allocated == 0)
1749                        list_add(&po->po_list, zombies);
1750                else
1751                        list_add(&po->po_list, &ps->ps_failed_pool_list);
1752        }
1753        spin_unlock(&ps->ps_lock);
1754}
1755
1756static void
1757kiblnd_fini_poolset(kib_poolset_t *ps)
1758{
1759        if (ps->ps_net != NULL) { /* initialized? */
1760                kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
1761                kiblnd_destroy_pool_list(&ps->ps_pool_list);
1762        }
1763}
1764
1765static int
1766kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
1767                    kib_net_t *net, char *name, int size,
1768                    kib_ps_pool_create_t po_create,
1769                    kib_ps_pool_destroy_t po_destroy,
1770                    kib_ps_node_init_t nd_init,
1771                    kib_ps_node_fini_t nd_fini)
1772{
1773        kib_pool_t      *pool;
1774        int             rc;
1775
1776        memset(ps, 0, sizeof(kib_poolset_t));
1777
1778        ps->ps_cpt          = cpt;
1779        ps->ps_net        = net;
1780        ps->ps_pool_create  = po_create;
1781        ps->ps_pool_destroy = po_destroy;
1782        ps->ps_node_init    = nd_init;
1783        ps->ps_node_fini    = nd_fini;
1784        ps->ps_pool_size    = size;
1785        if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
1786            >= sizeof(ps->ps_name))
1787                return -E2BIG;
1788        spin_lock_init(&ps->ps_lock);
1789        INIT_LIST_HEAD(&ps->ps_pool_list);
1790        INIT_LIST_HEAD(&ps->ps_failed_pool_list);
1791
1792        rc = ps->ps_pool_create(ps, size, &pool);
1793        if (rc == 0)
1794                list_add(&pool->po_list, &ps->ps_pool_list);
1795        else
1796                CERROR("Failed to create the first pool for %s\n", ps->ps_name);
1797
1798        return rc;
1799}
1800
1801static int
1802kiblnd_pool_is_idle(kib_pool_t *pool, cfs_time_t now)
1803{
1804        if (pool->po_allocated != 0) /* still in use */
1805                return 0;
1806        if (pool->po_failed)
1807                return 1;
1808        return cfs_time_aftereq(now, pool->po_deadline);
1809}
1810
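    /*
     * Return a node to its pool, running the poolset's node_fini callback
     * under ps_lock, then destroy any non-persistent pools that have been
     * idle past their deadline.
     */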
1811void
1812kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
1813{
1814        LIST_HEAD  (zombies);
1815        kib_poolset_t  *ps = pool->po_owner;
1816        kib_pool_t     *tmp;
1817        cfs_time_t      now = cfs_time_current();
1818
1819        spin_lock(&ps->ps_lock);
1820
1821        if (ps->ps_node_fini != NULL)
1822                ps->ps_node_fini(pool, node);
1823
1824        LASSERT (pool->po_allocated > 0);
1825        list_add(node, &pool->po_free_list);
1826        pool->po_allocated--;
1827
1828        list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
1829                /* the first pool is persistent */
1830                if (ps->ps_pool_list.next == &pool->po_list)
1831                        continue;
1832
1833                if (kiblnd_pool_is_idle(pool, now))
1834                        list_move(&pool->po_list, &zombies);
1835        }
1836        spin_unlock(&ps->ps_lock);
1837
1838        if (!list_empty(&zombies))
1839                kiblnd_destroy_pool_list(&zombies);
1840}
1841
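    /*
     * Take a free node from the first pool that has one, growing the pool
     * set when every pool is exhausted.  Returns NULL only if a new pool
     * cannot be created right now (a recent attempt failed).  A typical
     * caller pairs it with kiblnd_pool_free_node(), as in
     * kiblnd_pmr_pool_map() below:
     *
     *      node = kiblnd_pool_alloc_node(&pps->pps_poolset);
     *      ...
     *      kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
     */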
1842struct list_head *
1843kiblnd_pool_alloc_node(kib_poolset_t *ps)
1844{
1845        struct list_head            *node;
1846        kib_pool_t          *pool;
1847        int                 rc;
1848
1849 again:
1850        spin_lock(&ps->ps_lock);
1851        list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
1852                if (list_empty(&pool->po_free_list))
1853                        continue;
1854
1855                pool->po_allocated++;
1856                pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1857                node = pool->po_free_list.next;
1858                list_del(node);
1859
1860                if (ps->ps_node_init != NULL) {
1861                        /* still hold the lock */
1862                        ps->ps_node_init(pool, node);
1863                }
1864                spin_unlock(&ps->ps_lock);
1865                return node;
1866        }
1867
1868        /* no pool has a free node; may need to grow the pool set */
1869        if (ps->ps_increasing) {
1870                /* another thread is allocating a new pool */
1871                spin_unlock(&ps->ps_lock);
1872                CDEBUG(D_NET, "Another thread is allocating new "
1873                       "%s pool, waiting for it to complete\n",
1874                       ps->ps_name);
1875                schedule();
1876                goto again;
1877        }
1878
1879        if (cfs_time_before(cfs_time_current(), ps->ps_next_retry)) {
1880                /* someone failed recently */
1881                spin_unlock(&ps->ps_lock);
1882                return NULL;
1883        }
1884
1885        ps->ps_increasing = 1;
1886        spin_unlock(&ps->ps_lock);
1887
1888        CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
1889
1890        rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
1891
1892        spin_lock(&ps->ps_lock);
1893        ps->ps_increasing = 0;
1894        if (rc == 0) {
1895                list_add_tail(&pool->po_list, &ps->ps_pool_list);
1896        } else {
1897                ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1898                CERROR("Can't allocate new %s pool: out of memory\n",
1899                       ps->ps_name);
1900        }
1901        spin_unlock(&ps->ps_lock);
1902
1903        goto again;
1904}
1905
1906void
1907kiblnd_pmr_pool_unmap(kib_phys_mr_t *pmr)
1908{
1909        kib_pmr_pool_t      *ppo = pmr->pmr_pool;
1910        struct ib_mr    *mr  = pmr->pmr_mr;
1911
1912        pmr->pmr_mr = NULL;
1913        kiblnd_pool_free_node(&ppo->ppo_pool, &pmr->pmr_list);
1914        if (mr != NULL)
1915                ib_dereg_mr(mr);
1916}
1917
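    /*
     * Map an RDMA descriptor with a physical MR taken from the PMR pool
     * set.  Returns -EAGAIN when the pooled descriptor was created on a
     * different (stale) HCA handle, e.g. after a failover.
     */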
1918int
1919kiblnd_pmr_pool_map(kib_pmr_poolset_t *pps, kib_hca_dev_t *hdev,
1920                    kib_rdma_desc_t *rd, __u64 *iova, kib_phys_mr_t **pp_pmr)
1921{
1922        kib_phys_mr_t *pmr;
1923        struct list_head    *node;
1924        int         rc;
1925        int         i;
1926
1927        node = kiblnd_pool_alloc_node(&pps->pps_poolset);
1928        if (node == NULL) {
1929                CERROR("Failed to allocate PMR descriptor\n");
1930                return -ENOMEM;
1931        }
1932
1933        pmr = container_of(node, kib_phys_mr_t, pmr_list);
1934        if (pmr->pmr_pool->ppo_hdev != hdev) {
1935                kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
1936                return -EAGAIN;
1937        }
1938
1939        for (i = 0; i < rd->rd_nfrags; i++) {
1940                pmr->pmr_ipb[i].addr = rd->rd_frags[i].rf_addr;
1941                pmr->pmr_ipb[i].size = rd->rd_frags[i].rf_nob;
1942        }
1943
1944        pmr->pmr_mr = ib_reg_phys_mr(hdev->ibh_pd,
1945                                     pmr->pmr_ipb, rd->rd_nfrags,
1946                                     IB_ACCESS_LOCAL_WRITE |
1947                                     IB_ACCESS_REMOTE_WRITE,
1948                                     iova);
1949        if (!IS_ERR(pmr->pmr_mr)) {
1950                pmr->pmr_iova = *iova;
1951                *pp_pmr = pmr;
1952                return 0;
1953        }
1954
1955        rc = PTR_ERR(pmr->pmr_mr);
1956        CERROR("Failed ib_reg_phys_mr: %d\n", rc);
1957
1958        pmr->pmr_mr = NULL;
1959        kiblnd_pool_free_node(&pmr->pmr_pool->ppo_pool, node);
1960
1961        return rc;
1962}
1963
1964static void
1965kiblnd_destroy_pmr_pool(kib_pool_t *pool)
1966{
1967        kib_pmr_pool_t *ppo = container_of(pool, kib_pmr_pool_t, ppo_pool);
1968        kib_phys_mr_t  *pmr;
1969
1970        LASSERT (pool->po_allocated == 0);
1971
1972        while (!list_empty(&pool->po_free_list)) {
1973                pmr = list_entry(pool->po_free_list.next,
1974                                     kib_phys_mr_t, pmr_list);
1975
1976                LASSERT (pmr->pmr_mr == NULL);
1977                list_del(&pmr->pmr_list);
1978
1979                if (pmr->pmr_ipb != NULL) {
1980                        LIBCFS_FREE(pmr->pmr_ipb,
1981                                    IBLND_MAX_RDMA_FRAGS *
1982                                    sizeof(struct ib_phys_buf));
1983                }
1984
1985                LIBCFS_FREE(pmr, sizeof(kib_phys_mr_t));
1986        }
1987
1988        kiblnd_fini_pool(pool);
1989        if (ppo->ppo_hdev != NULL)
1990                kiblnd_hdev_decref(ppo->ppo_hdev);
1991
1992        LIBCFS_FREE(ppo, sizeof(kib_pmr_pool_t));
1993}
1994
1995static inline int kiblnd_pmr_pool_size(int ncpts)
1996{
1997        int size = *kiblnd_tunables.kib_pmr_pool_size / ncpts;
1998
1999        return max(IBLND_PMR_POOL, size);
2000}
2001
2002static int
2003kiblnd_create_pmr_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
2004{
2005        struct kib_pmr_pool     *ppo;
2006        struct kib_pool         *pool;
2007        kib_phys_mr_t           *pmr;
2008        int                     i;
2009
2010        LIBCFS_CPT_ALLOC(ppo, lnet_cpt_table(),
2011                         ps->ps_cpt, sizeof(kib_pmr_pool_t));
2012        if (ppo == NULL) {
2013                CERROR("Failed to allocate PMR pool\n");
2014                return -ENOMEM;
2015        }
2016
2017        pool = &ppo->ppo_pool;
2018        kiblnd_init_pool(ps, pool, size);
2019
2020        for (i = 0; i < size; i++) {
2021                LIBCFS_CPT_ALLOC(pmr, lnet_cpt_table(),
2022                                 ps->ps_cpt, sizeof(kib_phys_mr_t));
2023                if (pmr == NULL)
2024                        break;
2025
2026                pmr->pmr_pool = ppo;
2027                /* free-list first: destroy path reclaims pmr on failure */
2028                list_add(&pmr->pmr_list, &pool->po_free_list);
2029                LIBCFS_CPT_ALLOC(pmr->pmr_ipb, lnet_cpt_table(), ps->ps_cpt,
2030                                 IBLND_MAX_RDMA_FRAGS * sizeof(*pmr->pmr_ipb));
2031                if (pmr->pmr_ipb == NULL)
2032                        break;
2033        }
2034
2035        if (i < size) {
2036                ps->ps_pool_destroy(pool);
2037                return -ENOMEM;
2038        }
2039
2040        ppo->ppo_hdev = kiblnd_current_hdev(ps->ps_net->ibn_dev);
2041        *pp_po = pool;
2042        return 0;
2043}
2044
2045static void
2046kiblnd_destroy_tx_pool(kib_pool_t *pool)
2047{
2048        kib_tx_pool_t  *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
2049        int          i;
2050
2051        LASSERT (pool->po_allocated == 0);
2052
2053        if (tpo->tpo_tx_pages != NULL) {
2054                kiblnd_unmap_tx_pool(tpo);
2055                kiblnd_free_pages(tpo->tpo_tx_pages);
2056        }
2057
2058        if (tpo->tpo_tx_descs == NULL)
2059                goto out;
2060
2061        for (i = 0; i < pool->po_size; i++) {
2062                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
2063
2064                list_del(&tx->tx_list);
2065                if (tx->tx_pages != NULL)
2066                        LIBCFS_FREE(tx->tx_pages,
2067                                    LNET_MAX_IOV *
2068                                    sizeof(*tx->tx_pages));
2069                if (tx->tx_frags != NULL)
2070                        LIBCFS_FREE(tx->tx_frags,
2071                                    IBLND_MAX_RDMA_FRAGS *
2072                                            sizeof(*tx->tx_frags));
2073                if (tx->tx_wrq != NULL)
2074                        LIBCFS_FREE(tx->tx_wrq,
2075                                    (1 + IBLND_MAX_RDMA_FRAGS) *
2076                                    sizeof(*tx->tx_wrq));
2077                if (tx->tx_sge != NULL)
2078                        LIBCFS_FREE(tx->tx_sge,
2079                                    (1 + IBLND_MAX_RDMA_FRAGS) *
2080                                    sizeof(*tx->tx_sge));
2081                if (tx->tx_rd != NULL)
2082                        LIBCFS_FREE(tx->tx_rd,
2083                                    offsetof(kib_rdma_desc_t,
2084                                             rd_frags[IBLND_MAX_RDMA_FRAGS]));
2085        }
2086
2087        LIBCFS_FREE(tpo->tpo_tx_descs,
2088                    pool->po_size * sizeof(kib_tx_t));
2089out:
2090        kiblnd_fini_pool(pool);
2091        LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
2092}
2093
2094static int kiblnd_tx_pool_size(int ncpts)
2095{
2096        int ntx = *kiblnd_tunables.kib_ntx / ncpts;
2097
2098        return max(IBLND_TX_POOL, ntx);
2099}
2100
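    /*
     * Create a TX pool: a block of pages for message buffers plus, for
     * each descriptor, the page, fragment, work-request, SGE and RDMA
     * descriptor arrays (tx_pages only when FMR is in use).  A partial
     * allocation is unwound through ps_pool_destroy().
     */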
2101static int
2102kiblnd_create_tx_pool(kib_poolset_t *ps, int size, kib_pool_t **pp_po)
2103{
2104        int         i;
2105        int         npg;
2106        kib_pool_t    *pool;
2107        kib_tx_pool_t *tpo;
2108
2109        LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
2110        if (tpo == NULL) {
2111                CERROR("Failed to allocate TX pool\n");
2112                return -ENOMEM;
2113        }
2114
2115        pool = &tpo->tpo_pool;
2116        kiblnd_init_pool(ps, pool, size);
2117        tpo->tpo_tx_descs = NULL;
2118        tpo->tpo_tx_pages = NULL;
2119
2120        npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
2121        if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg) != 0) {
2122                CERROR("Can't allocate tx pages: %d\n", npg);
2123                LIBCFS_FREE(tpo, sizeof(kib_tx_pool_t));
2124                return -ENOMEM;
2125        }
2126
2127        LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
2128                         size * sizeof(kib_tx_t));
2129        if (tpo->tpo_tx_descs == NULL) {
2130                CERROR("Can't allocate %d tx descriptors\n", size);
2131                ps->ps_pool_destroy(pool);
2132                return -ENOMEM;
2133        }
2134
2135        memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
2136
2137        for (i = 0; i < size; i++) {
2138                kib_tx_t *tx = &tpo->tpo_tx_descs[i];
2139
2140                tx->tx_pool = tpo;
2141                if (ps->ps_net->ibn_fmr_ps != NULL) {
2142                        LIBCFS_CPT_ALLOC(tx->tx_pages,
2143                                         lnet_cpt_table(), ps->ps_cpt,
2144                                         LNET_MAX_IOV * sizeof(*tx->tx_pages));
2145                        if (tx->tx_pages == NULL)
2146                                break;
2147                }
2148
2149                LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
2150                                 IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags));
2151                if (tx->tx_frags == NULL)
2152                        break;
2153
2154                sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS);
2155
2156                LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
2157                                 (1 + IBLND_MAX_RDMA_FRAGS) *
2158                                 sizeof(*tx->tx_wrq));
2159                if (tx->tx_wrq == NULL)
2160                        break;
2161
2162                LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
2163                                 (1 + IBLND_MAX_RDMA_FRAGS) *
2164                                 sizeof(*tx->tx_sge));
2165                if (tx->tx_sge == NULL)
2166                        break;
2167
2168                LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
2169                                 offsetof(kib_rdma_desc_t,
2170                                          rd_frags[IBLND_MAX_RDMA_FRAGS]));
2171                if (tx->tx_rd == NULL)
2172                        break;
2173        }
2174
2175        if (i == size) {
2176                kiblnd_map_tx_pool(tpo);
2177                *pp_po = pool;
2178                return 0;
2179        }
2180
2181        ps->ps_pool_destroy(pool);
2182        return -ENOMEM;
2183}
2184
2185static void
2186kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
2187{
2188        kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
2189                                             tps_poolset);
2190        kib_tx_t         *tx  = list_entry(node, kib_tx_t, tx_list);
2191
2192        tx->tx_cookie = tps->tps_next_tx_cookie++;
2193}
2194
2195void
2196kiblnd_net_fini_pools(kib_net_t *net)
2197{
2198        int     i;
2199
2200        cfs_cpt_for_each(i, lnet_cpt_table()) {
2201                kib_tx_poolset_t        *tps;
2202                kib_fmr_poolset_t       *fps;
2203                kib_pmr_poolset_t       *pps;
2204
2205                if (net->ibn_tx_ps != NULL) {
2206                        tps = net->ibn_tx_ps[i];
2207                        kiblnd_fini_poolset(&tps->tps_poolset);
2208                }
2209
2210                if (net->ibn_fmr_ps != NULL) {
2211                        fps = net->ibn_fmr_ps[i];
2212                        kiblnd_fini_fmr_poolset(fps);
2213                }
2214
2215                if (net->ibn_pmr_ps != NULL) {
2216                        pps = net->ibn_pmr_ps[i];
2217                        kiblnd_fini_poolset(&pps->pps_poolset);
2218                }
2219        }
2220
2221        if (net->ibn_tx_ps != NULL) {
2222                cfs_percpt_free(net->ibn_tx_ps);
2223                net->ibn_tx_ps = NULL;
2224        }
2225
2226        if (net->ibn_fmr_ps != NULL) {
2227                cfs_percpt_free(net->ibn_fmr_ps);
2228                net->ibn_fmr_ps = NULL;
2229        }
2230
2231        if (net->ibn_pmr_ps != NULL) {
2232                cfs_percpt_free(net->ibn_pmr_ps);
2233                net->ibn_pmr_ps = NULL;
2234        }
2235}
2236
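    /*
     * Set up the per-CPT pools for this net: FMR pools when the HCA
     * supports FMR, otherwise PMR pools, and finally the TX pools.  When
     * map-on-demand is off and a single MR covers all memory, only the
     * TX pools are created.
     */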
2237int
2238kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
2239{
2240        unsigned long   flags;
2241        int             cpt;
2242        int             rc;
2243        int             i;
2244
2245        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2246        if (*kiblnd_tunables.kib_map_on_demand == 0 &&
2247            net->ibn_dev->ibd_hdev->ibh_nmrs == 1) {
2248                read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
2249                                           flags);
2250                goto create_tx_pool;
2251        }
2252
2253        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2254
2255        if (*kiblnd_tunables.kib_fmr_pool_size <
2256            *kiblnd_tunables.kib_ntx / 4) {
2257                CERROR("Can't set fmr pool size (%d) < ntx / 4 (%d)\n",
2258                       *kiblnd_tunables.kib_fmr_pool_size,
2259                       *kiblnd_tunables.kib_ntx / 4);
2260                rc = -EINVAL;
2261                goto failed;
2262        }
2263
2264        /* TX pool must be created later than FMR/PMR, see LU-2268
2265         * for details */
2266        LASSERT(net->ibn_tx_ps == NULL);
2267
2268        /* premapping can fail if ibd_nmr > 1, so we always create an
2269         * FMR/PMR pool and fall back to map-on-demand if premapping failed */
2270
2271        net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
2272                                           sizeof(kib_fmr_poolset_t));
2273        if (net->ibn_fmr_ps == NULL) {
2274                CERROR("Failed to allocate FMR pool array\n");
2275                rc = -ENOMEM;
2276                goto failed;
2277        }
2278
2279        for (i = 0; i < ncpts; i++) {
2280                cpt = (cpts == NULL) ? i : cpts[i];
2281                rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
2282                                             kiblnd_fmr_pool_size(ncpts),
2283                                             kiblnd_fmr_flush_trigger(ncpts));
2284                if (rc == -ENOSYS && i == 0) /* no FMR */
2285                        break; /* create PMR pool */
2286
2287                if (rc != 0) { /* a real error */
2288                        CERROR("Can't initialize FMR pool for CPT %d: %d\n",
2289                               cpt, rc);
2290                        goto failed;
2291                }
2292        }
2293
2294        if (i > 0) {
2295                LASSERT(i == ncpts);
2296                goto create_tx_pool;
2297        }
2298
2299        cfs_percpt_free(net->ibn_fmr_ps);
2300        net->ibn_fmr_ps = NULL;
2301
2302        CWARN("Device does not support FMR, falling back to PMR\n");
2303
2304        if (*kiblnd_tunables.kib_pmr_pool_size <
2305            *kiblnd_tunables.kib_ntx / 4) {
2306                CERROR("Can't set pmr pool size (%d) < ntx / 4 (%d)\n",
2307                       *kiblnd_tunables.kib_pmr_pool_size,
2308                       *kiblnd_tunables.kib_ntx / 4);
2309                rc = -EINVAL;
2310                goto failed;
2311        }
2312
2313        net->ibn_pmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
2314                                           sizeof(kib_pmr_poolset_t));
2315        if (net->ibn_pmr_ps == NULL) {
2316                CERROR("Failed to allocate PMR pool array\n");
2317                rc = -ENOMEM;
2318                goto failed;
2319        }
2320
2321        for (i = 0; i < ncpts; i++) {
2322                cpt = (cpts == NULL) ? i : cpts[i];
2323                rc = kiblnd_init_poolset(&net->ibn_pmr_ps[cpt]->pps_poolset,
2324                                         cpt, net, "PMR",
2325                                         kiblnd_pmr_pool_size(ncpts),
2326                                         kiblnd_create_pmr_pool,
2327                                         kiblnd_destroy_pmr_pool, NULL, NULL);
2328                if (rc != 0) {
2329                        CERROR("Can't initialize PMR pool for CPT %d: %d\n",
2330                               cpt, rc);
2331                        goto failed;
2332                }
2333        }
2334
2335 create_tx_pool:
2336        net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
2337                                          sizeof(kib_tx_poolset_t));
2338        if (net->ibn_tx_ps == NULL) {
2339                CERROR("Failed to allocate tx pool array\n");
2340                rc = -ENOMEM;
2341                goto failed;
2342        }
2343
2344        for (i = 0; i < ncpts; i++) {
2345                cpt = (cpts == NULL) ? i : cpts[i];
2346                rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
2347                                         cpt, net, "TX",
2348                                         kiblnd_tx_pool_size(ncpts),
2349                                         kiblnd_create_tx_pool,
2350                                         kiblnd_destroy_tx_pool,
2351                                         kiblnd_tx_init, NULL);
2352                if (rc != 0) {
2353                        CERROR("Can't initialize TX pool for CPT %d: %d\n",
2354                               cpt, rc);
2355                        goto failed;
2356                }
2357        }
2358
2359        return 0;
2360 failed:
2361        kiblnd_net_fini_pools(net);
2362        LASSERT(rc != 0);
2363        return rc;
2364}
2365
2366static int
2367kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
2368{
2369        struct ib_device_attr *attr;
2370        int                 rc;
2371
2372        /* It's safe to assume an HCA can handle a page size
2373         * matching that of the native system */
2374        hdev->ibh_page_shift = PAGE_SHIFT;
2375        hdev->ibh_page_size  = 1 << PAGE_SHIFT;
2376        hdev->ibh_page_mask  = ~((__u64)hdev->ibh_page_size - 1);
2377
2378        LIBCFS_ALLOC(attr, sizeof(*attr));
2379        if (attr == NULL) {
2380                CERROR("Out of memory\n");
2381                return -ENOMEM;
2382        }
2383
2384        rc = ib_query_device(hdev->ibh_ibdev, attr);
2385        if (rc == 0)
2386                hdev->ibh_mr_size = attr->max_mr_size;
2387
2388        LIBCFS_FREE(attr, sizeof(*attr));
2389
2390        if (rc != 0) {
2391                CERROR("Failed to query IB device: %d\n", rc);
2392                return rc;
2393        }
2394
2395        if (hdev->ibh_mr_size == ~0ULL) {
2396                hdev->ibh_mr_shift = 64;
2397                return 0;
2398        }
2399
2400        for (hdev->ibh_mr_shift = 0;
2401             hdev->ibh_mr_shift < 64; hdev->ibh_mr_shift++) {
2402                if (hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) ||
2403                    hdev->ibh_mr_size == (1ULL << hdev->ibh_mr_shift) - 1)
2404                        return 0;
2405        }
2406
2407        CERROR("Invalid mr size: "LPX64"\n", hdev->ibh_mr_size);
2408        return -EINVAL;
2409}
2410
2411void
2412kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
2413{
2414        int     i;
2415
2416        if (hdev->ibh_nmrs == 0 || hdev->ibh_mrs == NULL)
2417                return;
2418
2419        for (i = 0; i < hdev->ibh_nmrs; i++) {
2420                if (hdev->ibh_mrs[i] == NULL)
2421                        break;
2422
2423                ib_dereg_mr(hdev->ibh_mrs[i]);
2424        }
2425
2426        LIBCFS_FREE(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
2427        hdev->ibh_mrs  = NULL;
2428        hdev->ibh_nmrs = 0;
2429}
2430
2431void
2432kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
2433{
2434        kiblnd_hdev_cleanup_mrs(hdev);
2435
2436        if (hdev->ibh_pd != NULL)
2437                ib_dealloc_pd(hdev->ibh_pd);
2438
2439        if (hdev->ibh_cmid != NULL)
2440                rdma_destroy_id(hdev->ibh_cmid);
2441
2442        LIBCFS_FREE(hdev, sizeof(*hdev));
2443}
2444
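    /*
     * Register memory regions for the HCA.  If one MR can cover all
     * memory (ibh_mr_shift == 64) a single DMA MR is used; otherwise an
     * array of physical MRs, each mr_size bytes, is registered to span
     * host memory.
     */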
2445int
2446kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
2447{
2448        struct ib_mr *mr;
2449        int        i;
2450        int        rc;
2451        __u64    mm_size;
2452        __u64    mr_size;
2453        int        acflags = IB_ACCESS_LOCAL_WRITE |
2454                                IB_ACCESS_REMOTE_WRITE;
2455
2456        rc = kiblnd_hdev_get_attr(hdev);
2457        if (rc != 0)
2458                return rc;
2459
2460        if (hdev->ibh_mr_shift == 64) {
2461                LIBCFS_ALLOC(hdev->ibh_mrs, 1 * sizeof(*hdev->ibh_mrs));
2462                if (hdev->ibh_mrs == NULL) {
2463                        CERROR("Failed to allocate MRs table\n");
2464                        return -ENOMEM;
2465                }
2466
2467                hdev->ibh_mrs[0] = NULL;
2468                hdev->ibh_nmrs   = 1;
2469
2470                mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
2471                if (IS_ERR(mr)) {
2472                        CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
2473                        kiblnd_hdev_cleanup_mrs(hdev);
2474                        return PTR_ERR(mr);
2475                }
2476
2477                hdev->ibh_mrs[0] = mr;
2478
2479                goto out;
2480        }
2481
2482        mr_size = (1ULL << hdev->ibh_mr_shift);
2483        mm_size = (unsigned long)high_memory - PAGE_OFFSET;
2484
2485        hdev->ibh_nmrs = (int)((mm_size + mr_size - 1) >> hdev->ibh_mr_shift);
2486
2487        if (hdev->ibh_mr_shift < 32 || hdev->ibh_nmrs > 1024) {
2488                /* it's 4T..., assume we will re-code at that time */
2489                CERROR("Can't support memory size: x"LPX64
2490                       " with MR size: x"LPX64"\n", mm_size, mr_size);
2491                return -EINVAL;
2492        }
2493
2494        /* create an array of MRs to cover all memory */
2495        LIBCFS_ALLOC(hdev->ibh_mrs, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
2496        if (hdev->ibh_mrs == NULL) {
2497                CERROR("Failed to allocate MRs' table\n");
2498                return -ENOMEM;
2499        }
2500
2501        memset(hdev->ibh_mrs, 0, sizeof(*hdev->ibh_mrs) * hdev->ibh_nmrs);
2502
2503        for (i = 0; i < hdev->ibh_nmrs; i++) {
2504                struct ib_phys_buf ipb;
2505                __u64         iova;
2506
2507                ipb.size = hdev->ibh_mr_size;
2508                ipb.addr = i * mr_size;
2509                iova     = ipb.addr;
2510
2511                mr = ib_reg_phys_mr(hdev->ibh_pd, &ipb, 1, acflags, &iova);
2512                if (IS_ERR(mr)) {
2513                        CERROR("Failed ib_reg_phys_mr addr "LPX64
2514                               " size "LPX64" : %ld\n",
2515                               ipb.addr, ipb.size, PTR_ERR(mr));
2516                        kiblnd_hdev_cleanup_mrs(hdev);
2517                        return PTR_ERR(mr);
2518                }
2519
2520                LASSERT (iova == ipb.addr);
2521
2522                hdev->ibh_mrs[i] = mr;
2523        }
2524
2525out:
2526        if (hdev->ibh_mr_size != ~0ULL || hdev->ibh_nmrs != 1)
2527                LCONSOLE_INFO("Register global MR array, MR size: "
2528                              LPX64", array size: %d\n",
2529                              hdev->ibh_mr_size, hdev->ibh_nmrs);
2530        return 0;
2531}
2532
2533static int
2534kiblnd_dummy_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
2535{       /* DUMMY */
2536        return 0;
2537}
2538
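    /*
     * Check whether the device must fail over: returns > 0 if failover is
     * needed (or forced via the dev_failover tunable), 0 if the currently
     * bound HCA is still the right one, and < 0 on error.
     */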
2539static int
2540kiblnd_dev_need_failover(kib_dev_t *dev)
2541{
2542        struct rdma_cm_id  *cmid;
2543        struct sockaddr_in  srcaddr;
2544        struct sockaddr_in  dstaddr;
2545        int              rc;
2546
2547        if (dev->ibd_hdev == NULL || /* initializing */
2548            dev->ibd_hdev->ibh_cmid == NULL || /* listener is dead */
2549            *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
2550                return 1;
2551
2552        /* XXX: it's UGLY, but I don't have a better way to find
2553         * ib-bonding HCA failover because:
2554         *
2555         * a. no reliable CM event for HCA failover...
2556         * b. no OFED API to get ib_device for current net_device...
2557         *
2558         * We have only two choices at this point:
2559         *
2560         * a. rdma_bind_addr(), it will conflict with listener cmid
2561         * b. rdma_resolve_addr() to zero addr */
2562        cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
2563                                     IB_QPT_RC);
2564        if (IS_ERR(cmid)) {
2565                rc = PTR_ERR(cmid);
2566                CERROR("Failed to create cmid for failover: %d\n", rc);
2567                return rc;
2568        }
2569
2570        memset(&srcaddr, 0, sizeof(srcaddr));
2571        srcaddr.sin_family      = AF_INET;
2572        srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2573
2574        memset(&dstaddr, 0, sizeof(dstaddr));
2575        dstaddr.sin_family = AF_INET;
2576        rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
2577                               (struct sockaddr *)&dstaddr, 1);
2578        if (rc != 0 || cmid->device == NULL) {
2579                CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2580                       dev->ibd_ifname, &dev->ibd_ifip,
2581                       cmid->device, rc);
2582                rdma_destroy_id(cmid);
2583                return rc;
2584        }
2585
2586        if (dev->ibd_hdev->ibh_ibdev == cmid->device) {
2587                /* don't need device failover */
2588                rdma_destroy_id(cmid);
2589                return 0;
2590        }
2591
2592        return 1;
2593}
2594
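    /*
     * Fail over to the current HCA for this interface: destroy the old
     * listener, bind a new cmid to the interface address, allocate a PD
     * and MRs on the new device, swap in the new kib_hca_dev, then fail
     * all pools that still reference the old one so they are rebuilt on
     * demand.
     */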
2595int
2596kiblnd_dev_failover(kib_dev_t *dev)
2597{
2598        LIST_HEAD      (zombie_tpo);
2599        LIST_HEAD      (zombie_ppo);
2600        LIST_HEAD      (zombie_fpo);
2601        struct rdma_cm_id  *cmid  = NULL;
2602        kib_hca_dev_t      *hdev  = NULL;
2603        kib_hca_dev_t      *old;
2604        struct ib_pd       *pd;
2605        kib_net_t         *net;
2606        struct sockaddr_in  addr;
2607        unsigned long       flags;
2608        int              rc = 0;
2609        int                 i;
2610
2611        LASSERT (*kiblnd_tunables.kib_dev_failover > 1 ||
2612                 dev->ibd_can_failover ||
2613                 dev->ibd_hdev == NULL);
2614
2615        rc = kiblnd_dev_need_failover(dev);
2616        if (rc <= 0)
2617                goto out;
2618
2619        if (dev->ibd_hdev != NULL &&
2620            dev->ibd_hdev->ibh_cmid != NULL) {
2621                /* XXX it's not good to close the old listener here,
2622                 * because we can fail to create a new one.
2623                 * But we have to close it now, otherwise rdma_bind_addr
2624                 * will return EADDRINUSE... How crap! */
2625                write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2626
2627                cmid = dev->ibd_hdev->ibh_cmid;
2628                /* make the next call to kiblnd_dev_need_failover()
2629                 * return 1 for me */
2630                dev->ibd_hdev->ibh_cmid  = NULL;
2631                write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2632
2633                rdma_destroy_id(cmid);
2634        }
2635
2636        cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
2637                                     IB_QPT_RC);
2638        if (IS_ERR(cmid)) {
2639                rc = PTR_ERR(cmid);
2640                CERROR("Failed to create cmid for failover: %d\n", rc);
2641                goto out;
2642        }
2643
2644        memset(&addr, 0, sizeof(addr));
2645        addr.sin_family      = AF_INET;
2646        addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2647        addr.sin_port   = htons(*kiblnd_tunables.kib_service);
2648
2649        /* Bind to failover device or port */
2650        rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
2651        if (rc != 0 || cmid->device == NULL) {
2652                CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2653                       dev->ibd_ifname, &dev->ibd_ifip,
2654                       cmid->device, rc);
2655                rdma_destroy_id(cmid);
2656                goto out;
2657        }
2658
2659        LIBCFS_ALLOC(hdev, sizeof(*hdev));
2660        if (hdev == NULL) {
2661                CERROR("Failed to allocate kib_hca_dev\n");
2662                rdma_destroy_id(cmid);
2663                rc = -ENOMEM;
2664                goto out;
2665        }
2666
2667        atomic_set(&hdev->ibh_ref, 1);
2668        hdev->ibh_dev   = dev;
2669        hdev->ibh_cmid  = cmid;
2670        hdev->ibh_ibdev = cmid->device;
2671
2672        pd = ib_alloc_pd(cmid->device);
2673        if (IS_ERR(pd)) {
2674                rc = PTR_ERR(pd);
2675                CERROR("Can't allocate PD: %d\n", rc);
2676                goto out;
2677        }
2678
2679        hdev->ibh_pd = pd;
2680
2681        rc = rdma_listen(cmid, 0);
2682        if (rc != 0) {
2683                CERROR("Can't start new listener: %d\n", rc);
2684                goto out;
2685        }
2686
2687        rc = kiblnd_hdev_setup_mrs(hdev);
2688        if (rc != 0) {
2689                CERROR("Can't setup device: %d\n", rc);
2690                goto out;
2691        }
2692
2693        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2694
2695        old = dev->ibd_hdev;
2696        dev->ibd_hdev = hdev; /* take over the refcount */
2697        hdev = old;
2698
2699        list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
2700                cfs_cpt_for_each(i, lnet_cpt_table()) {
2701                        kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
2702                                            &zombie_tpo);
2703
2704                        if (net->ibn_fmr_ps != NULL) {
2705                                kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
2706                                                        &zombie_fpo);
2707
2708                        } else if (net->ibn_pmr_ps != NULL) {
2709                                kiblnd_fail_poolset(&net->ibn_pmr_ps[i]->
2710                                                    pps_poolset, &zombie_ppo);
2711                        }
2712                }
2713        }
2714
2715        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2716 out:
2717        if (!list_empty(&zombie_tpo))
2718                kiblnd_destroy_pool_list(&zombie_tpo);
2719        if (!list_empty(&zombie_ppo))
2720                kiblnd_destroy_pool_list(&zombie_ppo);
2721        if (!list_empty(&zombie_fpo))
2722                kiblnd_destroy_fmr_pool_list(&zombie_fpo);
2723        if (hdev != NULL)
2724                kiblnd_hdev_decref(hdev);
2725
2726        if (rc != 0)
2727                dev->ibd_failed_failover++;
2728        else
2729                dev->ibd_failed_failover = 0;
2730
2731        return rc;
2732}
2733
2734void
2735kiblnd_destroy_dev (kib_dev_t *dev)
2736{
2737        LASSERT (dev->ibd_nnets == 0);
2738        LASSERT (list_empty(&dev->ibd_nets));
2739
2740        list_del(&dev->ibd_fail_list);
2741        list_del(&dev->ibd_list);
2742
2743        if (dev->ibd_hdev != NULL)
2744                kiblnd_hdev_decref(dev->ibd_hdev);
2745
2746        LIBCFS_FREE(dev, sizeof(*dev));
2747}
2748
2749kib_dev_t *
2750kiblnd_create_dev(char *ifname)
2751{
2752        struct net_device *netdev;
2753        kib_dev_t        *dev;
2754        __u32         netmask;
2755        __u32         ip;
2756        int             up;
2757        int             rc;
2758
2759        rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
2760        if (rc != 0) {
2761                CERROR("Can't query IPoIB interface %s: %d\n",
2762                       ifname, rc);
2763                return NULL;
2764        }
2765
2766        if (!up) {
2767                CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
2768                return NULL;
2769        }
2770
2771        LIBCFS_ALLOC(dev, sizeof(*dev));
2772        if (dev == NULL)
2773                return NULL;
2774
2775        memset(dev, 0, sizeof(*dev));
2776        netdev = dev_get_by_name(&init_net, ifname);
2777        if (netdev == NULL) {
2778                dev->ibd_can_failover = 0;
2779        } else {
2780                dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
2781                dev_put(netdev);
2782        }
2783
2784        INIT_LIST_HEAD(&dev->ibd_nets);
2785        INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
2786        INIT_LIST_HEAD(&dev->ibd_fail_list);
2787        dev->ibd_ifip = ip;
2788        strcpy(&dev->ibd_ifname[0], ifname);
2789
2790        /* initialize the device */
2791        rc = kiblnd_dev_failover(dev);
2792        if (rc != 0) {
2793                CERROR("Can't initialize device: %d\n", rc);
2794                LIBCFS_FREE(dev, sizeof(*dev));
2795                return NULL;
2796        }
2797
2798        list_add_tail(&dev->ibd_list,
2799                          &kiblnd_data.kib_devs);
2800        return dev;
2801}
2802
2803void
2804kiblnd_base_shutdown(void)
2805{
2806        struct kib_sched_info   *sched;
2807        int                     i;
2808
2809        LASSERT (list_empty(&kiblnd_data.kib_devs));
2810
2811        CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
2812               atomic_read(&libcfs_kmemory));
2813
2814        switch (kiblnd_data.kib_init) {
2815        default:
2816                LBUG();
2817
2818        case IBLND_INIT_ALL:
2819        case IBLND_INIT_DATA:
2820                LASSERT (kiblnd_data.kib_peers != NULL);
2821                for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
2822                        LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
2823                }
2824                LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
2825                LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
2826
2827                /* flag threads to terminate; wake and wait for them to die */
2828                kiblnd_data.kib_shutdown = 1;
2829
2830                /* NB: we really want to stop scheduler threads net by net
2831                 * instead of the whole module; this should be improved
2832                 * when LNet supports dynamic configuration */
2833                cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
2834                        wake_up_all(&sched->ibs_waitq);
2835
2836                wake_up_all(&kiblnd_data.kib_connd_waitq);
2837                wake_up_all(&kiblnd_data.kib_failover_waitq);
2838
2839                i = 2;
2840                while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
2841                        i++;
2842                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
2843                               "Waiting for %d threads to terminate\n",
2844                               atomic_read(&kiblnd_data.kib_nthreads));
2845                        cfs_pause(cfs_time_seconds(1));
2846                }
2847
2848                /* fall through */
2849
2850        case IBLND_INIT_NOTHING:
2851                break;
2852        }
2853
2854        if (kiblnd_data.kib_peers != NULL) {
2855                LIBCFS_FREE(kiblnd_data.kib_peers,
2856                            sizeof(struct list_head) *
2857                            kiblnd_data.kib_peer_hash_size);
2858        }
2859
2860        if (kiblnd_data.kib_scheds != NULL)
2861                cfs_percpt_free(kiblnd_data.kib_scheds);
2862
2863        CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
2864               atomic_read(&libcfs_kmemory));
2865
2866        kiblnd_data.kib_init = IBLND_INIT_NOTHING;
2867        module_put(THIS_MODULE);
2868}
2869
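    /*
     * Tear down one net: flag it as shutting down, delete its peers and
     * wait for them to disconnect, free its pools, unlink it from the
     * device and free the device itself once no nets remain.
     */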
2870void
2871kiblnd_shutdown (lnet_ni_t *ni)
2872{
2873        kib_net_t       *net = ni->ni_data;
2874        rwlock_t     *g_lock = &kiblnd_data.kib_global_lock;
2875        int            i;
2876        unsigned long     flags;
2877
2878        LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
2879
2880        if (net == NULL)
2881                goto out;
2882
2883        CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
2884               atomic_read(&libcfs_kmemory));
2885
2886        write_lock_irqsave(g_lock, flags);
2887        net->ibn_shutdown = 1;
2888        write_unlock_irqrestore(g_lock, flags);
2889
2890        switch (net->ibn_init) {
2891        default:
2892                LBUG();
2893
2894        case IBLND_INIT_ALL:
2895                /* nuke all existing peers within this net */
2896                kiblnd_del_peer(ni, LNET_NID_ANY);
2897
2898                /* Wait for all peer state to clean up */
2899                i = 2;
2900                while (atomic_read(&net->ibn_npeers) != 0) {
2901                        i++;
2902                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
2903                               "%s: waiting for %d peers to disconnect\n",
2904                               libcfs_nid2str(ni->ni_nid),
2905                               atomic_read(&net->ibn_npeers));
2906                        cfs_pause(cfs_time_seconds(1));
2907                }
2908
2909                kiblnd_net_fini_pools(net);
2910
2911                write_lock_irqsave(g_lock, flags);
2912                LASSERT(net->ibn_dev->ibd_nnets > 0);
2913                net->ibn_dev->ibd_nnets--;
2914                list_del(&net->ibn_list);
2915                write_unlock_irqrestore(g_lock, flags);
2916
2917                /* fall through */
2918
2919        case IBLND_INIT_NOTHING:
2920                LASSERT (atomic_read(&net->ibn_nconns) == 0);
2921
2922                if (net->ibn_dev != NULL &&
2923                    net->ibn_dev->ibd_nnets == 0)
2924                        kiblnd_destroy_dev(net->ibn_dev);
2925
2926                break;
2927        }
2928
2929        CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
2930               atomic_read(&libcfs_kmemory));
2931
2932        net->ibn_init = IBLND_INIT_NOTHING;
2933        ni->ni_data = NULL;
2934
2935        LIBCFS_FREE(net, sizeof(*net));
2936
2937out:
2938        if (list_empty(&kiblnd_data.kib_devs))
2939                kiblnd_base_shutdown();
2940        return;
2941}
2942
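    /*
     * Module-wide startup: peer hash table, connd/failover wait queues
     * and per-CPT scheduler state, then the connd thread and, if device
     * failover is enabled, the failover thread.  Scheduler threads are
     * started later, per device, by kiblnd_dev_start_threads().
     */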
2943int
2944kiblnd_base_startup(void)
2945{
2946        struct kib_sched_info   *sched;
2947        int                     rc;
2948        int                     i;
2949
2950        LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
2951
2952        try_module_get(THIS_MODULE);
2953        memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
2954
2955        rwlock_init(&kiblnd_data.kib_global_lock);
2956
2957        INIT_LIST_HEAD(&kiblnd_data.kib_devs);
2958        INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
2959
2960        kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
2961        LIBCFS_ALLOC(kiblnd_data.kib_peers,
2962                     sizeof(struct list_head) *
2963                            kiblnd_data.kib_peer_hash_size);
2964        if (kiblnd_data.kib_peers == NULL) {
2965                goto failed;
2966        }
2967        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
2968                INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
2969
2970        spin_lock_init(&kiblnd_data.kib_connd_lock);
2971        INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
2972        INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
2973        init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
2974        init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
2975
2976        kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
2977                                                  sizeof(*sched));
2978        if (kiblnd_data.kib_scheds == NULL)
2979                goto failed;
2980
2981        cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
2982                int     nthrs;
2983
2984                spin_lock_init(&sched->ibs_lock);
2985                INIT_LIST_HEAD(&sched->ibs_conns);
2986                init_waitqueue_head(&sched->ibs_waitq);
2987
2988                nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
2989                if (*kiblnd_tunables.kib_nscheds > 0) {
2990                        nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
2991                } else {
2992                        /* at most half of the CPUs; the other half is
2993                         * reserved for upper-layer modules */
2994                        nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2995                }
2996
2997                sched->ibs_nthreads_max = nthrs;
2998                sched->ibs_cpt = i;
2999        }
3000
3001        kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
3002
3003        /* lists/ptrs/locks initialised */
3004        kiblnd_data.kib_init = IBLND_INIT_DATA;
3005        /*****************************************************/
3006
3007        rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
3008        if (rc != 0) {
3009                CERROR("Can't spawn o2iblnd connd: %d\n", rc);
3010                goto failed;
3011        }
3012
3013        if (*kiblnd_tunables.kib_dev_failover != 0)
3014                rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
3015                                         "kiblnd_failover");
3016
3017        if (rc != 0) {
3018                CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
3019                goto failed;
3020        }
3021
3022        /* flag everything initialised */
3023        kiblnd_data.kib_init = IBLND_INIT_ALL;
3024        /*****************************************************/
3025
3026        return 0;
3027
3028 failed:
3029        kiblnd_base_shutdown();
3030        return -ENETDOWN;
3031}
3032
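    /*
     * Start scheduler threads for one CPT.  On first use the thread count
     * comes from the nscheds tunable or the CPT weight; afterwards at
     * most one extra thread is added when a new interface appears.
     */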
3033int
3034kiblnd_start_schedulers(struct kib_sched_info *sched)
3035{
3036        int     rc = 0;
3037        int     nthrs;
3038        int     i;
3039
3040        if (sched->ibs_nthreads == 0) {
3041                if (*kiblnd_tunables.kib_nscheds > 0) {
3042                        nthrs = sched->ibs_nthreads_max;
3043                } else {
3044                        nthrs = cfs_cpt_weight(lnet_cpt_table(),
3045                                               sched->ibs_cpt);
3046                        nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
3047                        nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
3048                }
3049        } else {
3050                LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
3051                /* add one more thread if there is a new interface */
3052                nthrs = (sched->ibs_nthreads < sched->ibs_nthreads_max);
3053        }
3054
3055        for (i = 0; i < nthrs; i++) {
3056                long    id;
3057                char    name[20];
3058                id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
3059                snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
3060                         KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
3061                rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
3062                if (rc == 0)
3063                        continue;
3064
3065                CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
3066                       sched->ibs_cpt, sched->ibs_nthreads + i, rc);
3067                break;
3068        }
3069
3070        sched->ibs_nthreads += i;
3071        return rc;
3072}
3073
3074int
3075kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts, int ncpts)
3076{
3077        int     cpt;
3078        int     rc;
3079        int     i;
3080
3081        for (i = 0; i < ncpts; i++) {
3082                struct kib_sched_info *sched;
3083
3084                cpt = (cpts == NULL) ? i : cpts[i];
3085                sched = kiblnd_data.kib_scheds[cpt];
3086
3087                if (!newdev && sched->ibs_nthreads > 0)
3088                        continue;
3089
3090                rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
3091                if (rc != 0) {
3092                        CERROR("Failed to start scheduler threads for %s\n",
3093                               dev->ibd_ifname);
3094                        return rc;
3095                }
3096        }
3097        return 0;
3098}
3099
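    /*
     * Look up a kib_dev by interface name.  An exact match is returned
     * immediately; otherwise the first alias match (names compared with
     * any ":alias" suffix stripped) is remembered and returned.
     */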
3100kib_dev_t *
3101kiblnd_dev_search(char *ifname)
3102{
3103        kib_dev_t       *alias = NULL;
3104        kib_dev_t       *dev;
3105        char            *colon;
3106        char            *colon2;
3107
3108        colon = strchr(ifname, ':');
3109        list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
3110                if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
3111                        return dev;
3112
3113                if (alias != NULL)
3114                        continue;
3115
3116                colon2 = strchr(dev->ibd_ifname, ':');
3117                if (colon != NULL)
3118                        *colon = 0;
3119                if (colon2 != NULL)
3120                        *colon2 = 0;
3121
3122                if (strcmp(&dev->ibd_ifname[0], ifname) == 0)
3123                        alias = dev;
3124
3125                if (colon != NULL)
3126                        *colon = ':';
3127                if (colon2 != NULL)
3128                        *colon2 = ':';
3129        }
3130        return alias;
3131}
3132
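    /*
     * Bring up an o2iblnd NI: pick the IPoIB interface (from 'networks='
     * or the default tunable), create or reuse the kib_dev, start the
     * scheduler threads, initialise the pools and link the net onto the
     * device.
     */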
3133int
3134kiblnd_startup (lnet_ni_t *ni)
3135{
3136        char                    *ifname;
3137        kib_dev_t               *ibdev = NULL;
3138        kib_net_t               *net;
3139        struct timeval           tv;
3140        unsigned long            flags;
3141        int                      rc;
3142        int                      newdev;
3143
3144        LASSERT(ni->ni_lnd == &the_o2iblnd);
3145
3146        if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
3147                rc = kiblnd_base_startup();
3148                if (rc != 0)
3149                        return rc;
3150        }
3151
3152        LIBCFS_ALLOC(net, sizeof(*net));
3153        ni->ni_data = net;
3154        if (net == NULL)
3155                goto failed;
3156
3157        memset(net, 0, sizeof(*net));
3158
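        /* stamp this net instance with a microsecond-resolution incarnation */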
3159        do_gettimeofday(&tv);
3160        net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
3161
3162        ni->ni_peertimeout    = *kiblnd_tunables.kib_peertimeout;
3163        ni->ni_maxtxcredits   = *kiblnd_tunables.kib_credits;
3164        ni->ni_peertxcredits  = *kiblnd_tunables.kib_peertxcredits;
3165        ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
3166
3167        if (ni->ni_interfaces[0] != NULL) {
3168                /* Use the IPoIB interface specified in 'networks=' */
3169
3170                CLASSERT(LNET_MAX_INTERFACES > 1);
3171                if (ni->ni_interfaces[1] != NULL) {
3172                        CERROR("Multiple interfaces not supported\n");
3173                        goto failed;
3174                }
3175
3176                ifname = ni->ni_interfaces[0];
3177        } else {
3178                ifname = *kiblnd_tunables.kib_default_ipif;
3179        }
3180
3181        if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
3182                CERROR("IPoIB interface name too long: %s\n", ifname);
3183                goto failed;
3184        }
3185
3186        ibdev = kiblnd_dev_search(ifname);
3187
3188        newdev = (ibdev == NULL);
3189        /* create a kib_dev even for an alias interface */
3190        if (ibdev == NULL || strcmp(&ibdev->ibd_ifname[0], ifname) != 0)
3191                ibdev = kiblnd_create_dev(ifname);
3192
3193        if (ibdev == NULL)
3194                goto failed;
3195
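        /* bind the NI to the device and build its NID from the interface IP */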
3196        net->ibn_dev = ibdev;
3197        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
3198
3199        rc = kiblnd_dev_start_threads(ibdev, newdev,
3200                                      ni->ni_cpts, ni->ni_ncpts);
3201        if (rc != 0)
3202                goto failed;
3203
3204        rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
3205        if (rc != 0) {
3206                CERROR("Failed to initialize NI pools: %d\n", rc);
3207                goto failed;
3208        }
3209
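        /* publish the new net on the device's list under the global lock */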
3210        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
3211        ibdev->ibd_nnets++;
3212        list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
3213        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
3214
3215        net->ibn_init = IBLND_INIT_ALL;
3216
3217        return 0;
3218
3219failed:
3220        if (net != NULL && net->ibn_dev == NULL && ibdev != NULL)
3221                kiblnd_destroy_dev(ibdev);
3222
3223        kiblnd_shutdown(ni);
3224
3225        CDEBUG(D_NET, "kiblnd_startup failed\n");
3226        return -ENETDOWN;
3227}
3228
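/* Module unload: unregister the LND from LNet and clean up the tunables. */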
3229void __exit
3230kiblnd_module_fini(void)
3231{
3232        lnet_unregister_lnd(&the_o2iblnd);
3233        kiblnd_tunables_fini();
3234}
3235
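/*
 * Module load: check that every wire message variant fits in IBLND_MSG_SIZE,
 * initialize the tunables and register the LND with LNet.
 */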
3236int __init
3237kiblnd_module_init(void)
3238{
3239        int    rc;
3240
3241        CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
3242        CLASSERT(offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
3243                 <= IBLND_MSG_SIZE);
3244        CLASSERT(offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
3245                 <= IBLND_MSG_SIZE);
3246
3247        rc = kiblnd_tunables_init();
3248        if (rc != 0)
3249                return rc;
3250
3251        lnet_register_lnd(&the_o2iblnd);
3252
3253        return 0;
3254}
3255
3256MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
3257MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v2.00");
3258MODULE_LICENSE("GPL");
3259
3260module_init(kiblnd_module_init);
3261module_exit(kiblnd_module_fini);
3262