linux/drivers/infiniband/ulp/ipoib/ipoib_main.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
   3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
   4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
   5 *
   6 * This software is available to you under a choice of one of two
   7 * licenses.  You may choose to be licensed under the terms of the GNU
   8 * General Public License (GPL) Version 2, available from the file
   9 * COPYING in the main directory of this source tree, or the
  10 * OpenIB.org BSD license below:
  11 *
  12 *     Redistribution and use in source and binary forms, with or
  13 *     without modification, are permitted provided that the following
  14 *     conditions are met:
  15 *
  16 *      - Redistributions of source code must retain the above
  17 *        copyright notice, this list of conditions and the following
  18 *        disclaimer.
  19 *
  20 *      - Redistributions in binary form must reproduce the above
  21 *        copyright notice, this list of conditions and the following
  22 *        disclaimer in the documentation and/or other materials
  23 *        provided with the distribution.
  24 *
  25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32 * SOFTWARE.
  33 */
  34
  35#include "ipoib.h"
  36
  37#include <linux/module.h>
  38
  39#include <linux/init.h>
  40#include <linux/slab.h>
  41#include <linux/kernel.h>
  42#include <linux/vmalloc.h>
  43
  44#include <linux/if_arp.h>       /* For ARPHRD_xxx */
  45
  46#include <linux/ip.h>
  47#include <linux/in.h>
  48
  49#include <linux/jhash.h>
  50#include <net/arp.h>
  51
/* Driver version string, also published through MODULE_VERSION() below. */
#define DRV_VERSION "1.0.0"

/* Externally visible copy of the version string. */
const char ipoib_driver_version[] = DRV_VERSION;

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRV_VERSION);

/*
 * Send and receive queue depths.  They default to IPOIB_TX_RING_SIZE and
 * IPOIB_RX_RING_SIZE.
 */
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

/*
 * The queue depths are exported as the module parameters send_queue_size
 * and recv_queue_size with mode 0444 (read-only).
 */
module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Debug tracing level, only present in debug builds.  Exported as the
 * debug_level parameter with mode 0644 (owner-writable).
 */
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif
  75
/*
 * Cursor used by the ipoib_path_iter_*() helpers to walk a device's path
 * records.  'path' is a by-value copy of the record most recently visited,
 * not a pointer into the live path tree.
 */
struct ipoib_path_iter {
        struct net_device *dev;         /* device whose paths are walked */
        struct ipoib_path  path;        /* copy of the current path record */
};
  80
/*
 * 20-byte hardware address, laid out as 4 leading bytes followed by a
 * 16-byte GID (elsewhere in this file the GID is taken from hwaddr + 4).
 * NOTE(review): by its name this is the address of the IPv4 broadcast
 * group; the individual byte values are not explained here - confirm
 * against the IPoIB addressing specification.
 */
static const u8 ipv4_bcast_addr[] = {
        0x00, 0xff, 0xff, 0xff,
        0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
};
  86
/* Driver workqueue; ipoib_set_mcast_list() queues priv->restart_task on it. */
struct workqueue_struct *ipoib_workqueue;

/* SA client handle passed to ib_sa_path_rec_get() for path record queries. */
struct ib_sa_client ipoib_sa_client;

/* Forward declarations for callbacks defined later in this file. */
static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device);
static void ipoib_neigh_reclaim(struct rcu_head *rp);

/* IB client descriptor wiring the add/remove device callbacks above. */
static struct ib_client ipoib_client = {
        .name   = "ipoib",
        .add    = ipoib_add_one,
        .remove = ipoib_remove_one
};
 100
 101int ipoib_open(struct net_device *dev)
 102{
 103        struct ipoib_dev_priv *priv = netdev_priv(dev);
 104
 105        ipoib_dbg(priv, "bringing up interface\n");
 106
 107        netif_carrier_off(dev);
 108
 109        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 110
 111        if (ipoib_pkey_dev_delay_open(dev))
 112                return 0;
 113
 114        if (ipoib_ib_dev_open(dev))
 115                goto err_disable;
 116
 117        if (ipoib_ib_dev_up(dev))
 118                goto err_stop;
 119
 120        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 121                struct ipoib_dev_priv *cpriv;
 122
 123                /* Bring up any child interfaces too */
 124                down_read(&priv->vlan_rwsem);
 125                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 126                        int flags;
 127
 128                        flags = cpriv->dev->flags;
 129                        if (flags & IFF_UP)
 130                                continue;
 131
 132                        dev_change_flags(cpriv->dev, flags | IFF_UP);
 133                }
 134                up_read(&priv->vlan_rwsem);
 135        }
 136
 137        netif_start_queue(dev);
 138
 139        return 0;
 140
 141err_stop:
 142        ipoib_ib_dev_stop(dev, 1);
 143
 144err_disable:
 145        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 146
 147        return -EINVAL;
 148}
 149
 150static int ipoib_stop(struct net_device *dev)
 151{
 152        struct ipoib_dev_priv *priv = netdev_priv(dev);
 153
 154        ipoib_dbg(priv, "stopping interface\n");
 155
 156        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 157
 158        netif_stop_queue(dev);
 159
 160        ipoib_ib_dev_down(dev, 1);
 161        ipoib_ib_dev_stop(dev, 0);
 162
 163        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 164                struct ipoib_dev_priv *cpriv;
 165
 166                /* Bring down any child interfaces too */
 167                down_read(&priv->vlan_rwsem);
 168                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 169                        int flags;
 170
 171                        flags = cpriv->dev->flags;
 172                        if (!(flags & IFF_UP))
 173                                continue;
 174
 175                        dev_change_flags(cpriv->dev, flags & ~IFF_UP);
 176                }
 177                up_read(&priv->vlan_rwsem);
 178        }
 179
 180        return 0;
 181}
 182
/* Device uninit hook: delegates all teardown to ipoib_dev_cleanup(). */
static void ipoib_uninit(struct net_device *dev)
{
        ipoib_dev_cleanup(dev);
}
 187
 188static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
 189{
 190        struct ipoib_dev_priv *priv = netdev_priv(dev);
 191
 192        if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
 193                features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
 194
 195        return features;
 196}
 197
 198static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 199{
 200        struct ipoib_dev_priv *priv = netdev_priv(dev);
 201
 202        /* dev->mtu > 2K ==> connected mode */
 203        if (ipoib_cm_admin_enabled(dev)) {
 204                if (new_mtu > ipoib_cm_max_mtu(dev))
 205                        return -EINVAL;
 206
 207                if (new_mtu > priv->mcast_mtu)
 208                        ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 209                                   priv->mcast_mtu);
 210
 211                dev->mtu = new_mtu;
 212                return 0;
 213        }
 214
 215        if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 216                return -EINVAL;
 217
 218        priv->admin_mtu = new_mtu;
 219
 220        dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 221
 222        return 0;
 223}
 224
/*
 * Switch the interface between connected and datagram mode.
 *
 * @buf must be exactly "connected\n" or "datagram\n" (trailing newline
 * included).  "connected\n" is only honoured when the device address
 * passes IPOIB_CM_SUPPORTED(); otherwise it falls through and is rejected.
 *
 * Both branches call rtnl_unlock() before flushing paths and rtnl_lock()
 * afterwards, so the function appears to expect the caller to hold RTNL on
 * entry and returns with it held again - TODO confirm against callers.
 *
 * Returns 0 on a successful mode change, -EINVAL for any other input.
 */
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);

        /* flush paths if we switch modes so that connections are restarted */
        if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
                set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
                ipoib_warn(priv, "enabling connected mode "
                           "will cause multicast packet drops\n");
                /* Re-evaluate features; ipoib_fix_features() reads ADMIN_CM. */
                netdev_update_features(dev);
                rtnl_unlock();
                priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;

                ipoib_flush_paths(dev);
                rtnl_lock();
                return 0;
        }

        if (!strcmp(buf, "datagram\n")) {
                clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
                netdev_update_features(dev);
                /* Clamp the MTU to the multicast MTU when leaving connected mode. */
                dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
                rtnl_unlock();
                ipoib_flush_paths(dev);
                rtnl_lock();
                return 0;
        }

        return -EINVAL;
}
 255
 256static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
 257{
 258        struct ipoib_dev_priv *priv = netdev_priv(dev);
 259        struct rb_node *n = priv->path_tree.rb_node;
 260        struct ipoib_path *path;
 261        int ret;
 262
 263        while (n) {
 264                path = rb_entry(n, struct ipoib_path, rb_node);
 265
 266                ret = memcmp(gid, path->pathrec.dgid.raw,
 267                             sizeof (union ib_gid));
 268
 269                if (ret < 0)
 270                        n = n->rb_left;
 271                else if (ret > 0)
 272                        n = n->rb_right;
 273                else
 274                        return path;
 275        }
 276
 277        return NULL;
 278}
 279
 280static int __path_add(struct net_device *dev, struct ipoib_path *path)
 281{
 282        struct ipoib_dev_priv *priv = netdev_priv(dev);
 283        struct rb_node **n = &priv->path_tree.rb_node;
 284        struct rb_node *pn = NULL;
 285        struct ipoib_path *tpath;
 286        int ret;
 287
 288        while (*n) {
 289                pn = *n;
 290                tpath = rb_entry(pn, struct ipoib_path, rb_node);
 291
 292                ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
 293                             sizeof (union ib_gid));
 294                if (ret < 0)
 295                        n = &pn->rb_left;
 296                else if (ret > 0)
 297                        n = &pn->rb_right;
 298                else
 299                        return -EEXIST;
 300        }
 301
 302        rb_link_node(&path->rb_node, pn, n);
 303        rb_insert_color(&path->rb_node, &priv->path_tree);
 304
 305        list_add_tail(&path->list, &priv->path_list);
 306
 307        return 0;
 308}
 309
 310static void path_free(struct net_device *dev, struct ipoib_path *path)
 311{
 312        struct sk_buff *skb;
 313
 314        while ((skb = __skb_dequeue(&path->queue)))
 315                dev_kfree_skb_irq(skb);
 316
 317        ipoib_dbg(netdev_priv(dev), "path_free\n");
 318
 319        /* remove all neigh connected to this path */
 320        ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
 321
 322        if (path->ah)
 323                ipoib_put_ah(path->ah);
 324
 325        kfree(path);
 326}
 327
 328#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 329
 330struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
 331{
 332        struct ipoib_path_iter *iter;
 333
 334        iter = kmalloc(sizeof *iter, GFP_KERNEL);
 335        if (!iter)
 336                return NULL;
 337
 338        iter->dev = dev;
 339        memset(iter->path.pathrec.dgid.raw, 0, 16);
 340
 341        if (ipoib_path_iter_next(iter)) {
 342                kfree(iter);
 343                return NULL;
 344        }
 345
 346        return iter;
 347}
 348
 349int ipoib_path_iter_next(struct ipoib_path_iter *iter)
 350{
 351        struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
 352        struct rb_node *n;
 353        struct ipoib_path *path;
 354        int ret = 1;
 355
 356        spin_lock_irq(&priv->lock);
 357
 358        n = rb_first(&priv->path_tree);
 359
 360        while (n) {
 361                path = rb_entry(n, struct ipoib_path, rb_node);
 362
 363                if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
 364                           sizeof (union ib_gid)) < 0) {
 365                        iter->path = *path;
 366                        ret = 0;
 367                        break;
 368                }
 369
 370                n = rb_next(n);
 371        }
 372
 373        spin_unlock_irq(&priv->lock);
 374
 375        return ret;
 376}
 377
/* Copy the path record currently held by @iter into the caller's @path. */
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
                          struct ipoib_path *path)
{
        *path = iter->path;
}
 383
 384#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 385
 386void ipoib_mark_paths_invalid(struct net_device *dev)
 387{
 388        struct ipoib_dev_priv *priv = netdev_priv(dev);
 389        struct ipoib_path *path, *tp;
 390
 391        spin_lock_irq(&priv->lock);
 392
 393        list_for_each_entry_safe(path, tp, &priv->path_list, list) {
 394                ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
 395                        be16_to_cpu(path->pathrec.dlid),
 396                        path->pathrec.dgid.raw);
 397                path->valid =  0;
 398        }
 399
 400        spin_unlock_irq(&priv->lock);
 401}
 402
/*
 * Remove and free every path of the device.
 *
 * All paths are first detached from priv->path_list and the rb-tree while
 * holding the TX lock and priv->lock.  Each detached path is then torn
 * down individually: an outstanding SA query is cancelled, both locks are
 * dropped while waiting for the path's 'done' completion, the path is
 * freed, and the locks are re-taken before moving on.
 */
void ipoib_flush_paths(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_path *path, *tp;
        LIST_HEAD(remove_list);
        unsigned long flags;

        netif_tx_lock_bh(dev);
        spin_lock_irqsave(&priv->lock, flags);

        /* Move the whole list to a private head; path_list is left empty. */
        list_splice_init(&priv->path_list, &remove_list);

        list_for_each_entry(path, &remove_list, list)
                rb_erase(&path->rb_node, &priv->path_tree);

        /* _safe iteration: each path is freed inside the loop body. */
        list_for_each_entry_safe(path, tp, &remove_list, list) {
                if (path->query)
                        ib_sa_cancel_query(path->query_id, path->query);
                /* Drop both locks before the wait below. */
                spin_unlock_irqrestore(&priv->lock, flags);
                netif_tx_unlock_bh(dev);
                wait_for_completion(&path->done);
                path_free(dev, path);
                netif_tx_lock_bh(dev);
                spin_lock_irqsave(&priv->lock, flags);
        }

        spin_unlock_irqrestore(&priv->lock, flags);
        netif_tx_unlock_bh(dev);
}
 432
/*
 * Completion callback for the SA path record query started by
 * path_rec_start().
 *
 * @status:   0 on success, otherwise the query's error status
 * @pathrec:  the resolved path record (only read when @status is 0)
 * @path_ptr: the struct ipoib_path the query was issued for
 *
 * On success an address handle is created from the path record, installed
 * in the path and in every neighbour on the path, and all skbs queued on
 * the path and its neighbours are collected and re-submitted with
 * dev_queue_xmit().  On failure the neighbours for the path's GID are
 * deleted.  In both cases path->query is cleared and path->done is
 * completed, which is what ipoib_flush_paths() waits on.
 */
static void path_rec_completion(int status,
                                struct ib_sa_path_rec *pathrec,
                                void *path_ptr)
{
        struct ipoib_path *path = path_ptr;
        struct net_device *dev = path->dev;
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_ah *ah = NULL;
        struct ipoib_ah *old_ah = NULL;
        struct ipoib_neigh *neigh, *tn;
        struct sk_buff_head skqueue;
        struct sk_buff *skb;
        unsigned long flags;

        if (!status)
                ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
                          be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
        else
                ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
                          status, path->pathrec.dgid.raw);

        /* Local queue collecting the skbs to re-send once the lock is dropped. */
        skb_queue_head_init(&skqueue);

        if (!status) {
                struct ib_ah_attr av;

                if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
                        ah = ipoib_create_ah(dev, priv->pd, &av);
        }

        spin_lock_irqsave(&priv->lock, flags);

        /* ah stays NULL (or is an error pointer) if anything above failed. */
        if (!IS_ERR_OR_NULL(ah)) {
                path->pathrec = *pathrec;

                /* Keep the previous handle so it can be released after unlock. */
                old_ah   = path->ah;
                path->ah = ah;

                ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
                          ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

                while ((skb = __skb_dequeue(&path->queue)))
                        __skb_queue_tail(&skqueue, skb);

                list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
                        if (neigh->ah) {
                                WARN_ON(neigh->ah != old_ah);
                                /*
                                 * Dropping the ah reference inside
                                 * priv->lock is safe here, because we
                                 * will hold one more reference from
                                 * the original value of path->ah (ie
                                 * old_ah).
                                 */
                                ipoib_put_ah(neigh->ah);
                        }
                        /* Each neighbour holds its own reference on the new handle. */
                        kref_get(&path->ah->ref);
                        neigh->ah = path->ah;

                        if (ipoib_cm_enabled(dev, neigh->daddr)) {
                                if (!ipoib_cm_get(neigh))
                                        ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
                                                                               path,
                                                                               neigh));
                                /* Still no CM context: give up on this neighbour. */
                                if (!ipoib_cm_get(neigh)) {
                                        ipoib_neigh_free(neigh);
                                        continue;
                                }
                        }

                        while ((skb = __skb_dequeue(&neigh->queue)))
                                __skb_queue_tail(&skqueue, skb);
                }
                path->valid = 1;
        }

        /* The query is finished either way; wake anyone waiting on it. */
        path->query = NULL;
        complete(&path->done);

        spin_unlock_irqrestore(&priv->lock, flags);

        if (IS_ERR_OR_NULL(ah))
                ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

        if (old_ah)
                ipoib_put_ah(old_ah);

        /* Re-submit everything that was waiting for this path. */
        while ((skb = __skb_dequeue(&skqueue))) {
                skb->dev = dev;
                if (dev_queue_xmit(skb))
                        ipoib_warn(priv, "dev_queue_xmit failed "
                                   "to requeue packet\n");
        }
}
 527
 528static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 529{
 530        struct ipoib_dev_priv *priv = netdev_priv(dev);
 531        struct ipoib_path *path;
 532
 533        if (!priv->broadcast)
 534                return NULL;
 535
 536        path = kzalloc(sizeof *path, GFP_ATOMIC);
 537        if (!path)
 538                return NULL;
 539
 540        path->dev = dev;
 541
 542        skb_queue_head_init(&path->queue);
 543
 544        INIT_LIST_HEAD(&path->neigh_list);
 545
 546        memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
 547        path->pathrec.sgid          = priv->local_gid;
 548        path->pathrec.pkey          = cpu_to_be16(priv->pkey);
 549        path->pathrec.numb_path     = 1;
 550        path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 551
 552        return path;
 553}
 554
 555static int path_rec_start(struct net_device *dev,
 556                          struct ipoib_path *path)
 557{
 558        struct ipoib_dev_priv *priv = netdev_priv(dev);
 559
 560        ipoib_dbg(priv, "Start path record lookup for %pI6\n",
 561                  path->pathrec.dgid.raw);
 562
 563        init_completion(&path->done);
 564
 565        path->query_id =
 566                ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
 567                                   &path->pathrec,
 568                                   IB_SA_PATH_REC_DGID          |
 569                                   IB_SA_PATH_REC_SGID          |
 570                                   IB_SA_PATH_REC_NUMB_PATH     |
 571                                   IB_SA_PATH_REC_TRAFFIC_CLASS |
 572                                   IB_SA_PATH_REC_PKEY,
 573                                   1000, GFP_ATOMIC,
 574                                   path_rec_completion,
 575                                   path, &path->query);
 576        if (path->query_id < 0) {
 577                ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
 578                path->query = NULL;
 579                complete(&path->done);
 580                return path->query_id;
 581        }
 582
 583        return 0;
 584}
 585
/*
 * Slow path for a unicast packet whose destination has no neighbour entry.
 *
 * Allocates a neighbour for @daddr, finds or creates the path for the GID
 * at daddr + 4 and links the neighbour to it.  If the path already has an
 * address handle the packet is sent (datagram mode) or queued on the
 * neighbour (connected mode); otherwise a path record query is started if
 * none is pending and the packet is queued on the neighbour.
 *
 * The skb is always consumed: sent, queued, or dropped with tx_dropped
 * incremented.
 */
static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
                           struct net_device *dev)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_path *path;
        struct ipoib_neigh *neigh;
        unsigned long flags;

        spin_lock_irqsave(&priv->lock, flags);
        neigh = ipoib_neigh_alloc(daddr, dev);
        if (!neigh) {
                spin_unlock_irqrestore(&priv->lock, flags);
                ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
                return;
        }

        /* The destination GID starts 4 bytes into the hardware address. */
        path = __path_find(dev, daddr + 4);
        if (!path) {
                path = path_rec_create(dev, daddr + 4);
                if (!path)
                        goto err_path;

                __path_add(dev, path);
        }

        list_add_tail(&neigh->list, &path->neigh_list);

        if (path->ah) {
                /* Path already resolved: the neighbour takes its own ah reference. */
                kref_get(&path->ah->ref);
                neigh->ah = path->ah;

                if (ipoib_cm_enabled(dev, neigh->daddr)) {
                        if (!ipoib_cm_get(neigh))
                                ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
                        if (!ipoib_cm_get(neigh)) {
                                ipoib_neigh_free(neigh);
                                goto err_drop;
                        }
                        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
                                __skb_queue_tail(&neigh->queue, skb);
                        else {
                                ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
                                           skb_queue_len(&neigh->queue));
                                goto err_drop;
                        }
                } else {
                        /* Datagram mode: send right away, outside the lock. */
                        spin_unlock_irqrestore(&priv->lock, flags);
                        ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
                        ipoib_neigh_put(neigh);
                        return;
                }
        } else {
                neigh->ah  = NULL;

                /* Start a lookup unless one is already pending for this path. */
                if (!path->query && path_rec_start(dev, path))
                        goto err_path;

                __skb_queue_tail(&neigh->queue, skb);
        }

        spin_unlock_irqrestore(&priv->lock, flags);
        /* NOTE(review): presumably drops the reference from ipoib_neigh_alloc() - confirm. */
        ipoib_neigh_put(neigh);
        return;

err_path:
        ipoib_neigh_free(neigh);
err_drop:
        ++dev->stats.tx_dropped;
        dev_kfree_skb_any(skb);

        spin_unlock_irqrestore(&priv->lock, flags);
        ipoib_neigh_put(neigh);
}
 660
 661static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 662                             struct ipoib_cb *cb)
 663{
 664        struct ipoib_dev_priv *priv = netdev_priv(dev);
 665        struct ipoib_path *path;
 666        unsigned long flags;
 667
 668        spin_lock_irqsave(&priv->lock, flags);
 669
 670        path = __path_find(dev, cb->hwaddr + 4);
 671        if (!path || !path->valid) {
 672                int new_path = 0;
 673
 674                if (!path) {
 675                        path = path_rec_create(dev, cb->hwaddr + 4);
 676                        new_path = 1;
 677                }
 678                if (path) {
 679                        __skb_queue_tail(&path->queue, skb);
 680
 681                        if (!path->query && path_rec_start(dev, path)) {
 682                                spin_unlock_irqrestore(&priv->lock, flags);
 683                                if (new_path)
 684                                        path_free(dev, path);
 685                                return;
 686                        } else
 687                                __path_add(dev, path);
 688                } else {
 689                        ++dev->stats.tx_dropped;
 690                        dev_kfree_skb_any(skb);
 691                }
 692
 693                spin_unlock_irqrestore(&priv->lock, flags);
 694                return;
 695        }
 696
 697        if (path->ah) {
 698                ipoib_dbg(priv, "Send unicast ARP to %04x\n",
 699                          be16_to_cpu(path->pathrec.dlid));
 700
 701                spin_unlock_irqrestore(&priv->lock, flags);
 702                ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
 703                return;
 704        } else if ((path->query || !path_rec_start(dev, path)) &&
 705                   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 706                __skb_queue_tail(&path->queue, skb);
 707        } else {
 708                ++dev->stats.tx_dropped;
 709                dev_kfree_skb_any(skb);
 710        }
 711
 712        spin_unlock_irqrestore(&priv->lock, flags);
 713}
 714
/*
 * Transmit entry point.
 *
 * The destination hardware address is read from the skb control block
 * (stored there by ipoib_hard_header()).  Multicast destinations
 * (hwaddr[4] == 0xff) get the P_Key patched into the address and are sent
 * via the neighbour if one exists, otherwise via ipoib_mcast_send().
 * Unicast IP/IPv6/TIPC packets are sent through the neighbour entry, or
 * handed to neigh_add_path() if none exists; unicast ARP/RARP always goes
 * through unicast_arp_send().  Unsupported ethertypes are dropped.
 *
 * Always returns NETDEV_TX_OK; the skb is consumed on every path.
 */
static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_neigh *neigh;
        struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
        struct ipoib_header *header;
        unsigned long flags;

        header = (struct ipoib_header *) skb->data;

        if (unlikely(cb->hwaddr[4] == 0xff)) {
                /* multicast, arrange "if" according to probability */
                if ((header->proto != htons(ETH_P_IP)) &&
                    (header->proto != htons(ETH_P_IPV6)) &&
                    (header->proto != htons(ETH_P_ARP)) &&
                    (header->proto != htons(ETH_P_RARP)) &&
                    (header->proto != htons(ETH_P_TIPC))) {
                        /* ethertype not supported by IPoIB */
                        ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb);
                        return NETDEV_TX_OK;
                }
                /* Add in the P_Key for multicast (high byte first) */
                cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
                cb->hwaddr[9] = priv->pkey & 0xff;

                neigh = ipoib_neigh_get(dev, cb->hwaddr);
                if (likely(neigh))
                        goto send_using_neigh;
                ipoib_mcast_send(dev, cb->hwaddr, skb);
                return NETDEV_TX_OK;
        }

        /* unicast, arrange "switch" according to probability */
        switch (header->proto) {
        case htons(ETH_P_IP):
        case htons(ETH_P_IPV6):
        case htons(ETH_P_TIPC):
                neigh = ipoib_neigh_get(dev, cb->hwaddr);
                if (unlikely(!neigh)) {
                        neigh_add_path(skb, cb->hwaddr, dev);
                        return NETDEV_TX_OK;
                }
                break;
        case htons(ETH_P_ARP):
        case htons(ETH_P_RARP):
                /* for unicast ARP and RARP should always perform path find */
                unicast_arp_send(skb, dev, cb);
                return NETDEV_TX_OK;
        default:
                /* ethertype not supported by IPoIB */
                ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
                return NETDEV_TX_OK;
        }

send_using_neigh:
        /* note we now hold a ref to neigh */
        if (ipoib_cm_get(neigh)) {
                if (ipoib_cm_up(neigh)) {
                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
                        goto unref;
                }
        } else if (neigh->ah) {
                ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
                goto unref;
        }

        /*
         * Neither a usable CM connection nor an address handle yet: park the
         * packet on the neighbour, up to IPOIB_MAX_PATH_REC_QUEUE entries.
         * The length check is made before priv->lock is taken.
         */
        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
                spin_lock_irqsave(&priv->lock, flags);
                __skb_queue_tail(&neigh->queue, skb);
                spin_unlock_irqrestore(&priv->lock, flags);
        } else {
                ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
        }

unref:
        /* Drop the reference obtained from ipoib_neigh_get(). */
        ipoib_neigh_put(neigh);

        return NETDEV_TX_OK;
}
 797
 798static void ipoib_timeout(struct net_device *dev)
 799{
 800        struct ipoib_dev_priv *priv = netdev_priv(dev);
 801
 802        ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
 803                   jiffies_to_msecs(jiffies - dev->trans_start));
 804        ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
 805                   netif_queue_stopped(dev),
 806                   priv->tx_head, priv->tx_tail);
 807        /* XXX reset QP, etc. */
 808}
 809
 810static int ipoib_hard_header(struct sk_buff *skb,
 811                             struct net_device *dev,
 812                             unsigned short type,
 813                             const void *daddr, const void *saddr, unsigned len)
 814{
 815        struct ipoib_header *header;
 816        struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
 817
 818        header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 819
 820        header->proto = htons(type);
 821        header->reserved = 0;
 822
 823        /*
 824         * we don't rely on dst_entry structure,  always stuff the
 825         * destination address into skb->cb so we can figure out where
 826         * to send the packet later.
 827         */
 828        memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
 829
 830        return sizeof *header;
 831}
 832
 833static void ipoib_set_mcast_list(struct net_device *dev)
 834{
 835        struct ipoib_dev_priv *priv = netdev_priv(dev);
 836
 837        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
 838                ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
 839                return;
 840        }
 841
 842        queue_work(ipoib_workqueue, &priv->restart_task);
 843}
 844
 845static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
 846{
 847        /*
 848         * Use only the address parts that contributes to spreading
 849         * The subnet prefix is not used as one can not connect to
 850         * same remote port (GUID) using the same remote QPN via two
 851         * different subnets.
 852         */
 853         /* qpn octets[1:4) & port GUID octets[12:20) */
 854        u32 *d32 = (u32 *) daddr;
 855        u32 hv;
 856
 857        hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
 858        return hv & htbl->mask;
 859}
 860
/*
 * Look up the neighbour entry for hardware address @daddr.
 *
 * The hash table is walked inside an RCU-bh read-side section.  On a match
 * a reference is taken on behalf of the caller (who must release it with
 * ipoib_neigh_put()) and the entry's 'alive' timestamp is refreshed.
 *
 * Returns the neighbour with an elevated reference count, or NULL if the
 * table is absent, no entry matches, or the matching entry's reference
 * count had already dropped to zero.
 */
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        struct ipoib_neigh *neigh = NULL;
        u32 hash_val;

        rcu_read_lock_bh();

        htbl = rcu_dereference_bh(ntbl->htbl);

        if (!htbl)
                goto out_unlock;

        hash_val = ipoib_addr_hash(htbl, daddr);
        /* Walk the bucket's chain; the loop ends with neigh == NULL on a miss. */
        for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
             neigh != NULL;
             neigh = rcu_dereference_bh(neigh->hnext)) {
                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
                        /* found, take one ref on behalf of the caller */
                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
                                /* deleted */
                                neigh = NULL;
                                goto out_unlock;
                        }
                        /* Record the time of this use of the entry. */
                        neigh->alive = jiffies;
                        goto out_unlock;
                }
        }

out_unlock:
        rcu_read_unlock_bh();
        return neigh;
}
 896
 897static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
 898{
 899        struct ipoib_neigh_table *ntbl = &priv->ntbl;
 900        struct ipoib_neigh_hash *htbl;
 901        unsigned long neigh_obsolete;
 902        unsigned long dt;
 903        unsigned long flags;
 904        int i;
 905
 906        if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
 907                return;
 908
 909        spin_lock_irqsave(&priv->lock, flags);
 910
 911        htbl = rcu_dereference_protected(ntbl->htbl,
 912                                         lockdep_is_held(&priv->lock));
 913
 914        if (!htbl)
 915                goto out_unlock;
 916
 917        /* neigh is obsolete if it was idle for two GC periods */
 918        dt = 2 * arp_tbl.gc_interval;
 919        neigh_obsolete = jiffies - dt;
 920        /* handle possible race condition */
 921        if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
 922                goto out_unlock;
 923
 924        for (i = 0; i < htbl->size; i++) {
 925                struct ipoib_neigh *neigh;
 926                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
 927
 928                while ((neigh = rcu_dereference_protected(*np,
 929                                                          lockdep_is_held(&priv->lock))) != NULL) {
 930                        /* was the neigh idle for two GC periods */
 931                        if (time_after(neigh_obsolete, neigh->alive)) {
 932                                rcu_assign_pointer(*np,
 933                                                   rcu_dereference_protected(neigh->hnext,
 934                                                                             lockdep_is_held(&priv->lock)));
 935                                /* remove from path/mc list */
 936                                list_del(&neigh->list);
 937                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
 938                        } else {
 939                                np = &neigh->hnext;
 940                        }
 941
 942                }
 943        }
 944
 945out_unlock:
 946        spin_unlock_irqrestore(&priv->lock, flags);
 947}
 948
 949static void ipoib_reap_neigh(struct work_struct *work)
 950{
 951        struct ipoib_dev_priv *priv =
 952                container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
 953
 954        __ipoib_reap_neigh(priv);
 955
 956        if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
 957                queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
 958                                   arp_tbl.gc_interval);
 959}
 960
 961
 962static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
 963                                      struct net_device *dev)
 964{
 965        struct ipoib_neigh *neigh;
 966
 967        neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
 968        if (!neigh)
 969                return NULL;
 970
 971        neigh->dev = dev;
 972        memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
 973        skb_queue_head_init(&neigh->queue);
 974        INIT_LIST_HEAD(&neigh->list);
 975        ipoib_cm_set(neigh, NULL);
 976        /* one ref on behalf of the caller */
 977        atomic_set(&neigh->refcnt, 1);
 978
 979        return neigh;
 980}
 981
 982struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
 983                                      struct net_device *dev)
 984{
 985        struct ipoib_dev_priv *priv = netdev_priv(dev);
 986        struct ipoib_neigh_table *ntbl = &priv->ntbl;
 987        struct ipoib_neigh_hash *htbl;
 988        struct ipoib_neigh *neigh;
 989        u32 hash_val;
 990
 991        htbl = rcu_dereference_protected(ntbl->htbl,
 992                                         lockdep_is_held(&priv->lock));
 993        if (!htbl) {
 994                neigh = NULL;
 995                goto out_unlock;
 996        }
 997
 998        /* need to add a new neigh, but maybe some other thread succeeded?
 999         * recalc hash, maybe hash resize took place so we do a search
1000         */
1001        hash_val = ipoib_addr_hash(htbl, daddr);
1002        for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
1003                                               lockdep_is_held(&priv->lock));
1004             neigh != NULL;
1005             neigh = rcu_dereference_protected(neigh->hnext,
1006                                               lockdep_is_held(&priv->lock))) {
1007                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1008                        /* found, take one ref on behalf of the caller */
1009                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
1010                                /* deleted */
1011                                neigh = NULL;
1012                                break;
1013                        }
1014                        neigh->alive = jiffies;
1015                        goto out_unlock;
1016                }
1017        }
1018
1019        neigh = ipoib_neigh_ctor(daddr, dev);
1020        if (!neigh)
1021                goto out_unlock;
1022
1023        /* one ref on behalf of the hash table */
1024        atomic_inc(&neigh->refcnt);
1025        neigh->alive = jiffies;
1026        /* put in hash */
1027        rcu_assign_pointer(neigh->hnext,
1028                           rcu_dereference_protected(htbl->buckets[hash_val],
1029                                                     lockdep_is_held(&priv->lock)));
1030        rcu_assign_pointer(htbl->buckets[hash_val], neigh);
1031        atomic_inc(&ntbl->entries);
1032
1033out_unlock:
1034
1035        return neigh;
1036}
1037
1038void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1039{
1040        /* neigh reference count was dropprd to zero */
1041        struct net_device *dev = neigh->dev;
1042        struct ipoib_dev_priv *priv = netdev_priv(dev);
1043        struct sk_buff *skb;
1044        if (neigh->ah)
1045                ipoib_put_ah(neigh->ah);
1046        while ((skb = __skb_dequeue(&neigh->queue))) {
1047                ++dev->stats.tx_dropped;
1048                dev_kfree_skb_any(skb);
1049        }
1050        if (ipoib_cm_get(neigh))
1051                ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1052        ipoib_dbg(netdev_priv(dev),
1053                  "neigh free for %06x %pI6\n",
1054                  IPOIB_QPN(neigh->daddr),
1055                  neigh->daddr + 4);
1056        kfree(neigh);
1057        if (atomic_dec_and_test(&priv->ntbl.entries)) {
1058                if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
1059                        complete(&priv->ntbl.flushed);
1060        }
1061}
1062
1063static void ipoib_neigh_reclaim(struct rcu_head *rp)
1064{
1065        /* Called as a result of removal from hash table */
1066        struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
1067        /* note TX context may hold another ref */
1068        ipoib_neigh_put(neigh);
1069}
1070
1071void ipoib_neigh_free(struct ipoib_neigh *neigh)
1072{
1073        struct net_device *dev = neigh->dev;
1074        struct ipoib_dev_priv *priv = netdev_priv(dev);
1075        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1076        struct ipoib_neigh_hash *htbl;
1077        struct ipoib_neigh __rcu **np;
1078        struct ipoib_neigh *n;
1079        u32 hash_val;
1080
1081        htbl = rcu_dereference_protected(ntbl->htbl,
1082                                        lockdep_is_held(&priv->lock));
1083        if (!htbl)
1084                return;
1085
1086        hash_val = ipoib_addr_hash(htbl, neigh->daddr);
1087        np = &htbl->buckets[hash_val];
1088        for (n = rcu_dereference_protected(*np,
1089                                            lockdep_is_held(&priv->lock));
1090             n != NULL;
1091             n = rcu_dereference_protected(*np,
1092                                        lockdep_is_held(&priv->lock))) {
1093                if (n == neigh) {
1094                        /* found */
1095                        rcu_assign_pointer(*np,
1096                                           rcu_dereference_protected(neigh->hnext,
1097                                                                     lockdep_is_held(&priv->lock)));
1098                        /* remove from parent list */
1099                        list_del(&neigh->list);
1100                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1101                        return;
1102                } else {
1103                        np = &n->hnext;
1104                }
1105        }
1106}
1107
1108static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1109{
1110        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1111        struct ipoib_neigh_hash *htbl;
1112        struct ipoib_neigh **buckets;
1113        u32 size;
1114
1115        clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1116        ntbl->htbl = NULL;
1117        htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
1118        if (!htbl)
1119                return -ENOMEM;
1120        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1121        size = roundup_pow_of_two(arp_tbl.gc_thresh3);
1122        buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
1123        if (!buckets) {
1124                kfree(htbl);
1125                return -ENOMEM;
1126        }
1127        htbl->size = size;
1128        htbl->mask = (size - 1);
1129        htbl->buckets = buckets;
1130        ntbl->htbl = htbl;
1131        htbl->ntbl = ntbl;
1132        atomic_set(&ntbl->entries, 0);
1133
1134        /* start garbage collection */
1135        clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1136        queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
1137                           arp_tbl.gc_interval);
1138
1139        return 0;
1140}
1141
1142static void neigh_hash_free_rcu(struct rcu_head *head)
1143{
1144        struct ipoib_neigh_hash *htbl = container_of(head,
1145                                                    struct ipoib_neigh_hash,
1146                                                    rcu);
1147        struct ipoib_neigh __rcu **buckets = htbl->buckets;
1148        struct ipoib_neigh_table *ntbl = htbl->ntbl;
1149
1150        kfree(buckets);
1151        kfree(htbl);
1152        complete(&ntbl->deleted);
1153}
1154
1155void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
1156{
1157        struct ipoib_dev_priv *priv = netdev_priv(dev);
1158        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1159        struct ipoib_neigh_hash *htbl;
1160        unsigned long flags;
1161        int i;
1162
1163        /* remove all neigh connected to a given path or mcast */
1164        spin_lock_irqsave(&priv->lock, flags);
1165
1166        htbl = rcu_dereference_protected(ntbl->htbl,
1167                                         lockdep_is_held(&priv->lock));
1168
1169        if (!htbl)
1170                goto out_unlock;
1171
1172        for (i = 0; i < htbl->size; i++) {
1173                struct ipoib_neigh *neigh;
1174                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1175
1176                while ((neigh = rcu_dereference_protected(*np,
1177                                                          lockdep_is_held(&priv->lock))) != NULL) {
1178                        /* delete neighs belong to this parent */
1179                        if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
1180                                rcu_assign_pointer(*np,
1181                                                   rcu_dereference_protected(neigh->hnext,
1182                                                                             lockdep_is_held(&priv->lock)));
1183                                /* remove from parent list */
1184                                list_del(&neigh->list);
1185                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1186                        } else {
1187                                np = &neigh->hnext;
1188                        }
1189
1190                }
1191        }
1192out_unlock:
1193        spin_unlock_irqrestore(&priv->lock, flags);
1194}
1195
1196static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
1197{
1198        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1199        struct ipoib_neigh_hash *htbl;
1200        unsigned long flags;
1201        int i, wait_flushed = 0;
1202
1203        init_completion(&priv->ntbl.flushed);
1204
1205        spin_lock_irqsave(&priv->lock, flags);
1206
1207        htbl = rcu_dereference_protected(ntbl->htbl,
1208                                        lockdep_is_held(&priv->lock));
1209        if (!htbl)
1210                goto out_unlock;
1211
1212        wait_flushed = atomic_read(&priv->ntbl.entries);
1213        if (!wait_flushed)
1214                goto free_htbl;
1215
1216        for (i = 0; i < htbl->size; i++) {
1217                struct ipoib_neigh *neigh;
1218                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1219
1220                while ((neigh = rcu_dereference_protected(*np,
1221                                       lockdep_is_held(&priv->lock))) != NULL) {
1222                        rcu_assign_pointer(*np,
1223                                           rcu_dereference_protected(neigh->hnext,
1224                                                                     lockdep_is_held(&priv->lock)));
1225                        /* remove from path/mc list */
1226                        list_del(&neigh->list);
1227                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1228                }
1229        }
1230
1231free_htbl:
1232        rcu_assign_pointer(ntbl->htbl, NULL);
1233        call_rcu(&htbl->rcu, neigh_hash_free_rcu);
1234
1235out_unlock:
1236        spin_unlock_irqrestore(&priv->lock, flags);
1237        if (wait_flushed)
1238                wait_for_completion(&priv->ntbl.flushed);
1239}
1240
1241static void ipoib_neigh_hash_uninit(struct net_device *dev)
1242{
1243        struct ipoib_dev_priv *priv = netdev_priv(dev);
1244        int stopped;
1245
1246        ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1247        init_completion(&priv->ntbl.deleted);
1248        set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1249
1250        /* Stop GC if called at init fail need to cancel work */
1251        stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1252        if (!stopped)
1253                cancel_delayed_work(&priv->neigh_reap_task);
1254
1255        ipoib_flush_neighs(priv);
1256
1257        wait_for_completion(&priv->ntbl.deleted);
1258}
1259
1260
1261int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
1262{
1263        struct ipoib_dev_priv *priv = netdev_priv(dev);
1264
1265        if (ipoib_neigh_hash_init(priv) < 0)
1266                goto out;
1267        /* Allocate RX/TX "rings" to hold queued skbs */
1268        priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
1269                                GFP_KERNEL);
1270        if (!priv->rx_ring) {
1271                printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
1272                       ca->name, ipoib_recvq_size);
1273                goto out_neigh_hash_cleanup;
1274        }
1275
1276        priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
1277        if (!priv->tx_ring) {
1278                printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
1279                       ca->name, ipoib_sendq_size);
1280                goto out_rx_ring_cleanup;
1281        }
1282
1283        /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
1284
1285        if (ipoib_ib_dev_init(dev, ca, port))
1286                goto out_tx_ring_cleanup;
1287
1288        return 0;
1289
1290out_tx_ring_cleanup:
1291        vfree(priv->tx_ring);
1292
1293out_rx_ring_cleanup:
1294        kfree(priv->rx_ring);
1295
1296out_neigh_hash_cleanup:
1297        ipoib_neigh_hash_uninit(dev);
1298out:
1299        return -ENOMEM;
1300}
1301
1302void ipoib_dev_cleanup(struct net_device *dev)
1303{
1304        struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
1305        LIST_HEAD(head);
1306
1307        ASSERT_RTNL();
1308
1309        ipoib_delete_debug_files(dev);
1310
1311        /* Delete any child interfaces first */
1312        list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1313                /* Stop GC on child */
1314                set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
1315                cancel_delayed_work(&cpriv->neigh_reap_task);
1316                unregister_netdevice_queue(cpriv->dev, &head);
1317        }
1318        unregister_netdevice_many(&head);
1319
1320        ipoib_ib_dev_cleanup(dev);
1321
1322        kfree(priv->rx_ring);
1323        vfree(priv->tx_ring);
1324
1325        priv->rx_ring = NULL;
1326        priv->tx_ring = NULL;
1327
1328        ipoib_neigh_hash_uninit(dev);
1329}
1330
/* Link-layer header callbacks; installed as dev->header_ops by ipoib_setup(). */
static const struct header_ops ipoib_header_ops = {
        .create = ipoib_hard_header,
};
1334
/* net_device callbacks; installed as dev->netdev_ops by ipoib_setup(). */
static const struct net_device_ops ipoib_netdev_ops = {
        .ndo_uninit              = ipoib_uninit,
        .ndo_open                = ipoib_open,
        .ndo_stop                = ipoib_stop,
        .ndo_change_mtu          = ipoib_change_mtu,
        .ndo_fix_features        = ipoib_fix_features,
        .ndo_start_xmit          = ipoib_start_xmit,
        .ndo_tx_timeout          = ipoib_timeout,
        /* rx-mode changes are handled by the multicast list handler */
        .ndo_set_rx_mode         = ipoib_set_mcast_list,
};
1345
1346void ipoib_setup(struct net_device *dev)
1347{
1348        struct ipoib_dev_priv *priv = netdev_priv(dev);
1349
1350        dev->netdev_ops          = &ipoib_netdev_ops;
1351        dev->header_ops          = &ipoib_header_ops;
1352
1353        ipoib_set_ethtool_ops(dev);
1354
1355        netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
1356
1357        dev->watchdog_timeo      = HZ;
1358
1359        dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
1360
1361        dev->hard_header_len     = IPOIB_ENCAP_LEN;
1362        dev->addr_len            = INFINIBAND_ALEN;
1363        dev->type                = ARPHRD_INFINIBAND;
1364        dev->tx_queue_len        = ipoib_sendq_size * 2;
1365        dev->features            = (NETIF_F_VLAN_CHALLENGED     |
1366                                    NETIF_F_HIGHDMA);
1367        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1368
1369        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
1370
1371        priv->dev = dev;
1372
1373        spin_lock_init(&priv->lock);
1374
1375        init_rwsem(&priv->vlan_rwsem);
1376
1377        INIT_LIST_HEAD(&priv->path_list);
1378        INIT_LIST_HEAD(&priv->child_intfs);
1379        INIT_LIST_HEAD(&priv->dead_ahs);
1380        INIT_LIST_HEAD(&priv->multicast_list);
1381
1382        INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
1383        INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
1384        INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1385        INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
1386        INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
1387        INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
1388        INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
1389        INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1390        INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
1391}
1392
1393struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
1394{
1395        struct net_device *dev;
1396
1397        dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
1398                           ipoib_setup);
1399        if (!dev)
1400                return NULL;
1401
1402        return netdev_priv(dev);
1403}
1404
1405static ssize_t show_pkey(struct device *dev,
1406                         struct device_attribute *attr, char *buf)
1407{
1408        struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1409
1410        return sprintf(buf, "0x%04x\n", priv->pkey);
1411}
1412static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
1413
1414static ssize_t show_umcast(struct device *dev,
1415                           struct device_attribute *attr, char *buf)
1416{
1417        struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1418
1419        return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
1420}
1421
1422void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
1423{
1424        struct ipoib_dev_priv *priv = netdev_priv(ndev);
1425
1426        if (umcast_val > 0) {
1427                set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1428                ipoib_warn(priv, "ignoring multicast groups joined directly "
1429                                "by userspace\n");
1430        } else
1431                clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1432}
1433
1434static ssize_t set_umcast(struct device *dev,
1435                          struct device_attribute *attr,
1436                          const char *buf, size_t count)
1437{
1438        unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
1439
1440        ipoib_set_umcast(to_net_dev(dev), umcast_val);
1441
1442        return count;
1443}
1444static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
1445
1446int ipoib_add_umcast_attr(struct net_device *dev)
1447{
1448        return device_create_file(&dev->dev, &dev_attr_umcast);
1449}
1450
1451static ssize_t create_child(struct device *dev,
1452                            struct device_attribute *attr,
1453                            const char *buf, size_t count)
1454{
1455        int pkey;
1456        int ret;
1457
1458        if (sscanf(buf, "%i", &pkey) != 1)
1459                return -EINVAL;
1460
1461        if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
1462                return -EINVAL;
1463
1464        /*
1465         * Set the full membership bit, so that we join the right
1466         * broadcast group, etc.
1467         */
1468        pkey |= 0x8000;
1469
1470        ret = ipoib_vlan_add(to_net_dev(dev), pkey);
1471
1472        return ret ? ret : count;
1473}
1474static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
1475
1476static ssize_t delete_child(struct device *dev,
1477                            struct device_attribute *attr,
1478                            const char *buf, size_t count)
1479{
1480        int pkey;
1481        int ret;
1482
1483        if (sscanf(buf, "%i", &pkey) != 1)
1484                return -EINVAL;
1485
1486        if (pkey < 0 || pkey > 0xffff)
1487                return -EINVAL;
1488
1489        ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
1490
1491        return ret ? ret : count;
1492
1493}
1494static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
1495
1496int ipoib_add_pkey_attr(struct net_device *dev)
1497{
1498        return device_create_file(&dev->dev, &dev_attr_pkey);
1499}
1500
1501int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
1502{
1503        struct ib_device_attr *device_attr;
1504        int result = -ENOMEM;
1505
1506        device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
1507        if (!device_attr) {
1508                printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
1509                       hca->name, sizeof *device_attr);
1510                return result;
1511        }
1512
1513        result = ib_query_device(hca, device_attr);
1514        if (result) {
1515                printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
1516                       hca->name, result);
1517                kfree(device_attr);
1518                return result;
1519        }
1520        priv->hca_caps = device_attr->device_cap_flags;
1521
1522        kfree(device_attr);
1523
1524        if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
1525                priv->dev->hw_features = NETIF_F_SG |
1526                        NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
1527
1528                if (priv->hca_caps & IB_DEVICE_UD_TSO)
1529                        priv->dev->hw_features |= NETIF_F_TSO;
1530
1531                priv->dev->features |= priv->dev->hw_features;
1532        }
1533
1534        return 0;
1535}
1536
1537static struct net_device *ipoib_add_port(const char *format,
1538                                         struct ib_device *hca, u8 port)
1539{
1540        struct ipoib_dev_priv *priv;
1541        struct ib_port_attr attr;
1542        int result = -ENOMEM;
1543
1544        priv = ipoib_intf_alloc(format);
1545        if (!priv)
1546                goto alloc_mem_failed;
1547
1548        SET_NETDEV_DEV(priv->dev, hca->dma_device);
1549        priv->dev->dev_id = port - 1;
1550
1551        if (!ib_query_port(hca, port, &attr))
1552                priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
1553        else {
1554                printk(KERN_WARNING "%s: ib_query_port %d failed\n",
1555                       hca->name, port);
1556                goto device_init_failed;
1557        }
1558
1559        /* MTU will be reset when mcast join happens */
1560        priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
1561        priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
1562
1563        priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
1564
1565        result = ib_query_pkey(hca, port, 0, &priv->pkey);
1566        if (result) {
1567                printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
1568                       hca->name, port, result);
1569                goto device_init_failed;
1570        }
1571
1572        if (ipoib_set_dev_features(priv, hca))
1573                goto device_init_failed;
1574
1575        /*
1576         * Set the full membership bit, so that we join the right
1577         * broadcast group, etc.
1578         */
1579        priv->pkey |= 0x8000;
1580
1581        priv->dev->broadcast[8] = priv->pkey >> 8;
1582        priv->dev->broadcast[9] = priv->pkey & 0xff;
1583
1584        result = ib_query_gid(hca, port, 0, &priv->local_gid);
1585        if (result) {
1586                printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
1587                       hca->name, port, result);
1588                goto device_init_failed;
1589        } else
1590                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
1591
1592        result = ipoib_dev_init(priv->dev, hca, port);
1593        if (result < 0) {
1594                printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
1595                       hca->name, port, result);
1596                goto device_init_failed;
1597        }
1598
1599        INIT_IB_EVENT_HANDLER(&priv->event_handler,
1600                              priv->ca, ipoib_event);
1601        result = ib_register_event_handler(&priv->event_handler);
1602        if (result < 0) {
1603                printk(KERN_WARNING "%s: ib_register_event_handler failed for "
1604                       "port %d (ret = %d)\n",
1605                       hca->name, port, result);
1606                goto event_failed;
1607        }
1608
1609        result = register_netdev(priv->dev);
1610        if (result) {
1611                printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
1612                       hca->name, port, result);
1613                goto register_failed;
1614        }
1615
1616        ipoib_create_debug_files(priv->dev);
1617
1618        if (ipoib_cm_add_mode_attr(priv->dev))
1619                goto sysfs_failed;
1620        if (ipoib_add_pkey_attr(priv->dev))
1621                goto sysfs_failed;
1622        if (ipoib_add_umcast_attr(priv->dev))
1623                goto sysfs_failed;
1624        if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
1625                goto sysfs_failed;
1626        if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
1627                goto sysfs_failed;
1628
1629        return priv->dev;
1630
1631sysfs_failed:
1632        ipoib_delete_debug_files(priv->dev);
1633        unregister_netdev(priv->dev);
1634
1635register_failed:
1636        ib_unregister_event_handler(&priv->event_handler);
1637        /* Stop GC if started before flush */
1638        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1639        cancel_delayed_work(&priv->neigh_reap_task);
1640        flush_workqueue(ipoib_workqueue);
1641
1642event_failed:
1643        ipoib_dev_cleanup(priv->dev);
1644
1645device_init_failed:
1646        free_netdev(priv->dev);
1647
1648alloc_mem_failed:
1649        return ERR_PTR(result);
1650}
1651
1652static void ipoib_add_one(struct ib_device *device)
1653{
1654        struct list_head *dev_list;
1655        struct net_device *dev;
1656        struct ipoib_dev_priv *priv;
1657        int s, e, p;
1658
1659        if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1660                return;
1661
1662        dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
1663        if (!dev_list)
1664                return;
1665
1666        INIT_LIST_HEAD(dev_list);
1667
1668        if (device->node_type == RDMA_NODE_IB_SWITCH) {
1669                s = 0;
1670                e = 0;
1671        } else {
1672                s = 1;
1673                e = device->phys_port_cnt;
1674        }
1675
1676        for (p = s; p <= e; ++p) {
1677                if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
1678                        continue;
1679                dev = ipoib_add_port("ib%d", device, p);
1680                if (!IS_ERR(dev)) {
1681                        priv = netdev_priv(dev);
1682                        list_add_tail(&priv->list, dev_list);
1683                }
1684        }
1685
1686        ib_set_client_data(device, &ipoib_client, dev_list);
1687}
1688
1689static void ipoib_remove_one(struct ib_device *device)
1690{
1691        struct ipoib_dev_priv *priv, *tmp;
1692        struct list_head *dev_list;
1693
1694        if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1695                return;
1696
1697        dev_list = ib_get_client_data(device, &ipoib_client);
1698        if (!dev_list)
1699                return;
1700
1701        list_for_each_entry_safe(priv, tmp, dev_list, list) {
1702                ib_unregister_event_handler(&priv->event_handler);
1703
1704                rtnl_lock();
1705                dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
1706                rtnl_unlock();
1707
1708                /* Stop GC */
1709                set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1710                cancel_delayed_work(&priv->neigh_reap_task);
1711                flush_workqueue(ipoib_workqueue);
1712
1713                unregister_netdev(priv->dev);
1714                free_netdev(priv->dev);
1715        }
1716
1717        kfree(dev_list);
1718}
1719
1720static int __init ipoib_init_module(void)
1721{
1722        int ret;
1723
1724        ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
1725        ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
1726        ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
1727
1728        ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
1729        ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
1730        ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
1731#ifdef CONFIG_INFINIBAND_IPOIB_CM
1732        ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
1733#endif
1734
1735        /*
1736         * When copying small received packets, we only copy from the
1737         * linear data part of the SKB, so we rely on this condition.
1738         */
1739        BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
1740
1741        ret = ipoib_register_debugfs();
1742        if (ret)
1743                return ret;
1744
1745        /*
1746         * We create our own workqueue mainly because we want to be
1747         * able to flush it when devices are being removed.  We can't
1748         * use schedule_work()/flush_scheduled_work() because both
1749         * unregister_netdev() and linkwatch_event take the rtnl lock,
1750         * so flush_scheduled_work() can deadlock during device
1751         * removal.
1752         */
1753        ipoib_workqueue = create_singlethread_workqueue("ipoib");
1754        if (!ipoib_workqueue) {
1755                ret = -ENOMEM;
1756                goto err_fs;
1757        }
1758
1759        ib_sa_register_client(&ipoib_sa_client);
1760
1761        ret = ib_register_client(&ipoib_client);
1762        if (ret)
1763                goto err_sa;
1764
1765        ret = ipoib_netlink_init();
1766        if (ret)
1767                goto err_client;
1768
1769        return 0;
1770
1771err_client:
1772        ib_unregister_client(&ipoib_client);
1773
1774err_sa:
1775        ib_sa_unregister_client(&ipoib_sa_client);
1776        destroy_workqueue(ipoib_workqueue);
1777
1778err_fs:
1779        ipoib_unregister_debugfs();
1780
1781        return ret;
1782}
1783
1784static void __exit ipoib_cleanup_module(void)
1785{
1786        ipoib_netlink_fini();
1787        ib_unregister_client(&ipoib_client);
1788        ib_sa_unregister_client(&ipoib_sa_client);
1789        ipoib_unregister_debugfs();
1790        destroy_workqueue(ipoib_workqueue);
1791}
1792
/* Register the module's load and unload entry points. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);
1795