linux/drivers/infiniband/ulp/ipoib/ipoib_main.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
   3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
   4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
   5 *
   6 * This software is available to you under a choice of one of two
   7 * licenses.  You may choose to be licensed under the terms of the GNU
   8 * General Public License (GPL) Version 2, available from the file
   9 * COPYING in the main directory of this source tree, or the
  10 * OpenIB.org BSD license below:
  11 *
  12 *     Redistribution and use in source and binary forms, with or
  13 *     without modification, are permitted provided that the following
  14 *     conditions are met:
  15 *
  16 *      - Redistributions of source code must retain the above
  17 *        copyright notice, this list of conditions and the following
  18 *        disclaimer.
  19 *
  20 *      - Redistributions in binary form must reproduce the above
  21 *        copyright notice, this list of conditions and the following
  22 *        disclaimer in the documentation and/or other materials
  23 *        provided with the distribution.
  24 *
  25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32 * SOFTWARE.
  33 */
  34
  35#include "ipoib.h"
  36
  37#include <linux/module.h>
  38
  39#include <linux/init.h>
  40#include <linux/slab.h>
  41#include <linux/kernel.h>
  42#include <linux/vmalloc.h>
  43
  44#include <linux/if_arp.h>       /* For ARPHRD_xxx */
  45
  46#include <linux/ip.h>
  47#include <linux/in.h>
  48
  49#include <linux/jhash.h>
  50#include <net/arp.h>
  51
   /* Driver version string; also reported through MODULE_VERSION() below. */
   #define DRV_VERSION "1.0.0"

   /* Non-static copy of the version string, so it has external linkage. */
   const char ipoib_driver_version[] = DRV_VERSION;

   /* Module metadata. */
   MODULE_AUTHOR("Roland Dreier");
   MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
   MODULE_LICENSE("Dual BSD/GPL");
   MODULE_VERSION(DRV_VERSION);
  60
   /*
    * Send and receive queue sizes.  They default to IPOIB_TX_RING_SIZE and
    * IPOIB_RX_RING_SIZE and are exposed as the "send_queue_size" and
    * "recv_queue_size" module parameters, registered with mode 0444.
    */
   int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
   int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

   module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
   MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
   module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
   MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
  68
   #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
   /*
    * Debug tracing level, only built when CONFIG_INFINIBAND_IPOIB_DEBUG is set.
    * Exposed as the "debug_level" module parameter, registered with mode 0644.
    */
   int ipoib_debug_level;

   module_param_named(debug_level, ipoib_debug_level, int, 0644);
   MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
   #endif
  75
   /*
    * Cursor used by the ipoib_path_iter_*() helpers: the device whose path
    * tree is being walked, plus a by-value copy of the path most recently
    * returned by ipoib_path_iter_next().
    */
   struct ipoib_path_iter {
           struct net_device *dev;
           struct ipoib_path  path;
   };
  80
   /*
    * 20-byte constant; judging by its name, the IPoIB hardware address used
    * for IPv4 broadcast.  NOTE(review): it is not referenced in the part of
    * the file visible here -- confirm where it is consumed.
    */
   static const u8 ipv4_bcast_addr[] = {
           0x00, 0xff, 0xff, 0xff,
           0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
           0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
   };
  86
   /* Workqueue on which ipoib_set_mcast_list() queues priv->restart_task. */
   struct workqueue_struct *ipoib_workqueue;

   /* SA client handle passed to ib_sa_path_rec_get() by path_rec_start(). */
   struct ib_sa_client ipoib_sa_client;

   /* Forward declarations for callbacks defined later in the file. */
   static void ipoib_add_one(struct ib_device *device);
   static void ipoib_remove_one(struct ib_device *device);
   static void ipoib_neigh_reclaim(struct rcu_head *rp);
  94
   /*
    * IB client descriptor naming this driver "ipoib" and wiring up the
    * per-device add/remove callbacks declared above.
    */
   static struct ib_client ipoib_client = {
           .name   = "ipoib",
           .add    = ipoib_add_one,
           .remove = ipoib_remove_one
   };
 100
 101int ipoib_open(struct net_device *dev)
 102{
 103        struct ipoib_dev_priv *priv = netdev_priv(dev);
 104
 105        ipoib_dbg(priv, "bringing up interface\n");
 106
 107        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 108
 109        if (ipoib_pkey_dev_delay_open(dev))
 110                return 0;
 111
 112        if (ipoib_ib_dev_open(dev))
 113                goto err_disable;
 114
 115        if (ipoib_ib_dev_up(dev))
 116                goto err_stop;
 117
 118        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 119                struct ipoib_dev_priv *cpriv;
 120
 121                /* Bring up any child interfaces too */
 122                mutex_lock(&priv->vlan_mutex);
 123                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 124                        int flags;
 125
 126                        flags = cpriv->dev->flags;
 127                        if (flags & IFF_UP)
 128                                continue;
 129
 130                        dev_change_flags(cpriv->dev, flags | IFF_UP);
 131                }
 132                mutex_unlock(&priv->vlan_mutex);
 133        }
 134
 135        netif_start_queue(dev);
 136
 137        return 0;
 138
 139err_stop:
 140        ipoib_ib_dev_stop(dev, 1);
 141
 142err_disable:
 143        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 144
 145        return -EINVAL;
 146}
 147
 148static int ipoib_stop(struct net_device *dev)
 149{
 150        struct ipoib_dev_priv *priv = netdev_priv(dev);
 151
 152        ipoib_dbg(priv, "stopping interface\n");
 153
 154        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 155
 156        netif_stop_queue(dev);
 157
 158        ipoib_ib_dev_down(dev, 1);
 159        ipoib_ib_dev_stop(dev, 0);
 160
 161        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 162                struct ipoib_dev_priv *cpriv;
 163
 164                /* Bring down any child interfaces too */
 165                mutex_lock(&priv->vlan_mutex);
 166                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 167                        int flags;
 168
 169                        flags = cpriv->dev->flags;
 170                        if (!(flags & IFF_UP))
 171                                continue;
 172
 173                        dev_change_flags(cpriv->dev, flags & ~IFF_UP);
 174                }
 175                mutex_unlock(&priv->vlan_mutex);
 176        }
 177
 178        return 0;
 179}
 180
  /* Uninit hook for the net device: delegates entirely to ipoib_dev_cleanup(). */
  static void ipoib_uninit(struct net_device *dev)
  {
          ipoib_dev_cleanup(dev);
  }
 185
 186static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
 187{
 188        struct ipoib_dev_priv *priv = netdev_priv(dev);
 189
 190        if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
 191                features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
 192
 193        return features;
 194}
 195
 196static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 197{
 198        struct ipoib_dev_priv *priv = netdev_priv(dev);
 199
 200        /* dev->mtu > 2K ==> connected mode */
 201        if (ipoib_cm_admin_enabled(dev)) {
 202                if (new_mtu > ipoib_cm_max_mtu(dev))
 203                        return -EINVAL;
 204
 205                if (new_mtu > priv->mcast_mtu)
 206                        ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 207                                   priv->mcast_mtu);
 208
 209                dev->mtu = new_mtu;
 210                return 0;
 211        }
 212
 213        if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 214                return -EINVAL;
 215
 216        priv->admin_mtu = new_mtu;
 217
 218        dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 219
 220        return 0;
 221}
 222
 223int ipoib_set_mode(struct net_device *dev, const char *buf)
 224{
 225        struct ipoib_dev_priv *priv = netdev_priv(dev);
 226
 227        /* flush paths if we switch modes so that connections are restarted */
 228        if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
 229                set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 230                ipoib_warn(priv, "enabling connected mode "
 231                           "will cause multicast packet drops\n");
 232                netdev_update_features(dev);
 233                rtnl_unlock();
 234                priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 235
 236                ipoib_flush_paths(dev);
 237                rtnl_lock();
 238                return 0;
 239        }
 240
 241        if (!strcmp(buf, "datagram\n")) {
 242                clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 243                netdev_update_features(dev);
 244                dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
 245                rtnl_unlock();
 246                ipoib_flush_paths(dev);
 247                rtnl_lock();
 248                return 0;
 249        }
 250
 251        return -EINVAL;
 252}
 253
 254static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
 255{
 256        struct ipoib_dev_priv *priv = netdev_priv(dev);
 257        struct rb_node *n = priv->path_tree.rb_node;
 258        struct ipoib_path *path;
 259        int ret;
 260
 261        while (n) {
 262                path = rb_entry(n, struct ipoib_path, rb_node);
 263
 264                ret = memcmp(gid, path->pathrec.dgid.raw,
 265                             sizeof (union ib_gid));
 266
 267                if (ret < 0)
 268                        n = n->rb_left;
 269                else if (ret > 0)
 270                        n = n->rb_right;
 271                else
 272                        return path;
 273        }
 274
 275        return NULL;
 276}
 277
 278static int __path_add(struct net_device *dev, struct ipoib_path *path)
 279{
 280        struct ipoib_dev_priv *priv = netdev_priv(dev);
 281        struct rb_node **n = &priv->path_tree.rb_node;
 282        struct rb_node *pn = NULL;
 283        struct ipoib_path *tpath;
 284        int ret;
 285
 286        while (*n) {
 287                pn = *n;
 288                tpath = rb_entry(pn, struct ipoib_path, rb_node);
 289
 290                ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
 291                             sizeof (union ib_gid));
 292                if (ret < 0)
 293                        n = &pn->rb_left;
 294                else if (ret > 0)
 295                        n = &pn->rb_right;
 296                else
 297                        return -EEXIST;
 298        }
 299
 300        rb_link_node(&path->rb_node, pn, n);
 301        rb_insert_color(&path->rb_node, &priv->path_tree);
 302
 303        list_add_tail(&path->list, &priv->path_list);
 304
 305        return 0;
 306}
 307
 308static void path_free(struct net_device *dev, struct ipoib_path *path)
 309{
 310        struct sk_buff *skb;
 311
 312        while ((skb = __skb_dequeue(&path->queue)))
 313                dev_kfree_skb_irq(skb);
 314
 315        ipoib_dbg(netdev_priv(dev), "path_free\n");
 316
 317        /* remove all neigh connected to this path */
 318        ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
 319
 320        if (path->ah)
 321                ipoib_put_ah(path->ah);
 322
 323        kfree(path);
 324}
 325
 326#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 327
 328struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
 329{
 330        struct ipoib_path_iter *iter;
 331
 332        iter = kmalloc(sizeof *iter, GFP_KERNEL);
 333        if (!iter)
 334                return NULL;
 335
 336        iter->dev = dev;
 337        memset(iter->path.pathrec.dgid.raw, 0, 16);
 338
 339        if (ipoib_path_iter_next(iter)) {
 340                kfree(iter);
 341                return NULL;
 342        }
 343
 344        return iter;
 345}
 346
 347int ipoib_path_iter_next(struct ipoib_path_iter *iter)
 348{
 349        struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
 350        struct rb_node *n;
 351        struct ipoib_path *path;
 352        int ret = 1;
 353
 354        spin_lock_irq(&priv->lock);
 355
 356        n = rb_first(&priv->path_tree);
 357
 358        while (n) {
 359                path = rb_entry(n, struct ipoib_path, rb_node);
 360
 361                if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
 362                           sizeof (union ib_gid)) < 0) {
 363                        iter->path = *path;
 364                        ret = 0;
 365                        break;
 366                }
 367
 368                n = rb_next(n);
 369        }
 370
 371        spin_unlock_irq(&priv->lock);
 372
 373        return ret;
 374}
 375
  /* Copy the path snapshot currently held by @iter into the caller's @path. */
  void ipoib_path_iter_read(struct ipoib_path_iter *iter,
                            struct ipoib_path *path)
  {
          *path = iter->path;
  }
 381
 382#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 383
 384void ipoib_mark_paths_invalid(struct net_device *dev)
 385{
 386        struct ipoib_dev_priv *priv = netdev_priv(dev);
 387        struct ipoib_path *path, *tp;
 388
 389        spin_lock_irq(&priv->lock);
 390
 391        list_for_each_entry_safe(path, tp, &priv->path_list, list) {
 392                ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
 393                        be16_to_cpu(path->pathrec.dlid),
 394                        path->pathrec.dgid.raw);
 395                path->valid =  0;
 396        }
 397
 398        spin_unlock_irq(&priv->lock);
 399}
 400
  /*
   * Remove and free every path of @dev.
   *
   * With the TX lock and priv->lock held, the whole path list is moved to a
   * private list and each path is erased from the rb-tree.  Each path is then
   * torn down in turn: an outstanding SA query is cancelled, both locks are
   * released while waiting for path->done and calling path_free(), and the
   * locks are re-taken before moving on to the next entry.
   */
  void ipoib_flush_paths(struct net_device *dev)
  {
          struct ipoib_dev_priv *priv = netdev_priv(dev);
          struct ipoib_path *path, *tp;
          LIST_HEAD(remove_list);
          unsigned long flags;

          netif_tx_lock_bh(dev);
          spin_lock_irqsave(&priv->lock, flags);

          /* Detach all paths at once; priv->path_list is left empty. */
          list_splice_init(&priv->path_list, &remove_list);

          list_for_each_entry(path, &remove_list, list)
                  rb_erase(&path->rb_node, &priv->path_tree);

          list_for_each_entry_safe(path, tp, &remove_list, list) {
                  if (path->query)
                          ib_sa_cancel_query(path->query_id, path->query);
                  /* Drop both locks around the wait and the free. */
                  spin_unlock_irqrestore(&priv->lock, flags);
                  netif_tx_unlock_bh(dev);
                  wait_for_completion(&path->done);
                  path_free(dev, path);
                  netif_tx_lock_bh(dev);
                  spin_lock_irqsave(&priv->lock, flags);
          }

          spin_unlock_irqrestore(&priv->lock, flags);
          netif_tx_unlock_bh(dev);
  }
 430
  /*
   * Completion callback handed to ib_sa_path_rec_get() by path_rec_start().
   *
   * @status:   0 when the query succeeded, non-zero otherwise
   * @pathrec:  the returned path record (only used when @status is 0)
   * @path_ptr: the struct ipoib_path the query was started for
   *
   * On success an address handle is created from @pathrec and, under
   * priv->lock, installed on the path and on every neighbour on the path's
   * neigh_list; skbs queued on the path and on those neighbours are moved to
   * a local queue.  In all cases path->query is cleared and path->done is
   * completed.  After the lock is dropped, the neighbours for this GID are
   * deleted if no address handle was obtained, the previous address handle
   * (if any) is released, and the collected skbs are re-submitted through
   * dev_queue_xmit().
   */
  static void path_rec_completion(int status,
                                  struct ib_sa_path_rec *pathrec,
                                  void *path_ptr)
  {
          struct ipoib_path *path = path_ptr;
          struct net_device *dev = path->dev;
          struct ipoib_dev_priv *priv = netdev_priv(dev);
          struct ipoib_ah *ah = NULL;
          struct ipoib_ah *old_ah = NULL;
          struct ipoib_neigh *neigh, *tn;
          struct sk_buff_head skqueue;
          struct sk_buff *skb;
          unsigned long flags;

          if (!status)
                  ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
                            be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
          else
                  ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
                            status, path->pathrec.dgid.raw);

          /* Local queue collecting skbs to retransmit once the lock is dropped. */
          skb_queue_head_init(&skqueue);

          if (!status) {
                  struct ib_ah_attr av;

                  if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
                          ah = ipoib_create_ah(dev, priv->pd, &av);
          }

          spin_lock_irqsave(&priv->lock, flags);

          if (!IS_ERR_OR_NULL(ah)) {
                  path->pathrec = *pathrec;

                  /* Keep the previous handle; it is released after unlocking. */
                  old_ah   = path->ah;
                  path->ah = ah;

                  ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
                            ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

                  while ((skb = __skb_dequeue(&path->queue)))
                          __skb_queue_tail(&skqueue, skb);

                  list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
                          if (neigh->ah) {
                                  WARN_ON(neigh->ah != old_ah);
                                  /*
                                   * Dropping the ah reference inside
                                   * priv->lock is safe here, because we
                                   * will hold one more reference from
                                   * the original value of path->ah (ie
                                   * old_ah).
                                   */
                                  ipoib_put_ah(neigh->ah);
                          }
                          kref_get(&path->ah->ref);
                          neigh->ah = path->ah;

                          if (ipoib_cm_enabled(dev, neigh->daddr)) {
                                  if (!ipoib_cm_get(neigh))
                                          ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
                                                                                 path,
                                                                                 neigh));
                                  /* Still no CM state: drop this neighbour. */
                                  if (!ipoib_cm_get(neigh)) {
                                          list_del(&neigh->list);
                                          ipoib_neigh_free(neigh);
                                          continue;
                                  }
                          }

                          while ((skb = __skb_dequeue(&neigh->queue)))
                                  __skb_queue_tail(&skqueue, skb);
                  }
                  path->valid = 1;
          }

          /* Query is finished either way; wake anyone waiting on path->done. */
          path->query = NULL;
          complete(&path->done);

          spin_unlock_irqrestore(&priv->lock, flags);

          if (IS_ERR_OR_NULL(ah))
                  ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

          if (old_ah)
                  ipoib_put_ah(old_ah);

          while ((skb = __skb_dequeue(&skqueue))) {
                  skb->dev = dev;
                  if (dev_queue_xmit(skb))
                          ipoib_warn(priv, "dev_queue_xmit failed "
                                     "to requeue packet\n");
          }
  }
 526
 527static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 528{
 529        struct ipoib_dev_priv *priv = netdev_priv(dev);
 530        struct ipoib_path *path;
 531
 532        if (!priv->broadcast)
 533                return NULL;
 534
 535        path = kzalloc(sizeof *path, GFP_ATOMIC);
 536        if (!path)
 537                return NULL;
 538
 539        path->dev = dev;
 540
 541        skb_queue_head_init(&path->queue);
 542
 543        INIT_LIST_HEAD(&path->neigh_list);
 544
 545        memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
 546        path->pathrec.sgid          = priv->local_gid;
 547        path->pathrec.pkey          = cpu_to_be16(priv->pkey);
 548        path->pathrec.numb_path     = 1;
 549        path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 550
 551        return path;
 552}
 553
 554static int path_rec_start(struct net_device *dev,
 555                          struct ipoib_path *path)
 556{
 557        struct ipoib_dev_priv *priv = netdev_priv(dev);
 558
 559        ipoib_dbg(priv, "Start path record lookup for %pI6\n",
 560                  path->pathrec.dgid.raw);
 561
 562        init_completion(&path->done);
 563
 564        path->query_id =
 565                ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
 566                                   &path->pathrec,
 567                                   IB_SA_PATH_REC_DGID          |
 568                                   IB_SA_PATH_REC_SGID          |
 569                                   IB_SA_PATH_REC_NUMB_PATH     |
 570                                   IB_SA_PATH_REC_TRAFFIC_CLASS |
 571                                   IB_SA_PATH_REC_PKEY,
 572                                   1000, GFP_ATOMIC,
 573                                   path_rec_completion,
 574                                   path, &path->query);
 575        if (path->query_id < 0) {
 576                ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
 577                path->query = NULL;
 578                complete(&path->done);
 579                return path->query_id;
 580        }
 581
 582        return 0;
 583}
 584
  /*
   * Transmit path for a destination that has no neighbour entry yet.
   *
   * Under priv->lock, allocates a neighbour for @daddr, finds or creates the
   * path for the GID at @daddr + 4 and links the neighbour to that path.
   * If the path already has an address handle, the skb is either sent at
   * once (datagram) or queued on the neighbour (connected mode, subject to
   * IPOIB_MAX_PATH_REC_QUEUE).  Otherwise a path-record query is started if
   * none is pending and the skb is queued on the neighbour.
   *
   * The skb is always consumed: it is sent, queued, or dropped with
   * dev->stats.tx_dropped incremented.
   */
  static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
                             struct net_device *dev)
  {
          struct ipoib_dev_priv *priv = netdev_priv(dev);
          struct ipoib_path *path;
          struct ipoib_neigh *neigh;
          unsigned long flags;

          spin_lock_irqsave(&priv->lock, flags);
          neigh = ipoib_neigh_alloc(daddr, dev);
          if (!neigh) {
                  spin_unlock_irqrestore(&priv->lock, flags);
                  ++dev->stats.tx_dropped;
                  dev_kfree_skb_any(skb);
                  return;
          }

          /* The destination GID starts at byte 4 of the hardware address. */
          path = __path_find(dev, daddr + 4);
          if (!path) {
                  path = path_rec_create(dev, daddr + 4);
                  if (!path)
                          goto err_path;

                  __path_add(dev, path);
          }

          list_add_tail(&neigh->list, &path->neigh_list);

          if (path->ah) {
                  /* Path already resolved: share its address handle. */
                  kref_get(&path->ah->ref);
                  neigh->ah = path->ah;

                  if (ipoib_cm_enabled(dev, neigh->daddr)) {
                          if (!ipoib_cm_get(neigh))
                                  ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
                          if (!ipoib_cm_get(neigh)) {
                                  list_del(&neigh->list);
                                  ipoib_neigh_free(neigh);
                                  goto err_drop;
                          }
                          if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
                                  __skb_queue_tail(&neigh->queue, skb);
                          else {
                                  ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
                                             skb_queue_len(&neigh->queue));
                                  goto err_drop;
                          }
                  } else {
                          /* Datagram mode: send immediately, outside the lock. */
                          spin_unlock_irqrestore(&priv->lock, flags);
                          ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
                          ipoib_neigh_put(neigh);
                          return;
                  }
          } else {
                  neigh->ah  = NULL;

                  /* Start a lookup unless one is already in flight. */
                  if (!path->query && path_rec_start(dev, path))
                          goto err_list;

                  __skb_queue_tail(&neigh->queue, skb);
          }

          spin_unlock_irqrestore(&priv->lock, flags);
          ipoib_neigh_put(neigh);
          return;

  err_list:
          list_del(&neigh->list);

  err_path:
          ipoib_neigh_free(neigh);
  err_drop:
          ++dev->stats.tx_dropped;
          dev_kfree_skb_any(skb);

          spin_unlock_irqrestore(&priv->lock, flags);
          ipoib_neigh_put(neigh);
  }
 663
 664static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 665                             struct ipoib_cb *cb)
 666{
 667        struct ipoib_dev_priv *priv = netdev_priv(dev);
 668        struct ipoib_path *path;
 669        unsigned long flags;
 670
 671        spin_lock_irqsave(&priv->lock, flags);
 672
 673        path = __path_find(dev, cb->hwaddr + 4);
 674        if (!path || !path->valid) {
 675                int new_path = 0;
 676
 677                if (!path) {
 678                        path = path_rec_create(dev, cb->hwaddr + 4);
 679                        new_path = 1;
 680                }
 681                if (path) {
 682                        __skb_queue_tail(&path->queue, skb);
 683
 684                        if (!path->query && path_rec_start(dev, path)) {
 685                                spin_unlock_irqrestore(&priv->lock, flags);
 686                                if (new_path)
 687                                        path_free(dev, path);
 688                                return;
 689                        } else
 690                                __path_add(dev, path);
 691                } else {
 692                        ++dev->stats.tx_dropped;
 693                        dev_kfree_skb_any(skb);
 694                }
 695
 696                spin_unlock_irqrestore(&priv->lock, flags);
 697                return;
 698        }
 699
 700        if (path->ah) {
 701                ipoib_dbg(priv, "Send unicast ARP to %04x\n",
 702                          be16_to_cpu(path->pathrec.dlid));
 703
 704                spin_unlock_irqrestore(&priv->lock, flags);
 705                ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
 706                return;
 707        } else if ((path->query || !path_rec_start(dev, path)) &&
 708                   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 709                __skb_queue_tail(&path->queue, skb);
 710        } else {
 711                ++dev->stats.tx_dropped;
 712                dev_kfree_skb_any(skb);
 713        }
 714
 715        spin_unlock_irqrestore(&priv->lock, flags);
 716}
 717
  /*
   * Transmit entry point.
   *
   * The destination hardware address is read from skb->cb, where
   * ipoib_hard_header() stored it.  Multicast destinations
   * (cb->hwaddr[4] == 0xff) have the P_Key patched into the address and are
   * sent through an existing neighbour or handed to ipoib_mcast_send().
   * Unicast IP/IPv6/TIPC frames use an existing neighbour or go through
   * neigh_add_path(); unicast ARP/RARP always goes through
   * unicast_arp_send().  Frames with any other ethertype are dropped.
   *
   * With a neighbour in hand, the frame is sent over the connected-mode
   * connection or the neighbour's address handle when available; otherwise it
   * is queued on the neighbour, up to IPOIB_MAX_PATH_REC_QUEUE entries.
   *
   * Always returns NETDEV_TX_OK; drops are counted in dev->stats.tx_dropped.
   */
  static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
  {
          struct ipoib_dev_priv *priv = netdev_priv(dev);
          struct ipoib_neigh *neigh;
          struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
          struct ipoib_header *header;
          unsigned long flags;

          header = (struct ipoib_header *) skb->data;

          if (unlikely(cb->hwaddr[4] == 0xff)) {
                  /* multicast, arrange "if" according to probability */
                  if ((header->proto != htons(ETH_P_IP)) &&
                      (header->proto != htons(ETH_P_IPV6)) &&
                      (header->proto != htons(ETH_P_ARP)) &&
                      (header->proto != htons(ETH_P_RARP)) &&
                      (header->proto != htons(ETH_P_TIPC))) {
                          /* ethertype not supported by IPoIB */
                          ++dev->stats.tx_dropped;
                          dev_kfree_skb_any(skb);
                          return NETDEV_TX_OK;
                  }
                  /* Add in the P_Key for multicast*/
                  cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
                  cb->hwaddr[9] = priv->pkey & 0xff;

                  neigh = ipoib_neigh_get(dev, cb->hwaddr);
                  if (likely(neigh))
                          goto send_using_neigh;
                  ipoib_mcast_send(dev, cb->hwaddr, skb);
                  return NETDEV_TX_OK;
          }

          /* unicast, arrange "switch" according to probability */
          switch (header->proto) {
          case htons(ETH_P_IP):
          case htons(ETH_P_IPV6):
          case htons(ETH_P_TIPC):
                  neigh = ipoib_neigh_get(dev, cb->hwaddr);
                  if (unlikely(!neigh)) {
                          neigh_add_path(skb, cb->hwaddr, dev);
                          return NETDEV_TX_OK;
                  }
                  break;
          case htons(ETH_P_ARP):
          case htons(ETH_P_RARP):
                  /* for unicast ARP and RARP should always perform path find */
                  unicast_arp_send(skb, dev, cb);
                  return NETDEV_TX_OK;
          default:
                  /* ethertype not supported by IPoIB */
                  ++dev->stats.tx_dropped;
                  dev_kfree_skb_any(skb);
                  return NETDEV_TX_OK;
          }

  send_using_neigh:
          /* note we now hold a ref to neigh */
          if (ipoib_cm_get(neigh)) {
                  if (ipoib_cm_up(neigh)) {
                          ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
                          goto unref;
                  }
          } else if (neigh->ah) {
                  ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
                  goto unref;
          }

          /* Neither transport is usable yet: park the frame on the neighbour. */
          if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
                  spin_lock_irqsave(&priv->lock, flags);
                  __skb_queue_tail(&neigh->queue, skb);
                  spin_unlock_irqrestore(&priv->lock, flags);
          } else {
                  ++dev->stats.tx_dropped;
                  dev_kfree_skb_any(skb);
          }

  unref:
          ipoib_neigh_put(neigh);

          return NETDEV_TX_OK;
  }
 800
 801static void ipoib_timeout(struct net_device *dev)
 802{
 803        struct ipoib_dev_priv *priv = netdev_priv(dev);
 804
 805        ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
 806                   jiffies_to_msecs(jiffies - dev->trans_start));
 807        ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
 808                   netif_queue_stopped(dev),
 809                   priv->tx_head, priv->tx_tail);
 810        /* XXX reset QP, etc. */
 811}
 812
 813static int ipoib_hard_header(struct sk_buff *skb,
 814                             struct net_device *dev,
 815                             unsigned short type,
 816                             const void *daddr, const void *saddr, unsigned len)
 817{
 818        struct ipoib_header *header;
 819        struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
 820
 821        header = (struct ipoib_header *) skb_push(skb, sizeof *header);
 822
 823        header->proto = htons(type);
 824        header->reserved = 0;
 825
 826        /*
 827         * we don't rely on dst_entry structure,  always stuff the
 828         * destination address into skb->cb so we can figure out where
 829         * to send the packet later.
 830         */
 831        memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
 832
 833        return sizeof *header;
 834}
 835
 836static void ipoib_set_mcast_list(struct net_device *dev)
 837{
 838        struct ipoib_dev_priv *priv = netdev_priv(dev);
 839
 840        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
 841                ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
 842                return;
 843        }
 844
 845        queue_work(ipoib_workqueue, &priv->restart_task);
 846}
 847
 848static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
 849{
 850        /*
 851         * Use only the address parts that contributes to spreading
 852         * The subnet prefix is not used as one can not connect to
 853         * same remote port (GUID) using the same remote QPN via two
 854         * different subnets.
 855         */
 856         /* qpn octets[1:4) & port GUID octets[12:20) */
 857        u32 *d32 = (u32 *) daddr;
 858        u32 hv;
 859
 860        hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
 861        return hv & htbl->mask;
 862}
 863
 864struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
 865{
 866        struct ipoib_dev_priv *priv = netdev_priv(dev);
 867        struct ipoib_neigh_table *ntbl = &priv->ntbl;
 868        struct ipoib_neigh_hash *htbl;
 869        struct ipoib_neigh *neigh = NULL;
 870        u32 hash_val;
 871
 872        rcu_read_lock_bh();
 873
 874        htbl = rcu_dereference_bh(ntbl->htbl);
 875
 876        if (!htbl)
 877                goto out_unlock;
 878
 879        hash_val = ipoib_addr_hash(htbl, daddr);
 880        for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
 881             neigh != NULL;
 882             neigh = rcu_dereference_bh(neigh->hnext)) {
 883                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
 884                        /* found, take one ref on behalf of the caller */
 885                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
 886                                /* deleted */
 887                                neigh = NULL;
 888                                goto out_unlock;
 889                        }
 890                        neigh->alive = jiffies;
 891                        goto out_unlock;
 892                }
 893        }
 894
 895out_unlock:
 896        rcu_read_unlock_bh();
 897        return neigh;
 898}
 899
 900static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
 901{
 902        struct ipoib_neigh_table *ntbl = &priv->ntbl;
 903        struct ipoib_neigh_hash *htbl;
 904        unsigned long neigh_obsolete;
 905        unsigned long dt;
 906        unsigned long flags;
 907        int i;
 908
 909        if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
 910                return;
 911
 912        spin_lock_irqsave(&priv->lock, flags);
 913
 914        htbl = rcu_dereference_protected(ntbl->htbl,
 915                                         lockdep_is_held(&priv->lock));
 916
 917        if (!htbl)
 918                goto out_unlock;
 919
 920        /* neigh is obsolete if it was idle for two GC periods */
 921        dt = 2 * arp_tbl.gc_interval;
 922        neigh_obsolete = jiffies - dt;
 923        /* handle possible race condition */
 924        if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
 925                goto out_unlock;
 926
 927        for (i = 0; i < htbl->size; i++) {
 928                struct ipoib_neigh *neigh;
 929                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
 930
 931                while ((neigh = rcu_dereference_protected(*np,
 932                                                          lockdep_is_held(&priv->lock))) != NULL) {
 933                        /* was the neigh idle for two GC periods */
 934                        if (time_after(neigh_obsolete, neigh->alive)) {
 935                                rcu_assign_pointer(*np,
 936                                                   rcu_dereference_protected(neigh->hnext,
 937                                                                             lockdep_is_held(&priv->lock)));
 938                                /* remove from path/mc list */
 939                                list_del(&neigh->list);
 940                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
 941                        } else {
 942                                np = &neigh->hnext;
 943                        }
 944
 945                }
 946        }
 947
 948out_unlock:
 949        spin_unlock_irqrestore(&priv->lock, flags);
 950}
 951
 952static void ipoib_reap_neigh(struct work_struct *work)
 953{
 954        struct ipoib_dev_priv *priv =
 955                container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
 956
 957        __ipoib_reap_neigh(priv);
 958
 959        if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
 960                queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
 961                                   arp_tbl.gc_interval);
 962}
 963
 964
 965static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
 966                                      struct net_device *dev)
 967{
 968        struct ipoib_neigh *neigh;
 969
 970        neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
 971        if (!neigh)
 972                return NULL;
 973
 974        neigh->dev = dev;
 975        memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
 976        skb_queue_head_init(&neigh->queue);
 977        INIT_LIST_HEAD(&neigh->list);
 978        ipoib_cm_set(neigh, NULL);
 979        /* one ref on behalf of the caller */
 980        atomic_set(&neigh->refcnt, 1);
 981
 982        return neigh;
 983}
 984
 985struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
 986                                      struct net_device *dev)
 987{
 988        struct ipoib_dev_priv *priv = netdev_priv(dev);
 989        struct ipoib_neigh_table *ntbl = &priv->ntbl;
 990        struct ipoib_neigh_hash *htbl;
 991        struct ipoib_neigh *neigh;
 992        u32 hash_val;
 993
 994        htbl = rcu_dereference_protected(ntbl->htbl,
 995                                         lockdep_is_held(&priv->lock));
 996        if (!htbl) {
 997                neigh = NULL;
 998                goto out_unlock;
 999        }
1000
1001        /* need to add a new neigh, but maybe some other thread succeeded?
1002         * recalc hash, maybe hash resize took place so we do a search
1003         */
1004        hash_val = ipoib_addr_hash(htbl, daddr);
1005        for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
1006                                               lockdep_is_held(&priv->lock));
1007             neigh != NULL;
1008             neigh = rcu_dereference_protected(neigh->hnext,
1009                                               lockdep_is_held(&priv->lock))) {
1010                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1011                        /* found, take one ref on behalf of the caller */
1012                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
1013                                /* deleted */
1014                                neigh = NULL;
1015                                break;
1016                        }
1017                        neigh->alive = jiffies;
1018                        goto out_unlock;
1019                }
1020        }
1021
1022        neigh = ipoib_neigh_ctor(daddr, dev);
1023        if (!neigh)
1024                goto out_unlock;
1025
1026        /* one ref on behalf of the hash table */
1027        atomic_inc(&neigh->refcnt);
1028        neigh->alive = jiffies;
1029        /* put in hash */
1030        rcu_assign_pointer(neigh->hnext,
1031                           rcu_dereference_protected(htbl->buckets[hash_val],
1032                                                     lockdep_is_held(&priv->lock)));
1033        rcu_assign_pointer(htbl->buckets[hash_val], neigh);
1034        atomic_inc(&ntbl->entries);
1035
1036out_unlock:
1037
1038        return neigh;
1039}
1040
1041void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1042{
1043        /* neigh reference count was dropprd to zero */
1044        struct net_device *dev = neigh->dev;
1045        struct ipoib_dev_priv *priv = netdev_priv(dev);
1046        struct sk_buff *skb;
1047        if (neigh->ah)
1048                ipoib_put_ah(neigh->ah);
1049        while ((skb = __skb_dequeue(&neigh->queue))) {
1050                ++dev->stats.tx_dropped;
1051                dev_kfree_skb_any(skb);
1052        }
1053        if (ipoib_cm_get(neigh))
1054                ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1055        ipoib_dbg(netdev_priv(dev),
1056                  "neigh free for %06x %pI6\n",
1057                  IPOIB_QPN(neigh->daddr),
1058                  neigh->daddr + 4);
1059        kfree(neigh);
1060        if (atomic_dec_and_test(&priv->ntbl.entries)) {
1061                if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
1062                        complete(&priv->ntbl.flushed);
1063        }
1064}
1065
1066static void ipoib_neigh_reclaim(struct rcu_head *rp)
1067{
1068        /* Called as a result of removal from hash table */
1069        struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
1070        /* note TX context may hold another ref */
1071        ipoib_neigh_put(neigh);
1072}
1073
1074void ipoib_neigh_free(struct ipoib_neigh *neigh)
1075{
1076        struct net_device *dev = neigh->dev;
1077        struct ipoib_dev_priv *priv = netdev_priv(dev);
1078        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1079        struct ipoib_neigh_hash *htbl;
1080        struct ipoib_neigh __rcu **np;
1081        struct ipoib_neigh *n;
1082        u32 hash_val;
1083
1084        htbl = rcu_dereference_protected(ntbl->htbl,
1085                                        lockdep_is_held(&priv->lock));
1086        if (!htbl)
1087                return;
1088
1089        hash_val = ipoib_addr_hash(htbl, neigh->daddr);
1090        np = &htbl->buckets[hash_val];
1091        for (n = rcu_dereference_protected(*np,
1092                                            lockdep_is_held(&priv->lock));
1093             n != NULL;
1094             n = rcu_dereference_protected(*np,
1095                                        lockdep_is_held(&priv->lock))) {
1096                if (n == neigh) {
1097                        /* found */
1098                        rcu_assign_pointer(*np,
1099                                           rcu_dereference_protected(neigh->hnext,
1100                                                                     lockdep_is_held(&priv->lock)));
1101                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1102                        return;
1103                } else {
1104                        np = &n->hnext;
1105                }
1106        }
1107}
1108
1109static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1110{
1111        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1112        struct ipoib_neigh_hash *htbl;
1113        struct ipoib_neigh **buckets;
1114        u32 size;
1115
1116        clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1117        ntbl->htbl = NULL;
1118        htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
1119        if (!htbl)
1120                return -ENOMEM;
1121        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1122        size = roundup_pow_of_two(arp_tbl.gc_thresh3);
1123        buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
1124        if (!buckets) {
1125                kfree(htbl);
1126                return -ENOMEM;
1127        }
1128        htbl->size = size;
1129        htbl->mask = (size - 1);
1130        htbl->buckets = buckets;
1131        ntbl->htbl = htbl;
1132        htbl->ntbl = ntbl;
1133        atomic_set(&ntbl->entries, 0);
1134
1135        /* start garbage collection */
1136        clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1137        queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
1138                           arp_tbl.gc_interval);
1139
1140        return 0;
1141}
1142
1143static void neigh_hash_free_rcu(struct rcu_head *head)
1144{
1145        struct ipoib_neigh_hash *htbl = container_of(head,
1146                                                    struct ipoib_neigh_hash,
1147                                                    rcu);
1148        struct ipoib_neigh __rcu **buckets = htbl->buckets;
1149        struct ipoib_neigh_table *ntbl = htbl->ntbl;
1150
1151        kfree(buckets);
1152        kfree(htbl);
1153        complete(&ntbl->deleted);
1154}
1155
1156void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
1157{
1158        struct ipoib_dev_priv *priv = netdev_priv(dev);
1159        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1160        struct ipoib_neigh_hash *htbl;
1161        unsigned long flags;
1162        int i;
1163
1164        /* remove all neigh connected to a given path or mcast */
1165        spin_lock_irqsave(&priv->lock, flags);
1166
1167        htbl = rcu_dereference_protected(ntbl->htbl,
1168                                         lockdep_is_held(&priv->lock));
1169
1170        if (!htbl)
1171                goto out_unlock;
1172
1173        for (i = 0; i < htbl->size; i++) {
1174                struct ipoib_neigh *neigh;
1175                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1176
1177                while ((neigh = rcu_dereference_protected(*np,
1178                                                          lockdep_is_held(&priv->lock))) != NULL) {
1179                        /* delete neighs belong to this parent */
1180                        if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
1181                                rcu_assign_pointer(*np,
1182                                                   rcu_dereference_protected(neigh->hnext,
1183                                                                             lockdep_is_held(&priv->lock)));
1184                                /* remove from parent list */
1185                                list_del(&neigh->list);
1186                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1187                        } else {
1188                                np = &neigh->hnext;
1189                        }
1190
1191                }
1192        }
1193out_unlock:
1194        spin_unlock_irqrestore(&priv->lock, flags);
1195}
1196
1197static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
1198{
1199        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1200        struct ipoib_neigh_hash *htbl;
1201        unsigned long flags;
1202        int i, wait_flushed = 0;
1203
1204        init_completion(&priv->ntbl.flushed);
1205
1206        spin_lock_irqsave(&priv->lock, flags);
1207
1208        htbl = rcu_dereference_protected(ntbl->htbl,
1209                                        lockdep_is_held(&priv->lock));
1210        if (!htbl)
1211                goto out_unlock;
1212
1213        wait_flushed = atomic_read(&priv->ntbl.entries);
1214        if (!wait_flushed)
1215                goto free_htbl;
1216
1217        for (i = 0; i < htbl->size; i++) {
1218                struct ipoib_neigh *neigh;
1219                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1220
1221                while ((neigh = rcu_dereference_protected(*np,
1222                                       lockdep_is_held(&priv->lock))) != NULL) {
1223                        rcu_assign_pointer(*np,
1224                                           rcu_dereference_protected(neigh->hnext,
1225                                                                     lockdep_is_held(&priv->lock)));
1226                        /* remove from path/mc list */
1227                        list_del(&neigh->list);
1228                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1229                }
1230        }
1231
1232free_htbl:
1233        rcu_assign_pointer(ntbl->htbl, NULL);
1234        call_rcu(&htbl->rcu, neigh_hash_free_rcu);
1235
1236out_unlock:
1237        spin_unlock_irqrestore(&priv->lock, flags);
1238        if (wait_flushed)
1239                wait_for_completion(&priv->ntbl.flushed);
1240}
1241
1242static void ipoib_neigh_hash_uninit(struct net_device *dev)
1243{
1244        struct ipoib_dev_priv *priv = netdev_priv(dev);
1245        int stopped;
1246
1247        ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1248        init_completion(&priv->ntbl.deleted);
1249        set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1250
1251        /* Stop GC if called at init fail need to cancel work */
1252        stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1253        if (!stopped)
1254                cancel_delayed_work(&priv->neigh_reap_task);
1255
1256        ipoib_flush_neighs(priv);
1257
1258        wait_for_completion(&priv->ntbl.deleted);
1259}
1260
1261
1262int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
1263{
1264        struct ipoib_dev_priv *priv = netdev_priv(dev);
1265
1266        if (ipoib_neigh_hash_init(priv) < 0)
1267                goto out;
1268        /* Allocate RX/TX "rings" to hold queued skbs */
1269        priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
1270                                GFP_KERNEL);
1271        if (!priv->rx_ring) {
1272                printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
1273                       ca->name, ipoib_recvq_size);
1274                goto out_neigh_hash_cleanup;
1275        }
1276
1277        priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
1278        if (!priv->tx_ring) {
1279                printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
1280                       ca->name, ipoib_sendq_size);
1281                goto out_rx_ring_cleanup;
1282        }
1283
1284        /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
1285
1286        if (ipoib_ib_dev_init(dev, ca, port))
1287                goto out_tx_ring_cleanup;
1288
1289        return 0;
1290
1291out_tx_ring_cleanup:
1292        vfree(priv->tx_ring);
1293
1294out_rx_ring_cleanup:
1295        kfree(priv->rx_ring);
1296
1297out_neigh_hash_cleanup:
1298        ipoib_neigh_hash_uninit(dev);
1299out:
1300        return -ENOMEM;
1301}
1302
1303void ipoib_dev_cleanup(struct net_device *dev)
1304{
1305        struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
1306        LIST_HEAD(head);
1307
1308        ASSERT_RTNL();
1309
1310        ipoib_delete_debug_files(dev);
1311
1312        /* Delete any child interfaces first */
1313        list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1314                /* Stop GC on child */
1315                set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
1316                cancel_delayed_work(&cpriv->neigh_reap_task);
1317                unregister_netdevice_queue(cpriv->dev, &head);
1318        }
1319        unregister_netdevice_many(&head);
1320
1321        ipoib_ib_dev_cleanup(dev);
1322
1323        kfree(priv->rx_ring);
1324        vfree(priv->tx_ring);
1325
1326        priv->rx_ring = NULL;
1327        priv->tx_ring = NULL;
1328
1329        ipoib_neigh_hash_uninit(dev);
1330}
1331
1332static const struct header_ops ipoib_header_ops = {
1333        .create = ipoib_hard_header,
1334};
1335
1336static const struct net_device_ops ipoib_netdev_ops = {
1337        .ndo_uninit              = ipoib_uninit,
1338        .ndo_open                = ipoib_open,
1339        .ndo_stop                = ipoib_stop,
1340        .ndo_change_mtu          = ipoib_change_mtu,
1341        .ndo_fix_features        = ipoib_fix_features,
1342        .ndo_start_xmit          = ipoib_start_xmit,
1343        .ndo_tx_timeout          = ipoib_timeout,
1344        .ndo_set_rx_mode         = ipoib_set_mcast_list,
1345};
1346
1347void ipoib_setup(struct net_device *dev)
1348{
1349        struct ipoib_dev_priv *priv = netdev_priv(dev);
1350
1351        dev->netdev_ops          = &ipoib_netdev_ops;
1352        dev->header_ops          = &ipoib_header_ops;
1353
1354        ipoib_set_ethtool_ops(dev);
1355
1356        netif_napi_add(dev, &priv->napi, ipoib_poll, 100);
1357
1358        dev->watchdog_timeo      = HZ;
1359
1360        dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
1361
1362        dev->hard_header_len     = IPOIB_ENCAP_LEN;
1363        dev->addr_len            = INFINIBAND_ALEN;
1364        dev->type                = ARPHRD_INFINIBAND;
1365        dev->tx_queue_len        = ipoib_sendq_size * 2;
1366        dev->features            = (NETIF_F_VLAN_CHALLENGED     |
1367                                    NETIF_F_HIGHDMA);
1368        dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1369
1370        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
1371
1372        netif_carrier_off(dev);
1373
1374        priv->dev = dev;
1375
1376        spin_lock_init(&priv->lock);
1377
1378        mutex_init(&priv->vlan_mutex);
1379
1380        INIT_LIST_HEAD(&priv->path_list);
1381        INIT_LIST_HEAD(&priv->child_intfs);
1382        INIT_LIST_HEAD(&priv->dead_ahs);
1383        INIT_LIST_HEAD(&priv->multicast_list);
1384
1385        INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
1386        INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
1387        INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1388        INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
1389        INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
1390        INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
1391        INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
1392        INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1393        INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
1394}
1395
1396struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
1397{
1398        struct net_device *dev;
1399
1400        dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
1401                           ipoib_setup);
1402        if (!dev)
1403                return NULL;
1404
1405        return netdev_priv(dev);
1406}
1407
1408static ssize_t show_pkey(struct device *dev,
1409                         struct device_attribute *attr, char *buf)
1410{
1411        struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1412
1413        return sprintf(buf, "0x%04x\n", priv->pkey);
1414}
1415static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
1416
1417static ssize_t show_umcast(struct device *dev,
1418                           struct device_attribute *attr, char *buf)
1419{
1420        struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1421
1422        return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
1423}
1424
1425void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
1426{
1427        struct ipoib_dev_priv *priv = netdev_priv(ndev);
1428
1429        if (umcast_val > 0) {
1430                set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1431                ipoib_warn(priv, "ignoring multicast groups joined directly "
1432                                "by userspace\n");
1433        } else
1434                clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1435}
1436
1437static ssize_t set_umcast(struct device *dev,
1438                          struct device_attribute *attr,
1439                          const char *buf, size_t count)
1440{
1441        unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
1442
1443        ipoib_set_umcast(to_net_dev(dev), umcast_val);
1444
1445        return count;
1446}
1447static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
1448
1449int ipoib_add_umcast_attr(struct net_device *dev)
1450{
1451        return device_create_file(&dev->dev, &dev_attr_umcast);
1452}
1453
1454static ssize_t create_child(struct device *dev,
1455                            struct device_attribute *attr,
1456                            const char *buf, size_t count)
1457{
1458        int pkey;
1459        int ret;
1460
1461        if (sscanf(buf, "%i", &pkey) != 1)
1462                return -EINVAL;
1463
1464        if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
1465                return -EINVAL;
1466
1467        /*
1468         * Set the full membership bit, so that we join the right
1469         * broadcast group, etc.
1470         */
1471        pkey |= 0x8000;
1472
1473        ret = ipoib_vlan_add(to_net_dev(dev), pkey);
1474
1475        return ret ? ret : count;
1476}
1477static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
1478
1479static ssize_t delete_child(struct device *dev,
1480                            struct device_attribute *attr,
1481                            const char *buf, size_t count)
1482{
1483        int pkey;
1484        int ret;
1485
1486        if (sscanf(buf, "%i", &pkey) != 1)
1487                return -EINVAL;
1488
1489        if (pkey < 0 || pkey > 0xffff)
1490                return -EINVAL;
1491
1492        ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
1493
1494        return ret ? ret : count;
1495
1496}
1497static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
1498
1499int ipoib_add_pkey_attr(struct net_device *dev)
1500{
1501        return device_create_file(&dev->dev, &dev_attr_pkey);
1502}
1503
1504int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
1505{
1506        struct ib_device_attr *device_attr;
1507        int result = -ENOMEM;
1508
1509        device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
1510        if (!device_attr) {
1511                printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
1512                       hca->name, sizeof *device_attr);
1513                return result;
1514        }
1515
1516        result = ib_query_device(hca, device_attr);
1517        if (result) {
1518                printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
1519                       hca->name, result);
1520                kfree(device_attr);
1521                return result;
1522        }
1523        priv->hca_caps = device_attr->device_cap_flags;
1524
1525        kfree(device_attr);
1526
1527        if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
1528                priv->dev->hw_features = NETIF_F_SG |
1529                        NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
1530
1531                if (priv->hca_caps & IB_DEVICE_UD_TSO)
1532                        priv->dev->hw_features |= NETIF_F_TSO;
1533
1534                priv->dev->features |= priv->dev->hw_features;
1535        }
1536
1537        return 0;
1538}
1539
1540static struct net_device *ipoib_add_port(const char *format,
1541                                         struct ib_device *hca, u8 port)
1542{
1543        struct ipoib_dev_priv *priv;
1544        struct ib_port_attr attr;
1545        int result = -ENOMEM;
1546
1547        priv = ipoib_intf_alloc(format);
1548        if (!priv)
1549                goto alloc_mem_failed;
1550
1551        SET_NETDEV_DEV(priv->dev, hca->dma_device);
1552        priv->dev->dev_id = port - 1;
1553
1554        if (!ib_query_port(hca, port, &attr))
1555                priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
1556        else {
1557                printk(KERN_WARNING "%s: ib_query_port %d failed\n",
1558                       hca->name, port);
1559                goto device_init_failed;
1560        }
1561
1562        /* MTU will be reset when mcast join happens */
1563        priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
1564        priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
1565
1566        priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
1567
1568        result = ib_query_pkey(hca, port, 0, &priv->pkey);
1569        if (result) {
1570                printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
1571                       hca->name, port, result);
1572                goto device_init_failed;
1573        }
1574
1575        if (ipoib_set_dev_features(priv, hca))
1576                goto device_init_failed;
1577
1578        /*
1579         * Set the full membership bit, so that we join the right
1580         * broadcast group, etc.
1581         */
1582        priv->pkey |= 0x8000;
1583
1584        priv->dev->broadcast[8] = priv->pkey >> 8;
1585        priv->dev->broadcast[9] = priv->pkey & 0xff;
1586
1587        result = ib_query_gid(hca, port, 0, &priv->local_gid);
1588        if (result) {
1589                printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
1590                       hca->name, port, result);
1591                goto device_init_failed;
1592        } else
1593                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
1594
1595        result = ipoib_dev_init(priv->dev, hca, port);
1596        if (result < 0) {
1597                printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
1598                       hca->name, port, result);
1599                goto device_init_failed;
1600        }
1601
1602        INIT_IB_EVENT_HANDLER(&priv->event_handler,
1603                              priv->ca, ipoib_event);
1604        result = ib_register_event_handler(&priv->event_handler);
1605        if (result < 0) {
1606                printk(KERN_WARNING "%s: ib_register_event_handler failed for "
1607                       "port %d (ret = %d)\n",
1608                       hca->name, port, result);
1609                goto event_failed;
1610        }
1611
1612        result = register_netdev(priv->dev);
1613        if (result) {
1614                printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
1615                       hca->name, port, result);
1616                goto register_failed;
1617        }
1618
1619        ipoib_create_debug_files(priv->dev);
1620
1621        if (ipoib_cm_add_mode_attr(priv->dev))
1622                goto sysfs_failed;
1623        if (ipoib_add_pkey_attr(priv->dev))
1624                goto sysfs_failed;
1625        if (ipoib_add_umcast_attr(priv->dev))
1626                goto sysfs_failed;
1627        if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
1628                goto sysfs_failed;
1629        if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
1630                goto sysfs_failed;
1631
1632        return priv->dev;
1633
1634sysfs_failed:
1635        ipoib_delete_debug_files(priv->dev);
1636        unregister_netdev(priv->dev);
1637
1638register_failed:
1639        ib_unregister_event_handler(&priv->event_handler);
1640        /* Stop GC if started before flush */
1641        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1642        cancel_delayed_work(&priv->neigh_reap_task);
1643        flush_workqueue(ipoib_workqueue);
1644
1645event_failed:
1646        ipoib_dev_cleanup(priv->dev);
1647
1648device_init_failed:
1649        free_netdev(priv->dev);
1650
1651alloc_mem_failed:
1652        return ERR_PTR(result);
1653}
1654
1655static void ipoib_add_one(struct ib_device *device)
1656{
1657        struct list_head *dev_list;
1658        struct net_device *dev;
1659        struct ipoib_dev_priv *priv;
1660        int s, e, p;
1661
1662        if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1663                return;
1664
1665        dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
1666        if (!dev_list)
1667                return;
1668
1669        INIT_LIST_HEAD(dev_list);
1670
1671        if (device->node_type == RDMA_NODE_IB_SWITCH) {
1672                s = 0;
1673                e = 0;
1674        } else {
1675                s = 1;
1676                e = device->phys_port_cnt;
1677        }
1678
1679        for (p = s; p <= e; ++p) {
1680                if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
1681                        continue;
1682                dev = ipoib_add_port("ib%d", device, p);
1683                if (!IS_ERR(dev)) {
1684                        priv = netdev_priv(dev);
1685                        list_add_tail(&priv->list, dev_list);
1686                }
1687        }
1688
1689        ib_set_client_data(device, &ipoib_client, dev_list);
1690}
1691
1692static void ipoib_remove_one(struct ib_device *device)
1693{
1694        struct ipoib_dev_priv *priv, *tmp;
1695        struct list_head *dev_list;
1696
1697        if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
1698                return;
1699
1700        dev_list = ib_get_client_data(device, &ipoib_client);
1701        if (!dev_list)
1702                return;
1703
1704        list_for_each_entry_safe(priv, tmp, dev_list, list) {
1705                ib_unregister_event_handler(&priv->event_handler);
1706
1707                rtnl_lock();
1708                dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
1709                rtnl_unlock();
1710
1711                /* Stop GC */
1712                set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1713                cancel_delayed_work(&priv->neigh_reap_task);
1714                flush_workqueue(ipoib_workqueue);
1715
1716                unregister_netdev(priv->dev);
1717                free_netdev(priv->dev);
1718        }
1719
1720        kfree(dev_list);
1721}
1722
1723static int __init ipoib_init_module(void)
1724{
1725        int ret;
1726
1727        ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
1728        ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
1729        ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
1730
1731        ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
1732        ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
1733        ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
1734#ifdef CONFIG_INFINIBAND_IPOIB_CM
1735        ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
1736#endif
1737
1738        /*
1739         * When copying small received packets, we only copy from the
1740         * linear data part of the SKB, so we rely on this condition.
1741         */
1742        BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
1743
1744        ret = ipoib_register_debugfs();
1745        if (ret)
1746                return ret;
1747
1748        /*
1749         * We create our own workqueue mainly because we want to be
1750         * able to flush it when devices are being removed.  We can't
1751         * use schedule_work()/flush_scheduled_work() because both
1752         * unregister_netdev() and linkwatch_event take the rtnl lock,
1753         * so flush_scheduled_work() can deadlock during device
1754         * removal.
1755         */
1756        ipoib_workqueue = create_singlethread_workqueue("ipoib");
1757        if (!ipoib_workqueue) {
1758                ret = -ENOMEM;
1759                goto err_fs;
1760        }
1761
1762        ib_sa_register_client(&ipoib_sa_client);
1763
1764        ret = ib_register_client(&ipoib_client);
1765        if (ret)
1766                goto err_sa;
1767
1768        ret = ipoib_netlink_init();
1769        if (ret)
1770                goto err_client;
1771
1772        return 0;
1773
1774err_client:
1775        ib_unregister_client(&ipoib_client);
1776
1777err_sa:
1778        ib_sa_unregister_client(&ipoib_sa_client);
1779        destroy_workqueue(ipoib_workqueue);
1780
1781err_fs:
1782        ipoib_unregister_debugfs();
1783
1784        return ret;
1785}
1786
1787static void __exit ipoib_cleanup_module(void)
1788{
1789        ipoib_netlink_fini();
1790        ib_unregister_client(&ipoib_client);
1791        ib_sa_unregister_client(&ipoib_sa_client);
1792        ipoib_unregister_debugfs();
1793        destroy_workqueue(ipoib_workqueue);
1794}
1795
/* Register the module's load and unload entry points. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);
1798