linux/drivers/infiniband/ulp/ipoib/ipoib_main.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
   3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
   4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
   5 *
   6 * This software is available to you under a choice of one of two
   7 * licenses.  You may choose to be licensed under the terms of the GNU
   8 * General Public License (GPL) Version 2, available from the file
   9 * COPYING in the main directory of this source tree, or the
  10 * OpenIB.org BSD license below:
  11 *
  12 *     Redistribution and use in source and binary forms, with or
  13 *     without modification, are permitted provided that the following
  14 *     conditions are met:
  15 *
  16 *      - Redistributions of source code must retain the above
  17 *        copyright notice, this list of conditions and the following
  18 *        disclaimer.
  19 *
  20 *      - Redistributions in binary form must reproduce the above
  21 *        copyright notice, this list of conditions and the following
  22 *        disclaimer in the documentation and/or other materials
  23 *        provided with the distribution.
  24 *
  25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32 * SOFTWARE.
  33 */
  34
  35#include "ipoib.h"
  36
  37#include <linux/module.h>
  38
  39#include <linux/init.h>
  40#include <linux/slab.h>
  41#include <linux/kernel.h>
  42#include <linux/vmalloc.h>
  43
  44#include <linux/if_arp.h>       /* For ARPHRD_xxx */
  45
  46#include <linux/ip.h>
  47#include <linux/in.h>
  48
  49#include <linux/jhash.h>
  50#include <net/arp.h>
  51#include <net/addrconf.h>
  52#include <linux/inetdevice.h>
  53#include <rdma/ib_cache.h>
  54#include <linux/pci.h>
  55
  56#define DRV_VERSION "1.0.0"
  57
  58const char ipoib_driver_version[] = DRV_VERSION;
  59
  60MODULE_AUTHOR("Roland Dreier");
  61MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
  62MODULE_LICENSE("Dual BSD/GPL");
  63MODULE_VERSION(DRV_VERSION);
  64
  65int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
  66int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
  67
  68module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
  69MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
  70module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
  71MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
  72
  73#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  74int ipoib_debug_level;
  75
  76module_param_named(debug_level, ipoib_debug_level, int, 0644);
  77MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
  78#endif
  79
  80struct ipoib_path_iter {
  81        struct net_device *dev;
  82        struct ipoib_path  path;
  83};
  84
  85static const u8 ipv4_bcast_addr[] = {
  86        0x00, 0xff, 0xff, 0xff,
  87        0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  88        0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
  89};
  90
  91struct workqueue_struct *ipoib_workqueue;
  92
  93struct ib_sa_client ipoib_sa_client;
  94
  95static void ipoib_add_one(struct ib_device *device);
  96static void ipoib_remove_one(struct ib_device *device, void *client_data);
  97static void ipoib_neigh_reclaim(struct rcu_head *rp);
  98static struct net_device *ipoib_get_net_dev_by_params(
  99                struct ib_device *dev, u8 port, u16 pkey,
 100                const union ib_gid *gid, const struct sockaddr *addr,
 101                void *client_data);
 102
 103static struct ib_client ipoib_client = {
 104        .name   = "ipoib",
 105        .add    = ipoib_add_one,
 106        .remove = ipoib_remove_one,
 107        .get_net_dev_by_params = ipoib_get_net_dev_by_params,
 108};
 109
 110int ipoib_open(struct net_device *dev)
 111{
 112        struct ipoib_dev_priv *priv = netdev_priv(dev);
 113
 114        ipoib_dbg(priv, "bringing up interface\n");
 115
 116        netif_carrier_off(dev);
 117
 118        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 119
 120        if (ipoib_ib_dev_open(dev)) {
 121                if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
 122                        return 0;
 123                goto err_disable;
 124        }
 125
 126        if (ipoib_ib_dev_up(dev))
 127                goto err_stop;
 128
 129        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 130                struct ipoib_dev_priv *cpriv;
 131
 132                /* Bring up any child interfaces too */
 133                down_read(&priv->vlan_rwsem);
 134                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 135                        int flags;
 136
 137                        flags = cpriv->dev->flags;
 138                        if (flags & IFF_UP)
 139                                continue;
 140
 141                        dev_change_flags(cpriv->dev, flags | IFF_UP);
 142                }
 143                up_read(&priv->vlan_rwsem);
 144        }
 145
 146        netif_start_queue(dev);
 147
 148        return 0;
 149
 150err_stop:
 151        ipoib_ib_dev_stop(dev);
 152
 153err_disable:
 154        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 155
 156        return -EINVAL;
 157}
 158
 159static int ipoib_stop(struct net_device *dev)
 160{
 161        struct ipoib_dev_priv *priv = netdev_priv(dev);
 162
 163        ipoib_dbg(priv, "stopping interface\n");
 164
 165        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 166
 167        netif_stop_queue(dev);
 168
 169        ipoib_ib_dev_down(dev);
 170        ipoib_ib_dev_stop(dev);
 171
 172        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 173                struct ipoib_dev_priv *cpriv;
 174
 175                /* Bring down any child interfaces too */
 176                down_read(&priv->vlan_rwsem);
 177                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 178                        int flags;
 179
 180                        flags = cpriv->dev->flags;
 181                        if (!(flags & IFF_UP))
 182                                continue;
 183
 184                        dev_change_flags(cpriv->dev, flags & ~IFF_UP);
 185                }
 186                up_read(&priv->vlan_rwsem);
 187        }
 188
 189        return 0;
 190}
 191
 192static void ipoib_uninit(struct net_device *dev)
 193{
 194        ipoib_dev_cleanup(dev);
 195}
 196
 197static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
 198{
 199        struct ipoib_dev_priv *priv = netdev_priv(dev);
 200
 201        if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
 202                features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
 203
 204        return features;
 205}
 206
 207static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 208{
 209        struct ipoib_dev_priv *priv = netdev_priv(dev);
 210
 211        /* dev->mtu > 2K ==> connected mode */
 212        if (ipoib_cm_admin_enabled(dev)) {
 213                if (new_mtu > ipoib_cm_max_mtu(dev))
 214                        return -EINVAL;
 215
 216                if (new_mtu > priv->mcast_mtu)
 217                        ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 218                                   priv->mcast_mtu);
 219
 220                dev->mtu = new_mtu;
 221                return 0;
 222        }
 223
 224        if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 225                return -EINVAL;
 226
 227        priv->admin_mtu = new_mtu;
 228
 229        dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 230
 231        return 0;
 232}
 233
 234/* Called with an RCU read lock taken */
 235static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
 236                                        struct net_device *dev)
 237{
 238        struct net *net = dev_net(dev);
 239        struct in_device *in_dev;
 240        struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
 241        struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
 242        __be32 ret_addr;
 243
 244        switch (addr->sa_family) {
 245        case AF_INET:
 246                in_dev = in_dev_get(dev);
 247                if (!in_dev)
 248                        return false;
 249
 250                ret_addr = inet_confirm_addr(net, in_dev, 0,
 251                                             addr_in->sin_addr.s_addr,
 252                                             RT_SCOPE_HOST);
 253                in_dev_put(in_dev);
 254                if (ret_addr)
 255                        return true;
 256
 257                break;
 258        case AF_INET6:
 259                if (IS_ENABLED(CONFIG_IPV6) &&
 260                    ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
 261                        return true;
 262
 263                break;
 264        }
 265        return false;
 266}
 267
 268/**
 269 * Find the master net_device on top of the given net_device.
 270 * @dev: base IPoIB net_device
 271 *
 272 * Returns the master net_device with a reference held, or the same net_device
 273 * if no master exists.
 274 */
 275static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
 276{
 277        struct net_device *master;
 278
 279        rcu_read_lock();
 280        master = netdev_master_upper_dev_get_rcu(dev);
 281        if (master)
 282                dev_hold(master);
 283        rcu_read_unlock();
 284
 285        if (master)
 286                return master;
 287
 288        dev_hold(dev);
 289        return dev;
 290}
 291
 292/**
 293 * Find a net_device matching the given address, which is an upper device of
 294 * the given net_device.
 295 * @addr: IP address to look for.
 296 * @dev: base IPoIB net_device
 297 *
 298 * If found, returns the net_device with a reference held. Otherwise return
 299 * NULL.
 300 */
 301static struct net_device *ipoib_get_net_dev_match_addr(
 302                const struct sockaddr *addr, struct net_device *dev)
 303{
 304        struct net_device *upper,
 305                          *result = NULL;
 306        struct list_head *iter;
 307
 308        rcu_read_lock();
 309        if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
 310                dev_hold(dev);
 311                result = dev;
 312                goto out;
 313        }
 314
 315        netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
 316                if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
 317                        dev_hold(upper);
 318                        result = upper;
 319                        break;
 320                }
 321        }
 322out:
 323        rcu_read_unlock();
 324        return result;
 325}
 326
 327/* returns the number of IPoIB netdevs on top a given ipoib device matching a
 328 * pkey_index and address, if one exists.
 329 *
 330 * @found_net_dev: contains a matching net_device if the return value >= 1,
 331 * with a reference held. */
 332static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
 333                                     const union ib_gid *gid,
 334                                     u16 pkey_index,
 335                                     const struct sockaddr *addr,
 336                                     int nesting,
 337                                     struct net_device **found_net_dev)
 338{
 339        struct ipoib_dev_priv *child_priv;
 340        struct net_device *net_dev = NULL;
 341        int matches = 0;
 342
 343        if (priv->pkey_index == pkey_index &&
 344            (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
 345                if (!addr) {
 346                        net_dev = ipoib_get_master_net_dev(priv->dev);
 347                } else {
 348                        /* Verify the net_device matches the IP address, as
 349                         * IPoIB child devices currently share a GID. */
 350                        net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
 351                }
 352                if (net_dev) {
 353                        if (!*found_net_dev)
 354                                *found_net_dev = net_dev;
 355                        else
 356                                dev_put(net_dev);
 357                        ++matches;
 358                }
 359        }
 360
 361        /* Check child interfaces */
 362        down_read_nested(&priv->vlan_rwsem, nesting);
 363        list_for_each_entry(child_priv, &priv->child_intfs, list) {
 364                matches += ipoib_match_gid_pkey_addr(child_priv, gid,
 365                                                    pkey_index, addr,
 366                                                    nesting + 1,
 367                                                    found_net_dev);
 368                if (matches > 1)
 369                        break;
 370        }
 371        up_read(&priv->vlan_rwsem);
 372
 373        return matches;
 374}
 375
 376/* Returns the number of matching net_devs found (between 0 and 2). Also
 377 * return the matching net_device in the @net_dev parameter, holding a
 378 * reference to the net_device, if the number of matches >= 1 */
 379static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
 380                                         u16 pkey_index,
 381                                         const union ib_gid *gid,
 382                                         const struct sockaddr *addr,
 383                                         struct net_device **net_dev)
 384{
 385        struct ipoib_dev_priv *priv;
 386        int matches = 0;
 387
 388        *net_dev = NULL;
 389
 390        list_for_each_entry(priv, dev_list, list) {
 391                if (priv->port != port)
 392                        continue;
 393
 394                matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
 395                                                     addr, 0, net_dev);
 396                if (matches > 1)
 397                        break;
 398        }
 399
 400        return matches;
 401}
 402
 403static struct net_device *ipoib_get_net_dev_by_params(
 404                struct ib_device *dev, u8 port, u16 pkey,
 405                const union ib_gid *gid, const struct sockaddr *addr,
 406                void *client_data)
 407{
 408        struct net_device *net_dev;
 409        struct list_head *dev_list = client_data;
 410        u16 pkey_index;
 411        int matches;
 412        int ret;
 413
 414        if (!rdma_protocol_ib(dev, port))
 415                return NULL;
 416
 417        ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
 418        if (ret)
 419                return NULL;
 420
 421        if (!dev_list)
 422                return NULL;
 423
 424        /* See if we can find a unique device matching the L2 parameters */
 425        matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
 426                                                gid, NULL, &net_dev);
 427
 428        switch (matches) {
 429        case 0:
 430                return NULL;
 431        case 1:
 432                return net_dev;
 433        }
 434
 435        dev_put(net_dev);
 436
 437        /* Couldn't find a unique device with L2 parameters only. Use L3
 438         * address to uniquely match the net device */
 439        matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
 440                                                gid, addr, &net_dev);
 441        switch (matches) {
 442        case 0:
 443                return NULL;
 444        default:
 445                dev_warn_ratelimited(&dev->dev,
 446                                     "duplicate IP address detected\n");
 447                /* Fall through */
 448        case 1:
 449                return net_dev;
 450        }
 451}
 452
 453int ipoib_set_mode(struct net_device *dev, const char *buf)
 454{
 455        struct ipoib_dev_priv *priv = netdev_priv(dev);
 456
 457        /* flush paths if we switch modes so that connections are restarted */
 458        if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
 459                set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 460                ipoib_warn(priv, "enabling connected mode "
 461                           "will cause multicast packet drops\n");
 462                netdev_update_features(dev);
 463                dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
 464                rtnl_unlock();
 465                priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
 466
 467                ipoib_flush_paths(dev);
 468                rtnl_lock();
 469                return 0;
 470        }
 471
 472        if (!strcmp(buf, "datagram\n")) {
 473                clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 474                netdev_update_features(dev);
 475                dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
 476                rtnl_unlock();
 477                ipoib_flush_paths(dev);
 478                rtnl_lock();
 479                return 0;
 480        }
 481
 482        return -EINVAL;
 483}
 484
 485static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
 486{
 487        struct ipoib_dev_priv *priv = netdev_priv(dev);
 488        struct rb_node *n = priv->path_tree.rb_node;
 489        struct ipoib_path *path;
 490        int ret;
 491
 492        while (n) {
 493                path = rb_entry(n, struct ipoib_path, rb_node);
 494
 495                ret = memcmp(gid, path->pathrec.dgid.raw,
 496                             sizeof (union ib_gid));
 497
 498                if (ret < 0)
 499                        n = n->rb_left;
 500                else if (ret > 0)
 501                        n = n->rb_right;
 502                else
 503                        return path;
 504        }
 505
 506        return NULL;
 507}
 508
 509static int __path_add(struct net_device *dev, struct ipoib_path *path)
 510{
 511        struct ipoib_dev_priv *priv = netdev_priv(dev);
 512        struct rb_node **n = &priv->path_tree.rb_node;
 513        struct rb_node *pn = NULL;
 514        struct ipoib_path *tpath;
 515        int ret;
 516
 517        while (*n) {
 518                pn = *n;
 519                tpath = rb_entry(pn, struct ipoib_path, rb_node);
 520
 521                ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
 522                             sizeof (union ib_gid));
 523                if (ret < 0)
 524                        n = &pn->rb_left;
 525                else if (ret > 0)
 526                        n = &pn->rb_right;
 527                else
 528                        return -EEXIST;
 529        }
 530
 531        rb_link_node(&path->rb_node, pn, n);
 532        rb_insert_color(&path->rb_node, &priv->path_tree);
 533
 534        list_add_tail(&path->list, &priv->path_list);
 535
 536        return 0;
 537}
 538
 539static void path_free(struct net_device *dev, struct ipoib_path *path)
 540{
 541        struct sk_buff *skb;
 542
 543        while ((skb = __skb_dequeue(&path->queue)))
 544                dev_kfree_skb_irq(skb);
 545
 546        ipoib_dbg(netdev_priv(dev), "path_free\n");
 547
 548        /* remove all neigh connected to this path */
 549        ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
 550
 551        if (path->ah)
 552                ipoib_put_ah(path->ah);
 553
 554        kfree(path);
 555}
 556
 557#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 558
 559struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
 560{
 561        struct ipoib_path_iter *iter;
 562
 563        iter = kmalloc(sizeof *iter, GFP_KERNEL);
 564        if (!iter)
 565                return NULL;
 566
 567        iter->dev = dev;
 568        memset(iter->path.pathrec.dgid.raw, 0, 16);
 569
 570        if (ipoib_path_iter_next(iter)) {
 571                kfree(iter);
 572                return NULL;
 573        }
 574
 575        return iter;
 576}
 577
 578int ipoib_path_iter_next(struct ipoib_path_iter *iter)
 579{
 580        struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
 581        struct rb_node *n;
 582        struct ipoib_path *path;
 583        int ret = 1;
 584
 585        spin_lock_irq(&priv->lock);
 586
 587        n = rb_first(&priv->path_tree);
 588
 589        while (n) {
 590                path = rb_entry(n, struct ipoib_path, rb_node);
 591
 592                if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
 593                           sizeof (union ib_gid)) < 0) {
 594                        iter->path = *path;
 595                        ret = 0;
 596                        break;
 597                }
 598
 599                n = rb_next(n);
 600        }
 601
 602        spin_unlock_irq(&priv->lock);
 603
 604        return ret;
 605}
 606
 607void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 608                          struct ipoib_path *path)
 609{
 610        *path = iter->path;
 611}
 612
 613#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 614
 615void ipoib_mark_paths_invalid(struct net_device *dev)
 616{
 617        struct ipoib_dev_priv *priv = netdev_priv(dev);
 618        struct ipoib_path *path, *tp;
 619
 620        spin_lock_irq(&priv->lock);
 621
 622        list_for_each_entry_safe(path, tp, &priv->path_list, list) {
 623                ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
 624                        be16_to_cpu(path->pathrec.dlid),
 625                        path->pathrec.dgid.raw);
 626                path->valid =  0;
 627        }
 628
 629        spin_unlock_irq(&priv->lock);
 630}
 631
 632void ipoib_flush_paths(struct net_device *dev)
 633{
 634        struct ipoib_dev_priv *priv = netdev_priv(dev);
 635        struct ipoib_path *path, *tp;
 636        LIST_HEAD(remove_list);
 637        unsigned long flags;
 638
 639        netif_tx_lock_bh(dev);
 640        spin_lock_irqsave(&priv->lock, flags);
 641
 642        list_splice_init(&priv->path_list, &remove_list);
 643
 644        list_for_each_entry(path, &remove_list, list)
 645                rb_erase(&path->rb_node, &priv->path_tree);
 646
 647        list_for_each_entry_safe(path, tp, &remove_list, list) {
 648                if (path->query)
 649                        ib_sa_cancel_query(path->query_id, path->query);
 650                spin_unlock_irqrestore(&priv->lock, flags);
 651                netif_tx_unlock_bh(dev);
 652                wait_for_completion(&path->done);
 653                path_free(dev, path);
 654                netif_tx_lock_bh(dev);
 655                spin_lock_irqsave(&priv->lock, flags);
 656        }
 657
 658        spin_unlock_irqrestore(&priv->lock, flags);
 659        netif_tx_unlock_bh(dev);
 660}
 661
 662static void path_rec_completion(int status,
 663                                struct ib_sa_path_rec *pathrec,
 664                                void *path_ptr)
 665{
 666        struct ipoib_path *path = path_ptr;
 667        struct net_device *dev = path->dev;
 668        struct ipoib_dev_priv *priv = netdev_priv(dev);
 669        struct ipoib_ah *ah = NULL;
 670        struct ipoib_ah *old_ah = NULL;
 671        struct ipoib_neigh *neigh, *tn;
 672        struct sk_buff_head skqueue;
 673        struct sk_buff *skb;
 674        unsigned long flags;
 675
 676        if (!status)
 677                ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
 678                          be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
 679        else
 680                ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
 681                          status, path->pathrec.dgid.raw);
 682
 683        skb_queue_head_init(&skqueue);
 684
 685        if (!status) {
 686                struct ib_ah_attr av;
 687
 688                if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
 689                        ah = ipoib_create_ah(dev, priv->pd, &av);
 690        }
 691
 692        spin_lock_irqsave(&priv->lock, flags);
 693
 694        if (!IS_ERR_OR_NULL(ah)) {
 695                path->pathrec = *pathrec;
 696
 697                old_ah   = path->ah;
 698                path->ah = ah;
 699
 700                ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
 701                          ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
 702
 703                while ((skb = __skb_dequeue(&path->queue)))
 704                        __skb_queue_tail(&skqueue, skb);
 705
 706                list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
 707                        if (neigh->ah) {
 708                                WARN_ON(neigh->ah != old_ah);
 709                                /*
 710                                 * Dropping the ah reference inside
 711                                 * priv->lock is safe here, because we
 712                                 * will hold one more reference from
 713                                 * the original value of path->ah (ie
 714                                 * old_ah).
 715                                 */
 716                                ipoib_put_ah(neigh->ah);
 717                        }
 718                        kref_get(&path->ah->ref);
 719                        neigh->ah = path->ah;
 720
 721                        if (ipoib_cm_enabled(dev, neigh->daddr)) {
 722                                if (!ipoib_cm_get(neigh))
 723                                        ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
 724                                                                               path,
 725                                                                               neigh));
 726                                if (!ipoib_cm_get(neigh)) {
 727                                        ipoib_neigh_free(neigh);
 728                                        continue;
 729                                }
 730                        }
 731
 732                        while ((skb = __skb_dequeue(&neigh->queue)))
 733                                __skb_queue_tail(&skqueue, skb);
 734                }
 735                path->valid = 1;
 736        }
 737
 738        path->query = NULL;
 739        complete(&path->done);
 740
 741        spin_unlock_irqrestore(&priv->lock, flags);
 742
 743        if (IS_ERR_OR_NULL(ah))
 744                ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
 745
 746        if (old_ah)
 747                ipoib_put_ah(old_ah);
 748
 749        while ((skb = __skb_dequeue(&skqueue))) {
 750                skb->dev = dev;
 751                if (dev_queue_xmit(skb))
 752                        ipoib_warn(priv, "dev_queue_xmit failed "
 753                                   "to requeue packet\n");
 754        }
 755}
 756
 757static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 758{
 759        struct ipoib_dev_priv *priv = netdev_priv(dev);
 760        struct ipoib_path *path;
 761
 762        if (!priv->broadcast)
 763                return NULL;
 764
 765        path = kzalloc(sizeof *path, GFP_ATOMIC);
 766        if (!path)
 767                return NULL;
 768
 769        path->dev = dev;
 770
 771        skb_queue_head_init(&path->queue);
 772
 773        INIT_LIST_HEAD(&path->neigh_list);
 774
 775        memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
 776        path->pathrec.sgid          = priv->local_gid;
 777        path->pathrec.pkey          = cpu_to_be16(priv->pkey);
 778        path->pathrec.numb_path     = 1;
 779        path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 780
 781        return path;
 782}
 783
 784static int path_rec_start(struct net_device *dev,
 785                          struct ipoib_path *path)
 786{
 787        struct ipoib_dev_priv *priv = netdev_priv(dev);
 788
 789        ipoib_dbg(priv, "Start path record lookup for %pI6\n",
 790                  path->pathrec.dgid.raw);
 791
 792        init_completion(&path->done);
 793
 794        path->query_id =
 795                ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
 796                                   &path->pathrec,
 797                                   IB_SA_PATH_REC_DGID          |
 798                                   IB_SA_PATH_REC_SGID          |
 799                                   IB_SA_PATH_REC_NUMB_PATH     |
 800                                   IB_SA_PATH_REC_TRAFFIC_CLASS |
 801                                   IB_SA_PATH_REC_PKEY,
 802                                   1000, GFP_ATOMIC,
 803                                   path_rec_completion,
 804                                   path, &path->query);
 805        if (path->query_id < 0) {
 806                ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
 807                path->query = NULL;
 808                complete(&path->done);
 809                return path->query_id;
 810        }
 811
 812        return 0;
 813}
 814
 815static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 816                           struct net_device *dev)
 817{
 818        struct ipoib_dev_priv *priv = netdev_priv(dev);
 819        struct ipoib_path *path;
 820        struct ipoib_neigh *neigh;
 821        unsigned long flags;
 822
 823        spin_lock_irqsave(&priv->lock, flags);
 824        neigh = ipoib_neigh_alloc(daddr, dev);
 825        if (!neigh) {
 826                spin_unlock_irqrestore(&priv->lock, flags);
 827                ++dev->stats.tx_dropped;
 828                dev_kfree_skb_any(skb);
 829                return;
 830        }
 831
 832        path = __path_find(dev, daddr + 4);
 833        if (!path) {
 834                path = path_rec_create(dev, daddr + 4);
 835                if (!path)
 836                        goto err_path;
 837
 838                __path_add(dev, path);
 839        }
 840
 841        list_add_tail(&neigh->list, &path->neigh_list);
 842
 843        if (path->ah) {
 844                kref_get(&path->ah->ref);
 845                neigh->ah = path->ah;
 846
 847                if (ipoib_cm_enabled(dev, neigh->daddr)) {
 848                        if (!ipoib_cm_get(neigh))
 849                                ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
 850                        if (!ipoib_cm_get(neigh)) {
 851                                ipoib_neigh_free(neigh);
 852                                goto err_drop;
 853                        }
 854                        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
 855                                __skb_queue_tail(&neigh->queue, skb);
 856                        else {
 857                                ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
 858                                           skb_queue_len(&neigh->queue));
 859                                goto err_drop;
 860                        }
 861                } else {
 862                        spin_unlock_irqrestore(&priv->lock, flags);
 863                        ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
 864                        ipoib_neigh_put(neigh);
 865                        return;
 866                }
 867        } else {
 868                neigh->ah  = NULL;
 869
 870                if (!path->query && path_rec_start(dev, path))
 871                        goto err_path;
 872                if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
 873                        __skb_queue_tail(&neigh->queue, skb);
 874                else
 875                        goto err_drop;
 876        }
 877
 878        spin_unlock_irqrestore(&priv->lock, flags);
 879        ipoib_neigh_put(neigh);
 880        return;
 881
 882err_path:
 883        ipoib_neigh_free(neigh);
 884err_drop:
 885        ++dev->stats.tx_dropped;
 886        dev_kfree_skb_any(skb);
 887
 888        spin_unlock_irqrestore(&priv->lock, flags);
 889        ipoib_neigh_put(neigh);
 890}
 891
 892static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 893                             struct ipoib_cb *cb)
 894{
 895        struct ipoib_dev_priv *priv = netdev_priv(dev);
 896        struct ipoib_path *path;
 897        unsigned long flags;
 898
 899        spin_lock_irqsave(&priv->lock, flags);
 900
 901        path = __path_find(dev, cb->hwaddr + 4);
 902        if (!path || !path->valid) {
 903                int new_path = 0;
 904
 905                if (!path) {
 906                        path = path_rec_create(dev, cb->hwaddr + 4);
 907                        new_path = 1;
 908                }
 909                if (path) {
 910                        if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 911                                __skb_queue_tail(&path->queue, skb);
 912                        } else {
 913                                ++dev->stats.tx_dropped;
 914                                dev_kfree_skb_any(skb);
 915                        }
 916
 917                        if (!path->query && path_rec_start(dev, path)) {
 918                                spin_unlock_irqrestore(&priv->lock, flags);
 919                                if (new_path)
 920                                        path_free(dev, path);
 921                                return;
 922                        } else
 923                                __path_add(dev, path);
 924                } else {
 925                        ++dev->stats.tx_dropped;
 926                        dev_kfree_skb_any(skb);
 927                }
 928
 929                spin_unlock_irqrestore(&priv->lock, flags);
 930                return;
 931        }
 932
 933        if (path->ah) {
 934                ipoib_dbg(priv, "Send unicast ARP to %04x\n",
 935                          be16_to_cpu(path->pathrec.dlid));
 936
 937                spin_unlock_irqrestore(&priv->lock, flags);
 938                ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
 939                return;
 940        } else if ((path->query || !path_rec_start(dev, path)) &&
 941                   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 942                __skb_queue_tail(&path->queue, skb);
 943        } else {
 944                ++dev->stats.tx_dropped;
 945                dev_kfree_skb_any(skb);
 946        }
 947
 948        spin_unlock_irqrestore(&priv->lock, flags);
 949}
 950
 951static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 952{
 953        struct ipoib_dev_priv *priv = netdev_priv(dev);
 954        struct ipoib_neigh *neigh;
 955        struct ipoib_cb *cb = ipoib_skb_cb(skb);
 956        struct ipoib_header *header;
 957        unsigned long flags;
 958
 959        header = (struct ipoib_header *) skb->data;
 960
 961        if (unlikely(cb->hwaddr[4] == 0xff)) {
 962                /* multicast, arrange "if" according to probability */
 963                if ((header->proto != htons(ETH_P_IP)) &&
 964                    (header->proto != htons(ETH_P_IPV6)) &&
 965                    (header->proto != htons(ETH_P_ARP)) &&
 966                    (header->proto != htons(ETH_P_RARP)) &&
 967                    (header->proto != htons(ETH_P_TIPC))) {
 968                        /* ethertype not supported by IPoIB */
 969                        ++dev->stats.tx_dropped;
 970                        dev_kfree_skb_any(skb);
 971                        return NETDEV_TX_OK;
 972                }
 973                /* Add in the P_Key for multicast*/
 974                cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
 975                cb->hwaddr[9] = priv->pkey & 0xff;
 976
 977                neigh = ipoib_neigh_get(dev, cb->hwaddr);
 978                if (likely(neigh))
 979                        goto send_using_neigh;
 980                ipoib_mcast_send(dev, cb->hwaddr, skb);
 981                return NETDEV_TX_OK;
 982        }
 983
 984        /* unicast, arrange "switch" according to probability */
 985        switch (header->proto) {
 986        case htons(ETH_P_IP):
 987        case htons(ETH_P_IPV6):
 988        case htons(ETH_P_TIPC):
 989                neigh = ipoib_neigh_get(dev, cb->hwaddr);
 990                if (unlikely(!neigh)) {
 991                        neigh_add_path(skb, cb->hwaddr, dev);
 992                        return NETDEV_TX_OK;
 993                }
 994                break;
 995        case htons(ETH_P_ARP):
 996        case htons(ETH_P_RARP):
 997                /* for unicast ARP and RARP should always perform path find */
 998                unicast_arp_send(skb, dev, cb);
 999                return NETDEV_TX_OK;
1000        default:
1001                /* ethertype not supported by IPoIB */
1002                ++dev->stats.tx_dropped;
1003                dev_kfree_skb_any(skb);
1004                return NETDEV_TX_OK;
1005        }
1006
1007send_using_neigh:
1008        /* note we now hold a ref to neigh */
1009        if (ipoib_cm_get(neigh)) {
1010                if (ipoib_cm_up(neigh)) {
1011                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
1012                        goto unref;
1013                }
1014        } else if (neigh->ah) {
1015                ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
1016                goto unref;
1017        }
1018
1019        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1020                spin_lock_irqsave(&priv->lock, flags);
1021                __skb_queue_tail(&neigh->queue, skb);
1022                spin_unlock_irqrestore(&priv->lock, flags);
1023        } else {
1024                ++dev->stats.tx_dropped;
1025                dev_kfree_skb_any(skb);
1026        }
1027
1028unref:
1029        ipoib_neigh_put(neigh);
1030
1031        return NETDEV_TX_OK;
1032}
1033
1034static void ipoib_timeout(struct net_device *dev)
1035{
1036        struct ipoib_dev_priv *priv = netdev_priv(dev);
1037
1038        ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
1039                   jiffies_to_msecs(jiffies - dev->trans_start));
1040        ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
1041                   netif_queue_stopped(dev),
1042                   priv->tx_head, priv->tx_tail);
1043        /* XXX reset QP, etc. */
1044}
1045
1046static int ipoib_hard_header(struct sk_buff *skb,
1047                             struct net_device *dev,
1048                             unsigned short type,
1049                             const void *daddr, const void *saddr, unsigned len)
1050{
1051        struct ipoib_header *header;
1052        struct ipoib_cb *cb = ipoib_skb_cb(skb);
1053
1054        header = (struct ipoib_header *) skb_push(skb, sizeof *header);
1055
1056        header->proto = htons(type);
1057        header->reserved = 0;
1058
1059        /*
1060         * we don't rely on dst_entry structure,  always stuff the
1061         * destination address into skb->cb so we can figure out where
1062         * to send the packet later.
1063         */
1064        memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
1065
1066        return sizeof *header;
1067}
1068
1069static void ipoib_set_mcast_list(struct net_device *dev)
1070{
1071        struct ipoib_dev_priv *priv = netdev_priv(dev);
1072
1073        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
1074                ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
1075                return;
1076        }
1077
1078        queue_work(priv->wq, &priv->restart_task);
1079}
1080
1081static int ipoib_get_iflink(const struct net_device *dev)
1082{
1083        struct ipoib_dev_priv *priv = netdev_priv(dev);
1084
1085        /* parent interface */
1086        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
1087                return dev->ifindex;
1088
1089        /* child/vlan interface */
1090        return priv->parent->ifindex;
1091}
1092
1093static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
1094{
1095        /*
1096         * Use only the address parts that contributes to spreading
1097         * The subnet prefix is not used as one can not connect to
1098         * same remote port (GUID) using the same remote QPN via two
1099         * different subnets.
1100         */
1101         /* qpn octets[1:4) & port GUID octets[12:20) */
1102        u32 *d32 = (u32 *) daddr;
1103        u32 hv;
1104
1105        hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
1106        return hv & htbl->mask;
1107}
1108
1109struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
1110{
1111        struct ipoib_dev_priv *priv = netdev_priv(dev);
1112        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1113        struct ipoib_neigh_hash *htbl;
1114        struct ipoib_neigh *neigh = NULL;
1115        u32 hash_val;
1116
1117        rcu_read_lock_bh();
1118
1119        htbl = rcu_dereference_bh(ntbl->htbl);
1120
1121        if (!htbl)
1122                goto out_unlock;
1123
1124        hash_val = ipoib_addr_hash(htbl, daddr);
1125        for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
1126             neigh != NULL;
1127             neigh = rcu_dereference_bh(neigh->hnext)) {
1128                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1129                        /* found, take one ref on behalf of the caller */
1130                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
1131                                /* deleted */
1132                                neigh = NULL;
1133                                goto out_unlock;
1134                        }
1135                        neigh->alive = jiffies;
1136                        goto out_unlock;
1137                }
1138        }
1139
1140out_unlock:
1141        rcu_read_unlock_bh();
1142        return neigh;
1143}
1144
1145static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
1146{
1147        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1148        struct ipoib_neigh_hash *htbl;
1149        unsigned long neigh_obsolete;
1150        unsigned long dt;
1151        unsigned long flags;
1152        int i;
1153        LIST_HEAD(remove_list);
1154
1155        if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1156                return;
1157
1158        spin_lock_irqsave(&priv->lock, flags);
1159
1160        htbl = rcu_dereference_protected(ntbl->htbl,
1161                                         lockdep_is_held(&priv->lock));
1162
1163        if (!htbl)
1164                goto out_unlock;
1165
1166        /* neigh is obsolete if it was idle for two GC periods */
1167        dt = 2 * arp_tbl.gc_interval;
1168        neigh_obsolete = jiffies - dt;
1169        /* handle possible race condition */
1170        if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1171                goto out_unlock;
1172
1173        for (i = 0; i < htbl->size; i++) {
1174                struct ipoib_neigh *neigh;
1175                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1176
1177                while ((neigh = rcu_dereference_protected(*np,
1178                                                          lockdep_is_held(&priv->lock))) != NULL) {
1179                        /* was the neigh idle for two GC periods */
1180                        if (time_after(neigh_obsolete, neigh->alive)) {
1181
1182                                ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
1183
1184                                rcu_assign_pointer(*np,
1185                                                   rcu_dereference_protected(neigh->hnext,
1186                                                                             lockdep_is_held(&priv->lock)));
1187                                /* remove from path/mc list */
1188                                list_del(&neigh->list);
1189                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1190                        } else {
1191                                np = &neigh->hnext;
1192                        }
1193
1194                }
1195        }
1196
1197out_unlock:
1198        spin_unlock_irqrestore(&priv->lock, flags);
1199        ipoib_mcast_remove_list(&remove_list);
1200}
1201
1202static void ipoib_reap_neigh(struct work_struct *work)
1203{
1204        struct ipoib_dev_priv *priv =
1205                container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
1206
1207        __ipoib_reap_neigh(priv);
1208
1209        if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1210                queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1211                                   arp_tbl.gc_interval);
1212}
1213
1214
1215static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
1216                                      struct net_device *dev)
1217{
1218        struct ipoib_neigh *neigh;
1219
1220        neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
1221        if (!neigh)
1222                return NULL;
1223
1224        neigh->dev = dev;
1225        memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
1226        skb_queue_head_init(&neigh->queue);
1227        INIT_LIST_HEAD(&neigh->list);
1228        ipoib_cm_set(neigh, NULL);
1229        /* one ref on behalf of the caller */
1230        atomic_set(&neigh->refcnt, 1);
1231
1232        return neigh;
1233}
1234
1235struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
1236                                      struct net_device *dev)
1237{
1238        struct ipoib_dev_priv *priv = netdev_priv(dev);
1239        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1240        struct ipoib_neigh_hash *htbl;
1241        struct ipoib_neigh *neigh;
1242        u32 hash_val;
1243
1244        htbl = rcu_dereference_protected(ntbl->htbl,
1245                                         lockdep_is_held(&priv->lock));
1246        if (!htbl) {
1247                neigh = NULL;
1248                goto out_unlock;
1249        }
1250
1251        /* need to add a new neigh, but maybe some other thread succeeded?
1252         * recalc hash, maybe hash resize took place so we do a search
1253         */
1254        hash_val = ipoib_addr_hash(htbl, daddr);
1255        for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
1256                                               lockdep_is_held(&priv->lock));
1257             neigh != NULL;
1258             neigh = rcu_dereference_protected(neigh->hnext,
1259                                               lockdep_is_held(&priv->lock))) {
1260                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1261                        /* found, take one ref on behalf of the caller */
1262                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
1263                                /* deleted */
1264                                neigh = NULL;
1265                                break;
1266                        }
1267                        neigh->alive = jiffies;
1268                        goto out_unlock;
1269                }
1270        }
1271
1272        neigh = ipoib_neigh_ctor(daddr, dev);
1273        if (!neigh)
1274                goto out_unlock;
1275
1276        /* one ref on behalf of the hash table */
1277        atomic_inc(&neigh->refcnt);
1278        neigh->alive = jiffies;
1279        /* put in hash */
1280        rcu_assign_pointer(neigh->hnext,
1281                           rcu_dereference_protected(htbl->buckets[hash_val],
1282                                                     lockdep_is_held(&priv->lock)));
1283        rcu_assign_pointer(htbl->buckets[hash_val], neigh);
1284        atomic_inc(&ntbl->entries);
1285
1286out_unlock:
1287
1288        return neigh;
1289}
1290
1291void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1292{
1293        /* neigh reference count was dropprd to zero */
1294        struct net_device *dev = neigh->dev;
1295        struct ipoib_dev_priv *priv = netdev_priv(dev);
1296        struct sk_buff *skb;
1297        if (neigh->ah)
1298                ipoib_put_ah(neigh->ah);
1299        while ((skb = __skb_dequeue(&neigh->queue))) {
1300                ++dev->stats.tx_dropped;
1301                dev_kfree_skb_any(skb);
1302        }
1303        if (ipoib_cm_get(neigh))
1304                ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1305        ipoib_dbg(netdev_priv(dev),
1306                  "neigh free for %06x %pI6\n",
1307                  IPOIB_QPN(neigh->daddr),
1308                  neigh->daddr + 4);
1309        kfree(neigh);
1310        if (atomic_dec_and_test(&priv->ntbl.entries)) {
1311                if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
1312                        complete(&priv->ntbl.flushed);
1313        }
1314}
1315
1316static void ipoib_neigh_reclaim(struct rcu_head *rp)
1317{
1318        /* Called as a result of removal from hash table */
1319        struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
1320        /* note TX context may hold another ref */
1321        ipoib_neigh_put(neigh);
1322}
1323
1324void ipoib_neigh_free(struct ipoib_neigh *neigh)
1325{
1326        struct net_device *dev = neigh->dev;
1327        struct ipoib_dev_priv *priv = netdev_priv(dev);
1328        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1329        struct ipoib_neigh_hash *htbl;
1330        struct ipoib_neigh __rcu **np;
1331        struct ipoib_neigh *n;
1332        u32 hash_val;
1333
1334        htbl = rcu_dereference_protected(ntbl->htbl,
1335                                        lockdep_is_held(&priv->lock));
1336        if (!htbl)
1337                return;
1338
1339        hash_val = ipoib_addr_hash(htbl, neigh->daddr);
1340        np = &htbl->buckets[hash_val];
1341        for (n = rcu_dereference_protected(*np,
1342                                            lockdep_is_held(&priv->lock));
1343             n != NULL;
1344             n = rcu_dereference_protected(*np,
1345                                        lockdep_is_held(&priv->lock))) {
1346                if (n == neigh) {
1347                        /* found */
1348                        rcu_assign_pointer(*np,
1349                                           rcu_dereference_protected(neigh->hnext,
1350                                                                     lockdep_is_held(&priv->lock)));
1351                        /* remove from parent list */
1352                        list_del(&neigh->list);
1353                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1354                        return;
1355                } else {
1356                        np = &n->hnext;
1357                }
1358        }
1359}
1360
1361static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1362{
1363        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1364        struct ipoib_neigh_hash *htbl;
1365        struct ipoib_neigh __rcu **buckets;
1366        u32 size;
1367
1368        clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1369        ntbl->htbl = NULL;
1370        htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
1371        if (!htbl)
1372                return -ENOMEM;
1373        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1374        size = roundup_pow_of_two(arp_tbl.gc_thresh3);
1375        buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
1376        if (!buckets) {
1377                kfree(htbl);
1378                return -ENOMEM;
1379        }
1380        htbl->size = size;
1381        htbl->mask = (size - 1);
1382        htbl->buckets = buckets;
1383        RCU_INIT_POINTER(ntbl->htbl, htbl);
1384        htbl->ntbl = ntbl;
1385        atomic_set(&ntbl->entries, 0);
1386
1387        /* start garbage collection */
1388        clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1389        queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1390                           arp_tbl.gc_interval);
1391
1392        return 0;
1393}
1394
1395static void neigh_hash_free_rcu(struct rcu_head *head)
1396{
1397        struct ipoib_neigh_hash *htbl = container_of(head,
1398                                                    struct ipoib_neigh_hash,
1399                                                    rcu);
1400        struct ipoib_neigh __rcu **buckets = htbl->buckets;
1401        struct ipoib_neigh_table *ntbl = htbl->ntbl;
1402
1403        kfree(buckets);
1404        kfree(htbl);
1405        complete(&ntbl->deleted);
1406}
1407
1408void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
1409{
1410        struct ipoib_dev_priv *priv = netdev_priv(dev);
1411        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1412        struct ipoib_neigh_hash *htbl;
1413        unsigned long flags;
1414        int i;
1415
1416        /* remove all neigh connected to a given path or mcast */
1417        spin_lock_irqsave(&priv->lock, flags);
1418
1419        htbl = rcu_dereference_protected(ntbl->htbl,
1420                                         lockdep_is_held(&priv->lock));
1421
1422        if (!htbl)
1423                goto out_unlock;
1424
1425        for (i = 0; i < htbl->size; i++) {
1426                struct ipoib_neigh *neigh;
1427                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1428
1429                while ((neigh = rcu_dereference_protected(*np,
1430                                                          lockdep_is_held(&priv->lock))) != NULL) {
1431                        /* delete neighs belong to this parent */
1432                        if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
1433                                rcu_assign_pointer(*np,
1434                                                   rcu_dereference_protected(neigh->hnext,
1435                                                                             lockdep_is_held(&priv->lock)));
1436                                /* remove from parent list */
1437                                list_del(&neigh->list);
1438                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1439                        } else {
1440                                np = &neigh->hnext;
1441                        }
1442
1443                }
1444        }
1445out_unlock:
1446        spin_unlock_irqrestore(&priv->lock, flags);
1447}
1448
1449static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
1450{
1451        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1452        struct ipoib_neigh_hash *htbl;
1453        unsigned long flags;
1454        int i, wait_flushed = 0;
1455
1456        init_completion(&priv->ntbl.flushed);
1457
1458        spin_lock_irqsave(&priv->lock, flags);
1459
1460        htbl = rcu_dereference_protected(ntbl->htbl,
1461                                        lockdep_is_held(&priv->lock));
1462        if (!htbl)
1463                goto out_unlock;
1464
1465        wait_flushed = atomic_read(&priv->ntbl.entries);
1466        if (!wait_flushed)
1467                goto free_htbl;
1468
1469        for (i = 0; i < htbl->size; i++) {
1470                struct ipoib_neigh *neigh;
1471                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1472
1473                while ((neigh = rcu_dereference_protected(*np,
1474                                       lockdep_is_held(&priv->lock))) != NULL) {
1475                        rcu_assign_pointer(*np,
1476                                           rcu_dereference_protected(neigh->hnext,
1477                                                                     lockdep_is_held(&priv->lock)));
1478                        /* remove from path/mc list */
1479                        list_del(&neigh->list);
1480                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1481                }
1482        }
1483
1484free_htbl:
1485        rcu_assign_pointer(ntbl->htbl, NULL);
1486        call_rcu(&htbl->rcu, neigh_hash_free_rcu);
1487
1488out_unlock:
1489        spin_unlock_irqrestore(&priv->lock, flags);
1490        if (wait_flushed)
1491                wait_for_completion(&priv->ntbl.flushed);
1492}
1493
1494static void ipoib_neigh_hash_uninit(struct net_device *dev)
1495{
1496        struct ipoib_dev_priv *priv = netdev_priv(dev);
1497        int stopped;
1498
1499        ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1500        init_completion(&priv->ntbl.deleted);
1501        set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1502
1503        /* Stop GC if called at init fail need to cancel work */
1504        stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1505        if (!stopped)
1506                cancel_delayed_work(&priv->neigh_reap_task);
1507
1508        ipoib_flush_neighs(priv);
1509
1510        wait_for_completion(&priv->ntbl.deleted);
1511}
1512
1513
1514int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
1515{
1516        struct ipoib_dev_priv *priv = netdev_priv(dev);
1517
1518        /* Allocate RX/TX "rings" to hold queued skbs */
1519        priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
1520                                GFP_KERNEL);
1521        if (!priv->rx_ring) {
1522                printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
1523                       ca->name, ipoib_recvq_size);
1524                goto out;
1525        }
1526
1527        priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
1528        if (!priv->tx_ring) {
1529                printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
1530                       ca->name, ipoib_sendq_size);
1531                goto out_rx_ring_cleanup;
1532        }
1533
1534        /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
1535
1536        if (ipoib_ib_dev_init(dev, ca, port))
1537                goto out_tx_ring_cleanup;
1538
1539        /*
1540         * Must be after ipoib_ib_dev_init so we can allocate a per
1541         * device wq there and use it here
1542         */
1543        if (ipoib_neigh_hash_init(priv) < 0)
1544                goto out_dev_uninit;
1545
1546        return 0;
1547
1548out_dev_uninit:
1549        ipoib_ib_dev_cleanup(dev);
1550
1551out_tx_ring_cleanup:
1552        vfree(priv->tx_ring);
1553
1554out_rx_ring_cleanup:
1555        kfree(priv->rx_ring);
1556
1557out:
1558        return -ENOMEM;
1559}
1560
1561void ipoib_dev_cleanup(struct net_device *dev)
1562{
1563        struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
1564        LIST_HEAD(head);
1565
1566        ASSERT_RTNL();
1567
1568        ipoib_delete_debug_files(dev);
1569
1570        /* Delete any child interfaces first */
1571        list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1572                /* Stop GC on child */
1573                set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
1574                cancel_delayed_work(&cpriv->neigh_reap_task);
1575                unregister_netdevice_queue(cpriv->dev, &head);
1576        }
1577        unregister_netdevice_many(&head);
1578
1579        /*
1580         * Must be before ipoib_ib_dev_cleanup or we delete an in use
1581         * work queue
1582         */
1583        ipoib_neigh_hash_uninit(dev);
1584
1585        ipoib_ib_dev_cleanup(dev);
1586
1587        kfree(priv->rx_ring);
1588        vfree(priv->tx_ring);
1589
1590        priv->rx_ring = NULL;
1591        priv->tx_ring = NULL;
1592}
1593
1594static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
1595{
1596        struct ipoib_dev_priv *priv = netdev_priv(dev);
1597
1598        return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
1599}
1600
1601static int ipoib_get_vf_config(struct net_device *dev, int vf,
1602                               struct ifla_vf_info *ivf)
1603{
1604        struct ipoib_dev_priv *priv = netdev_priv(dev);
1605        int err;
1606
1607        err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
1608        if (err)
1609                return err;
1610
1611        ivf->vf = vf;
1612
1613        return 0;
1614}
1615
1616static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
1617{
1618        struct ipoib_dev_priv *priv = netdev_priv(dev);
1619
1620        if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID)
1621                return -EINVAL;
1622
1623        return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
1624}
1625
1626static int ipoib_get_vf_stats(struct net_device *dev, int vf,
1627                              struct ifla_vf_stats *vf_stats)
1628{
1629        struct ipoib_dev_priv *priv = netdev_priv(dev);
1630
1631        return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
1632}
1633
1634static const struct header_ops ipoib_header_ops = {
1635        .create = ipoib_hard_header,
1636};
1637
1638static const struct net_device_ops ipoib_netdev_ops_pf = {
1639        .ndo_uninit              = ipoib_uninit,
1640        .ndo_open                = ipoib_open,
1641        .ndo_stop                = ipoib_stop,
1642        .ndo_change_mtu          = ipoib_change_mtu,
1643        .ndo_fix_features        = ipoib_fix_features,
1644        .ndo_start_xmit          = ipoib_start_xmit,
1645        .ndo_tx_timeout          = ipoib_timeout,
1646        .ndo_set_rx_mode         = ipoib_set_mcast_list,
1647        .ndo_get_iflink          = ipoib_get_iflink,
1648        .ndo_set_vf_link_state   = ipoib_set_vf_link_state,
1649        .ndo_get_vf_config       = ipoib_get_vf_config,
1650        .ndo_get_vf_stats        = ipoib_get_vf_stats,
1651        .ndo_set_vf_guid         = ipoib_set_vf_guid,
1652};
1653
1654static const struct net_device_ops ipoib_netdev_ops_vf = {
1655        .ndo_uninit              = ipoib_uninit,
1656        .ndo_open                = ipoib_open,
1657        .ndo_stop                = ipoib_stop,
1658        .ndo_change_mtu          = ipoib_change_mtu,
1659        .ndo_fix_features        = ipoib_fix_features,
1660        .ndo_start_xmit          = ipoib_start_xmit,
1661        .ndo_tx_timeout          = ipoib_timeout,
1662        .ndo_set_rx_mode         = ipoib_set_mcast_list,
1663        .ndo_get_iflink          = ipoib_get_iflink,
1664};
1665
1666void ipoib_setup(struct net_device *dev)
1667{
1668        struct ipoib_dev_priv *priv = netdev_priv(dev);
1669
1670        if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION)
1671                dev->netdev_ops = &ipoib_netdev_ops_vf;
1672        else
1673                dev->netdev_ops = &ipoib_netdev_ops_pf;
1674
1675        dev->header_ops          = &ipoib_header_ops;
1676
1677        ipoib_set_ethtool_ops(dev);
1678
1679        netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
1680
1681        dev->watchdog_timeo      = HZ;
1682
1683        dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
1684
1685        dev->hard_header_len     = IPOIB_ENCAP_LEN;
1686        dev->addr_len            = INFINIBAND_ALEN;
1687        dev->type                = ARPHRD_INFINIBAND;
1688        dev->tx_queue_len        = ipoib_sendq_size * 2;
1689        dev->features            = (NETIF_F_VLAN_CHALLENGED     |
1690                                    NETIF_F_HIGHDMA);
1691        netif_keep_dst(dev);
1692
1693        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
1694
1695        priv->dev = dev;
1696
1697        spin_lock_init(&priv->lock);
1698
1699        init_rwsem(&priv->vlan_rwsem);
1700
1701        INIT_LIST_HEAD(&priv->path_list);
1702        INIT_LIST_HEAD(&priv->child_intfs);
1703        INIT_LIST_HEAD(&priv->dead_ahs);
1704        INIT_LIST_HEAD(&priv->multicast_list);
1705
1706        INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
1707        INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1708        INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
1709        INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
1710        INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
1711        INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
1712        INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1713        INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
1714}
1715
1716struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
1717{
1718        struct net_device *dev;
1719
1720        dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name,
1721                           NET_NAME_UNKNOWN, ipoib_setup);
1722        if (!dev)
1723                return NULL;
1724
1725        return netdev_priv(dev);
1726}
1727
1728static ssize_t show_pkey(struct device *dev,
1729                         struct device_attribute *attr, char *buf)
1730{
1731        struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1732
1733        return sprintf(buf, "0x%04x\n", priv->pkey);
1734}
1735static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
1736
1737static ssize_t show_umcast(struct device *dev,
1738                           struct device_attribute *attr, char *buf)
1739{
1740        struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1741
1742        return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
1743}
1744
1745void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
1746{
1747        struct ipoib_dev_priv *priv = netdev_priv(ndev);
1748
1749        if (umcast_val > 0) {
1750                set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1751                ipoib_warn(priv, "ignoring multicast groups joined directly "
1752                                "by userspace\n");
1753        } else
1754                clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1755}
1756
1757static ssize_t set_umcast(struct device *dev,
1758                          struct device_attribute *attr,
1759                          const char *buf, size_t count)
1760{
1761        unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
1762
1763        ipoib_set_umcast(to_net_dev(dev), umcast_val);
1764
1765        return count;
1766}
1767static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
1768
1769int ipoib_add_umcast_attr(struct net_device *dev)
1770{
1771        return device_create_file(&dev->dev, &dev_attr_umcast);
1772}
1773
1774static ssize_t create_child(struct device *dev,
1775                            struct device_attribute *attr,
1776                            const char *buf, size_t count)
1777{
1778        int pkey;
1779        int ret;
1780
1781        if (sscanf(buf, "%i", &pkey) != 1)
1782                return -EINVAL;
1783
1784        if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
1785                return -EINVAL;
1786
1787        /*
1788         * Set the full membership bit, so that we join the right
1789         * broadcast group, etc.
1790         */
1791        pkey |= 0x8000;
1792
1793        ret = ipoib_vlan_add(to_net_dev(dev), pkey);
1794
1795        return ret ? ret : count;
1796}
1797static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
1798
1799static ssize_t delete_child(struct device *dev,
1800                            struct device_attribute *attr,
1801                            const char *buf, size_t count)
1802{
1803        int pkey;
1804        int ret;
1805
1806        if (sscanf(buf, "%i", &pkey) != 1)
1807                return -EINVAL;
1808
1809        if (pkey < 0 || pkey > 0xffff)
1810                return -EINVAL;
1811
1812        ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
1813
1814        return ret ? ret : count;
1815
1816}
1817static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
1818
1819int ipoib_add_pkey_attr(struct net_device *dev)
1820{
1821        return device_create_file(&dev->dev, &dev_attr_pkey);
1822}
1823
1824int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
1825{
1826        priv->hca_caps = hca->attrs.device_cap_flags;
1827
1828        if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
1829                priv->dev->hw_features = NETIF_F_SG |
1830                        NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
1831
1832                if (priv->hca_caps & IB_DEVICE_UD_TSO)
1833                        priv->dev->hw_features |= NETIF_F_TSO;
1834
1835                priv->dev->features |= priv->dev->hw_features;
1836        }
1837
1838        return 0;
1839}
1840
1841static struct net_device *ipoib_add_port(const char *format,
1842                                         struct ib_device *hca, u8 port)
1843{
1844        struct ipoib_dev_priv *priv;
1845        struct ib_port_attr attr;
1846        int result = -ENOMEM;
1847
1848        priv = ipoib_intf_alloc(format);
1849        if (!priv)
1850                goto alloc_mem_failed;
1851
1852        SET_NETDEV_DEV(priv->dev, hca->dma_device);
1853        priv->dev->dev_id = port - 1;
1854
1855        result = ib_query_port(hca, port, &attr);
1856        if (!result)
1857                priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
1858        else {
1859                printk(KERN_WARNING "%s: ib_query_port %d failed\n",
1860                       hca->name, port);
1861                goto device_init_failed;
1862        }
1863
1864        /* MTU will be reset when mcast join happens */
1865        priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
1866        priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
1867
1868        priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
1869
1870        result = ib_query_pkey(hca, port, 0, &priv->pkey);
1871        if (result) {
1872                printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
1873                       hca->name, port, result);
1874                goto device_init_failed;
1875        }
1876
1877        result = ipoib_set_dev_features(priv, hca);
1878        if (result)
1879                goto device_init_failed;
1880
1881        /*
1882         * Set the full membership bit, so that we join the right
1883         * broadcast group, etc.
1884         */
1885        priv->pkey |= 0x8000;
1886
1887        priv->dev->broadcast[8] = priv->pkey >> 8;
1888        priv->dev->broadcast[9] = priv->pkey & 0xff;
1889
1890        result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
1891        if (result) {
1892                printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
1893                       hca->name, port, result);
1894                goto device_init_failed;
1895        } else
1896                memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
1897
1898        result = ipoib_dev_init(priv->dev, hca, port);
1899        if (result < 0) {
1900                printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
1901                       hca->name, port, result);
1902                goto device_init_failed;
1903        }
1904
1905        INIT_IB_EVENT_HANDLER(&priv->event_handler,
1906                              priv->ca, ipoib_event);
1907        result = ib_register_event_handler(&priv->event_handler);
1908        if (result < 0) {
1909                printk(KERN_WARNING "%s: ib_register_event_handler failed for "
1910                       "port %d (ret = %d)\n",
1911                       hca->name, port, result);
1912                goto event_failed;
1913        }
1914
1915        result = register_netdev(priv->dev);
1916        if (result) {
1917                printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
1918                       hca->name, port, result);
1919                goto register_failed;
1920        }
1921
1922        ipoib_create_debug_files(priv->dev);
1923
1924        if (ipoib_cm_add_mode_attr(priv->dev))
1925                goto sysfs_failed;
1926        if (ipoib_add_pkey_attr(priv->dev))
1927                goto sysfs_failed;
1928        if (ipoib_add_umcast_attr(priv->dev))
1929                goto sysfs_failed;
1930        if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
1931                goto sysfs_failed;
1932        if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
1933                goto sysfs_failed;
1934
1935        return priv->dev;
1936
1937sysfs_failed:
1938        ipoib_delete_debug_files(priv->dev);
1939        unregister_netdev(priv->dev);
1940
1941register_failed:
1942        ib_unregister_event_handler(&priv->event_handler);
1943        flush_workqueue(ipoib_workqueue);
1944        /* Stop GC if started before flush */
1945        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1946        cancel_delayed_work(&priv->neigh_reap_task);
1947        flush_workqueue(priv->wq);
1948
1949event_failed:
1950        ipoib_dev_cleanup(priv->dev);
1951
1952device_init_failed:
1953        free_netdev(priv->dev);
1954
1955alloc_mem_failed:
1956        return ERR_PTR(result);
1957}
1958
1959static void ipoib_add_one(struct ib_device *device)
1960{
1961        struct list_head *dev_list;
1962        struct net_device *dev;
1963        struct ipoib_dev_priv *priv;
1964        int p;
1965        int count = 0;
1966
1967        dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
1968        if (!dev_list)
1969                return;
1970
1971        INIT_LIST_HEAD(dev_list);
1972
1973        for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
1974                if (!rdma_protocol_ib(device, p))
1975                        continue;
1976                dev = ipoib_add_port("ib%d", device, p);
1977                if (!IS_ERR(dev)) {
1978                        priv = netdev_priv(dev);
1979                        list_add_tail(&priv->list, dev_list);
1980                        count++;
1981                }
1982        }
1983
1984        if (!count) {
1985                kfree(dev_list);
1986                return;
1987        }
1988
1989        ib_set_client_data(device, &ipoib_client, dev_list);
1990}
1991
1992static void ipoib_remove_one(struct ib_device *device, void *client_data)
1993{
1994        struct ipoib_dev_priv *priv, *tmp;
1995        struct list_head *dev_list = client_data;
1996
1997        if (!dev_list)
1998                return;
1999
2000        list_for_each_entry_safe(priv, tmp, dev_list, list) {
2001                ib_unregister_event_handler(&priv->event_handler);
2002                flush_workqueue(ipoib_workqueue);
2003
2004                rtnl_lock();
2005                dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
2006                rtnl_unlock();
2007
2008                /* Stop GC */
2009                set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
2010                cancel_delayed_work(&priv->neigh_reap_task);
2011                flush_workqueue(priv->wq);
2012
2013                unregister_netdev(priv->dev);
2014                free_netdev(priv->dev);
2015        }
2016
2017        kfree(dev_list);
2018}
2019
2020static int __init ipoib_init_module(void)
2021{
2022        int ret;
2023
2024        ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
2025        ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
2026        ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
2027
2028        ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
2029        ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
2030        ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
2031#ifdef CONFIG_INFINIBAND_IPOIB_CM
2032        ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
2033#endif
2034
2035        /*
2036         * When copying small received packets, we only copy from the
2037         * linear data part of the SKB, so we rely on this condition.
2038         */
2039        BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
2040
2041        ret = ipoib_register_debugfs();
2042        if (ret)
2043                return ret;
2044
2045        /*
2046         * We create a global workqueue here that is used for all flush
2047         * operations.  However, if you attempt to flush a workqueue
2048         * from a task on that same workqueue, it deadlocks the system.
2049         * We want to be able to flush the tasks associated with a
2050         * specific net device, so we also create a workqueue for each
2051         * netdevice.  We queue up the tasks for that device only on
2052         * its private workqueue, and we only queue up flush events
2053         * on our global flush workqueue.  This avoids the deadlocks.
2054         */
2055        ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
2056        if (!ipoib_workqueue) {
2057                ret = -ENOMEM;
2058                goto err_fs;
2059        }
2060
2061        ib_sa_register_client(&ipoib_sa_client);
2062
2063        ret = ib_register_client(&ipoib_client);
2064        if (ret)
2065                goto err_sa;
2066
2067        ret = ipoib_netlink_init();
2068        if (ret)
2069                goto err_client;
2070
2071        return 0;
2072
2073err_client:
2074        ib_unregister_client(&ipoib_client);
2075
2076err_sa:
2077        ib_sa_unregister_client(&ipoib_sa_client);
2078        destroy_workqueue(ipoib_workqueue);
2079
2080err_fs:
2081        ipoib_unregister_debugfs();
2082
2083        return ret;
2084}
2085
/*
 * ipoib_cleanup_module - module exit point
 *
 * Releases what ipoib_init_module() set up: the netlink ops, the IB
 * client, the SA client, the debugfs registration and the global flush
 * workqueue.
 */
static void __exit ipoib_cleanup_module(void)
{
        ipoib_netlink_fini();
        ib_unregister_client(&ipoib_client);
        ib_sa_unregister_client(&ipoib_sa_client);
        ipoib_unregister_debugfs();
        destroy_workqueue(ipoib_workqueue);
}

/* Hook the init/exit routines into the module loader. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);
2097