/* linux/drivers/infiniband/ulp/ipoib/ipoib_main.c */
   1/*
   2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
   3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
   4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
   5 *
   6 * This software is available to you under a choice of one of two
   7 * licenses.  You may choose to be licensed under the terms of the GNU
   8 * General Public License (GPL) Version 2, available from the file
   9 * COPYING in the main directory of this source tree, or the
  10 * OpenIB.org BSD license below:
  11 *
  12 *     Redistribution and use in source and binary forms, with or
  13 *     without modification, are permitted provided that the following
  14 *     conditions are met:
  15 *
  16 *      - Redistributions of source code must retain the above
  17 *        copyright notice, this list of conditions and the following
  18 *        disclaimer.
  19 *
  20 *      - Redistributions in binary form must reproduce the above
  21 *        copyright notice, this list of conditions and the following
  22 *        disclaimer in the documentation and/or other materials
  23 *        provided with the distribution.
  24 *
  25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32 * SOFTWARE.
  33 */
  34
  35#include "ipoib.h"
  36
  37#include <linux/module.h>
  38
  39#include <linux/init.h>
  40#include <linux/slab.h>
  41#include <linux/kernel.h>
  42#include <linux/vmalloc.h>
  43
  44#include <linux/if_arp.h>       /* For ARPHRD_xxx */
  45
  46#include <linux/ip.h>
  47#include <linux/in.h>
  48
  49#include <linux/jhash.h>
  50#include <net/arp.h>
  51#include <net/addrconf.h>
  52#include <linux/inetdevice.h>
  53#include <rdma/ib_cache.h>
  54
  55MODULE_AUTHOR("Roland Dreier");
  56MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
  57MODULE_LICENSE("Dual BSD/GPL");
  58
  59int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
  60int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
  61
  62module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
  63MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
  64module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
  65MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
  66
  67#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  68int ipoib_debug_level;
  69
  70module_param_named(debug_level, ipoib_debug_level, int, 0644);
  71MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
  72#endif
  73
  74struct ipoib_path_iter {
  75        struct net_device *dev;
  76        struct ipoib_path  path;
  77};
  78
  79static const u8 ipv4_bcast_addr[] = {
  80        0x00, 0xff, 0xff, 0xff,
  81        0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  82        0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
  83};
  84
  85struct workqueue_struct *ipoib_workqueue;
  86
  87struct ib_sa_client ipoib_sa_client;
  88
  89static int ipoib_add_one(struct ib_device *device);
  90static void ipoib_remove_one(struct ib_device *device, void *client_data);
  91static void ipoib_neigh_reclaim(struct rcu_head *rp);
  92static struct net_device *ipoib_get_net_dev_by_params(
  93                struct ib_device *dev, u8 port, u16 pkey,
  94                const union ib_gid *gid, const struct sockaddr *addr,
  95                void *client_data);
  96static int ipoib_set_mac(struct net_device *dev, void *addr);
  97static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
  98                       int cmd);
  99
 100static struct ib_client ipoib_client = {
 101        .name   = "ipoib",
 102        .add    = ipoib_add_one,
 103        .remove = ipoib_remove_one,
 104        .get_net_dev_by_params = ipoib_get_net_dev_by_params,
 105};
 106
 107#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 108static int ipoib_netdev_event(struct notifier_block *this,
 109                              unsigned long event, void *ptr)
 110{
 111        struct netdev_notifier_info *ni = ptr;
 112        struct net_device *dev = ni->dev;
 113
 114        if (dev->netdev_ops->ndo_open != ipoib_open)
 115                return NOTIFY_DONE;
 116
 117        switch (event) {
 118        case NETDEV_REGISTER:
 119                ipoib_create_debug_files(dev);
 120                break;
 121        case NETDEV_CHANGENAME:
 122                ipoib_delete_debug_files(dev);
 123                ipoib_create_debug_files(dev);
 124                break;
 125        case NETDEV_UNREGISTER:
 126                ipoib_delete_debug_files(dev);
 127                break;
 128        }
 129
 130        return NOTIFY_DONE;
 131}
 132#endif
 133
 134int ipoib_open(struct net_device *dev)
 135{
 136        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 137
 138        ipoib_dbg(priv, "bringing up interface\n");
 139
 140        netif_carrier_off(dev);
 141
 142        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 143
 144        if (ipoib_ib_dev_open(dev)) {
 145                if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
 146                        return 0;
 147                goto err_disable;
 148        }
 149
 150        ipoib_ib_dev_up(dev);
 151
 152        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 153                struct ipoib_dev_priv *cpriv;
 154
 155                /* Bring up any child interfaces too */
 156                down_read(&priv->vlan_rwsem);
 157                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 158                        int flags;
 159
 160                        flags = cpriv->dev->flags;
 161                        if (flags & IFF_UP)
 162                                continue;
 163
 164                        dev_change_flags(cpriv->dev, flags | IFF_UP, NULL);
 165                }
 166                up_read(&priv->vlan_rwsem);
 167        }
 168
 169        netif_start_queue(dev);
 170
 171        return 0;
 172
 173err_disable:
 174        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 175
 176        return -EINVAL;
 177}
 178
 179static int ipoib_stop(struct net_device *dev)
 180{
 181        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 182
 183        ipoib_dbg(priv, "stopping interface\n");
 184
 185        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 186
 187        netif_stop_queue(dev);
 188
 189        ipoib_ib_dev_down(dev);
 190        ipoib_ib_dev_stop(dev);
 191
 192        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 193                struct ipoib_dev_priv *cpriv;
 194
 195                /* Bring down any child interfaces too */
 196                down_read(&priv->vlan_rwsem);
 197                list_for_each_entry(cpriv, &priv->child_intfs, list) {
 198                        int flags;
 199
 200                        flags = cpriv->dev->flags;
 201                        if (!(flags & IFF_UP))
 202                                continue;
 203
 204                        dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL);
 205                }
 206                up_read(&priv->vlan_rwsem);
 207        }
 208
 209        return 0;
 210}
 211
 212static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
 213{
 214        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 215
 216        if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
 217                features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
 218
 219        return features;
 220}
 221
 222static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 223{
 224        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 225        int ret = 0;
 226
 227        /* dev->mtu > 2K ==> connected mode */
 228        if (ipoib_cm_admin_enabled(dev)) {
 229                if (new_mtu > ipoib_cm_max_mtu(dev))
 230                        return -EINVAL;
 231
 232                if (new_mtu > priv->mcast_mtu)
 233                        ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 234                                   priv->mcast_mtu);
 235
 236                dev->mtu = new_mtu;
 237                return 0;
 238        }
 239
 240        if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) ||
 241            new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 242                return -EINVAL;
 243
 244        priv->admin_mtu = new_mtu;
 245
 246        if (priv->mcast_mtu < priv->admin_mtu)
 247                ipoib_dbg(priv, "MTU must be smaller than the underlying "
 248                                "link layer MTU - 4 (%u)\n", priv->mcast_mtu);
 249
 250        new_mtu = min(priv->mcast_mtu, priv->admin_mtu);
 251
 252        if (priv->rn_ops->ndo_change_mtu) {
 253                bool carrier_status = netif_carrier_ok(dev);
 254
 255                netif_carrier_off(dev);
 256
 257                /* notify lower level on the real mtu */
 258                ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu);
 259
 260                if (carrier_status)
 261                        netif_carrier_on(dev);
 262        } else {
 263                dev->mtu = new_mtu;
 264        }
 265
 266        return ret;
 267}
 268
 269static void ipoib_get_stats(struct net_device *dev,
 270                            struct rtnl_link_stats64 *stats)
 271{
 272        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 273
 274        if (priv->rn_ops->ndo_get_stats64)
 275                priv->rn_ops->ndo_get_stats64(dev, stats);
 276        else
 277                netdev_stats_to_stats64(stats, &dev->stats);
 278}
 279
 280/* Called with an RCU read lock taken */
 281static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
 282                                        struct net_device *dev)
 283{
 284        struct net *net = dev_net(dev);
 285        struct in_device *in_dev;
 286        struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
 287        struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
 288        __be32 ret_addr;
 289
 290        switch (addr->sa_family) {
 291        case AF_INET:
 292                in_dev = in_dev_get(dev);
 293                if (!in_dev)
 294                        return false;
 295
 296                ret_addr = inet_confirm_addr(net, in_dev, 0,
 297                                             addr_in->sin_addr.s_addr,
 298                                             RT_SCOPE_HOST);
 299                in_dev_put(in_dev);
 300                if (ret_addr)
 301                        return true;
 302
 303                break;
 304        case AF_INET6:
 305                if (IS_ENABLED(CONFIG_IPV6) &&
 306                    ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
 307                        return true;
 308
 309                break;
 310        }
 311        return false;
 312}
 313
 314/**
 315 * Find the master net_device on top of the given net_device.
 316 * @dev: base IPoIB net_device
 317 *
 318 * Returns the master net_device with a reference held, or the same net_device
 319 * if no master exists.
 320 */
 321static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
 322{
 323        struct net_device *master;
 324
 325        rcu_read_lock();
 326        master = netdev_master_upper_dev_get_rcu(dev);
 327        if (master)
 328                dev_hold(master);
 329        rcu_read_unlock();
 330
 331        if (master)
 332                return master;
 333
 334        dev_hold(dev);
 335        return dev;
 336}
 337
 338struct ipoib_walk_data {
 339        const struct sockaddr *addr;
 340        struct net_device *result;
 341};
 342
 343static int ipoib_upper_walk(struct net_device *upper,
 344                            struct netdev_nested_priv *priv)
 345{
 346        struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data;
 347        int ret = 0;
 348
 349        if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
 350                dev_hold(upper);
 351                data->result = upper;
 352                ret = 1;
 353        }
 354
 355        return ret;
 356}
 357
 358/**
 359 * Find a net_device matching the given address, which is an upper device of
 360 * the given net_device.
 361 * @addr: IP address to look for.
 362 * @dev: base IPoIB net_device
 363 *
 364 * If found, returns the net_device with a reference held. Otherwise return
 365 * NULL.
 366 */
 367static struct net_device *ipoib_get_net_dev_match_addr(
 368                const struct sockaddr *addr, struct net_device *dev)
 369{
 370        struct netdev_nested_priv priv;
 371        struct ipoib_walk_data data = {
 372                .addr = addr,
 373        };
 374
 375        priv.data = (void *)&data;
 376        rcu_read_lock();
 377        if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
 378                dev_hold(dev);
 379                data.result = dev;
 380                goto out;
 381        }
 382
 383        netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv);
 384out:
 385        rcu_read_unlock();
 386        return data.result;
 387}
 388
 389/* returns the number of IPoIB netdevs on top a given ipoib device matching a
 390 * pkey_index and address, if one exists.
 391 *
 392 * @found_net_dev: contains a matching net_device if the return value >= 1,
 393 * with a reference held. */
 394static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
 395                                     const union ib_gid *gid,
 396                                     u16 pkey_index,
 397                                     const struct sockaddr *addr,
 398                                     int nesting,
 399                                     struct net_device **found_net_dev)
 400{
 401        struct ipoib_dev_priv *child_priv;
 402        struct net_device *net_dev = NULL;
 403        int matches = 0;
 404
 405        if (priv->pkey_index == pkey_index &&
 406            (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
 407                if (!addr) {
 408                        net_dev = ipoib_get_master_net_dev(priv->dev);
 409                } else {
 410                        /* Verify the net_device matches the IP address, as
 411                         * IPoIB child devices currently share a GID. */
 412                        net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
 413                }
 414                if (net_dev) {
 415                        if (!*found_net_dev)
 416                                *found_net_dev = net_dev;
 417                        else
 418                                dev_put(net_dev);
 419                        ++matches;
 420                }
 421        }
 422
 423        /* Check child interfaces */
 424        down_read_nested(&priv->vlan_rwsem, nesting);
 425        list_for_each_entry(child_priv, &priv->child_intfs, list) {
 426                matches += ipoib_match_gid_pkey_addr(child_priv, gid,
 427                                                    pkey_index, addr,
 428                                                    nesting + 1,
 429                                                    found_net_dev);
 430                if (matches > 1)
 431                        break;
 432        }
 433        up_read(&priv->vlan_rwsem);
 434
 435        return matches;
 436}
 437
 438/* Returns the number of matching net_devs found (between 0 and 2). Also
 439 * return the matching net_device in the @net_dev parameter, holding a
 440 * reference to the net_device, if the number of matches >= 1 */
 441static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
 442                                         u16 pkey_index,
 443                                         const union ib_gid *gid,
 444                                         const struct sockaddr *addr,
 445                                         struct net_device **net_dev)
 446{
 447        struct ipoib_dev_priv *priv;
 448        int matches = 0;
 449
 450        *net_dev = NULL;
 451
 452        list_for_each_entry(priv, dev_list, list) {
 453                if (priv->port != port)
 454                        continue;
 455
 456                matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
 457                                                     addr, 0, net_dev);
 458                if (matches > 1)
 459                        break;
 460        }
 461
 462        return matches;
 463}
 464
 465static struct net_device *ipoib_get_net_dev_by_params(
 466                struct ib_device *dev, u8 port, u16 pkey,
 467                const union ib_gid *gid, const struct sockaddr *addr,
 468                void *client_data)
 469{
 470        struct net_device *net_dev;
 471        struct list_head *dev_list = client_data;
 472        u16 pkey_index;
 473        int matches;
 474        int ret;
 475
 476        if (!rdma_protocol_ib(dev, port))
 477                return NULL;
 478
 479        ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
 480        if (ret)
 481                return NULL;
 482
 483        /* See if we can find a unique device matching the L2 parameters */
 484        matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
 485                                                gid, NULL, &net_dev);
 486
 487        switch (matches) {
 488        case 0:
 489                return NULL;
 490        case 1:
 491                return net_dev;
 492        }
 493
 494        dev_put(net_dev);
 495
 496        /* Couldn't find a unique device with L2 parameters only. Use L3
 497         * address to uniquely match the net device */
 498        matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
 499                                                gid, addr, &net_dev);
 500        switch (matches) {
 501        case 0:
 502                return NULL;
 503        default:
 504                dev_warn_ratelimited(&dev->dev,
 505                                     "duplicate IP address detected\n");
 506                fallthrough;
 507        case 1:
 508                return net_dev;
 509        }
 510}
 511
 512int ipoib_set_mode(struct net_device *dev, const char *buf)
 513{
 514        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 515
 516        if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
 517             !strcmp(buf, "connected\n")) ||
 518             (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
 519             !strcmp(buf, "datagram\n"))) {
 520                return 0;
 521        }
 522
 523        /* flush paths if we switch modes so that connections are restarted */
 524        if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
 525                set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 526                ipoib_warn(priv, "enabling connected mode "
 527                           "will cause multicast packet drops\n");
 528                netdev_update_features(dev);
 529                dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
 530                netif_set_real_num_tx_queues(dev, 1);
 531                rtnl_unlock();
 532                priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
 533
 534                ipoib_flush_paths(dev);
 535                return (!rtnl_trylock()) ? -EBUSY : 0;
 536        }
 537
 538        if (!strcmp(buf, "datagram\n")) {
 539                clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 540                netdev_update_features(dev);
 541                dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
 542                netif_set_real_num_tx_queues(dev, dev->num_tx_queues);
 543                rtnl_unlock();
 544                ipoib_flush_paths(dev);
 545                return (!rtnl_trylock()) ? -EBUSY : 0;
 546        }
 547
 548        return -EINVAL;
 549}
 550
 551struct ipoib_path *__path_find(struct net_device *dev, void *gid)
 552{
 553        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 554        struct rb_node *n = priv->path_tree.rb_node;
 555        struct ipoib_path *path;
 556        int ret;
 557
 558        while (n) {
 559                path = rb_entry(n, struct ipoib_path, rb_node);
 560
 561                ret = memcmp(gid, path->pathrec.dgid.raw,
 562                             sizeof (union ib_gid));
 563
 564                if (ret < 0)
 565                        n = n->rb_left;
 566                else if (ret > 0)
 567                        n = n->rb_right;
 568                else
 569                        return path;
 570        }
 571
 572        return NULL;
 573}
 574
 575static int __path_add(struct net_device *dev, struct ipoib_path *path)
 576{
 577        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 578        struct rb_node **n = &priv->path_tree.rb_node;
 579        struct rb_node *pn = NULL;
 580        struct ipoib_path *tpath;
 581        int ret;
 582
 583        while (*n) {
 584                pn = *n;
 585                tpath = rb_entry(pn, struct ipoib_path, rb_node);
 586
 587                ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
 588                             sizeof (union ib_gid));
 589                if (ret < 0)
 590                        n = &pn->rb_left;
 591                else if (ret > 0)
 592                        n = &pn->rb_right;
 593                else
 594                        return -EEXIST;
 595        }
 596
 597        rb_link_node(&path->rb_node, pn, n);
 598        rb_insert_color(&path->rb_node, &priv->path_tree);
 599
 600        list_add_tail(&path->list, &priv->path_list);
 601
 602        return 0;
 603}
 604
 605static void path_free(struct net_device *dev, struct ipoib_path *path)
 606{
 607        struct sk_buff *skb;
 608
 609        while ((skb = __skb_dequeue(&path->queue)))
 610                dev_kfree_skb_irq(skb);
 611
 612        ipoib_dbg(ipoib_priv(dev), "%s\n", __func__);
 613
 614        /* remove all neigh connected to this path */
 615        ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
 616
 617        if (path->ah)
 618                ipoib_put_ah(path->ah);
 619
 620        kfree(path);
 621}
 622
 623#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 624
 625struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
 626{
 627        struct ipoib_path_iter *iter;
 628
 629        iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 630        if (!iter)
 631                return NULL;
 632
 633        iter->dev = dev;
 634        memset(iter->path.pathrec.dgid.raw, 0, 16);
 635
 636        if (ipoib_path_iter_next(iter)) {
 637                kfree(iter);
 638                return NULL;
 639        }
 640
 641        return iter;
 642}
 643
 644int ipoib_path_iter_next(struct ipoib_path_iter *iter)
 645{
 646        struct ipoib_dev_priv *priv = ipoib_priv(iter->dev);
 647        struct rb_node *n;
 648        struct ipoib_path *path;
 649        int ret = 1;
 650
 651        spin_lock_irq(&priv->lock);
 652
 653        n = rb_first(&priv->path_tree);
 654
 655        while (n) {
 656                path = rb_entry(n, struct ipoib_path, rb_node);
 657
 658                if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
 659                           sizeof (union ib_gid)) < 0) {
 660                        iter->path = *path;
 661                        ret = 0;
 662                        break;
 663                }
 664
 665                n = rb_next(n);
 666        }
 667
 668        spin_unlock_irq(&priv->lock);
 669
 670        return ret;
 671}
 672
 673void ipoib_path_iter_read(struct ipoib_path_iter *iter,
 674                          struct ipoib_path *path)
 675{
 676        *path = iter->path;
 677}
 678
 679#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
 680
 681void ipoib_mark_paths_invalid(struct net_device *dev)
 682{
 683        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 684        struct ipoib_path *path, *tp;
 685
 686        spin_lock_irq(&priv->lock);
 687
 688        list_for_each_entry_safe(path, tp, &priv->path_list, list) {
 689                ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n",
 690                          be32_to_cpu(sa_path_get_dlid(&path->pathrec)),
 691                          path->pathrec.dgid.raw);
 692                if (path->ah)
 693                        path->ah->valid = 0;
 694        }
 695
 696        spin_unlock_irq(&priv->lock);
 697}
 698
 699static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
 700{
 701        struct ipoib_pseudo_header *phdr;
 702
 703        phdr = skb_push(skb, sizeof(*phdr));
 704        memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
 705}
 706
 707void ipoib_flush_paths(struct net_device *dev)
 708{
 709        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 710        struct ipoib_path *path, *tp;
 711        LIST_HEAD(remove_list);
 712        unsigned long flags;
 713
 714        netif_tx_lock_bh(dev);
 715        spin_lock_irqsave(&priv->lock, flags);
 716
 717        list_splice_init(&priv->path_list, &remove_list);
 718
 719        list_for_each_entry(path, &remove_list, list)
 720                rb_erase(&path->rb_node, &priv->path_tree);
 721
 722        list_for_each_entry_safe(path, tp, &remove_list, list) {
 723                if (path->query)
 724                        ib_sa_cancel_query(path->query_id, path->query);
 725                spin_unlock_irqrestore(&priv->lock, flags);
 726                netif_tx_unlock_bh(dev);
 727                wait_for_completion(&path->done);
 728                path_free(dev, path);
 729                netif_tx_lock_bh(dev);
 730                spin_lock_irqsave(&priv->lock, flags);
 731        }
 732
 733        spin_unlock_irqrestore(&priv->lock, flags);
 734        netif_tx_unlock_bh(dev);
 735}
 736
 737static void path_rec_completion(int status,
 738                                struct sa_path_rec *pathrec,
 739                                void *path_ptr)
 740{
 741        struct ipoib_path *path = path_ptr;
 742        struct net_device *dev = path->dev;
 743        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 744        struct ipoib_ah *ah = NULL;
 745        struct ipoib_ah *old_ah = NULL;
 746        struct ipoib_neigh *neigh, *tn;
 747        struct sk_buff_head skqueue;
 748        struct sk_buff *skb;
 749        unsigned long flags;
 750
 751        if (!status)
 752                ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
 753                          be32_to_cpu(sa_path_get_dlid(pathrec)),
 754                          pathrec->dgid.raw);
 755        else
 756                ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
 757                          status, path->pathrec.dgid.raw);
 758
 759        skb_queue_head_init(&skqueue);
 760
 761        if (!status) {
 762                struct rdma_ah_attr av;
 763
 764                if (!ib_init_ah_attr_from_path(priv->ca, priv->port,
 765                                               pathrec, &av, NULL)) {
 766                        ah = ipoib_create_ah(dev, priv->pd, &av);
 767                        rdma_destroy_ah_attr(&av);
 768                }
 769        }
 770
 771        spin_lock_irqsave(&priv->lock, flags);
 772
 773        if (!IS_ERR_OR_NULL(ah)) {
 774                /*
 775                 * pathrec.dgid is used as the database key from the LLADDR,
 776                 * it must remain unchanged even if the SA returns a different
 777                 * GID to use in the AH.
 778                 */
 779                if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw,
 780                           sizeof(union ib_gid))) {
 781                        ipoib_dbg(
 782                                priv,
 783                                "%s got PathRec for gid %pI6 while asked for %pI6\n",
 784                                dev->name, pathrec->dgid.raw,
 785                                path->pathrec.dgid.raw);
 786                        memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw,
 787                               sizeof(union ib_gid));
 788                }
 789
 790                path->pathrec = *pathrec;
 791
 792                old_ah   = path->ah;
 793                path->ah = ah;
 794
 795                ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
 796                          ah, be32_to_cpu(sa_path_get_dlid(pathrec)),
 797                          pathrec->sl);
 798
 799                while ((skb = __skb_dequeue(&path->queue)))
 800                        __skb_queue_tail(&skqueue, skb);
 801
 802                list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
 803                        if (neigh->ah) {
 804                                WARN_ON(neigh->ah != old_ah);
 805                                /*
 806                                 * Dropping the ah reference inside
 807                                 * priv->lock is safe here, because we
 808                                 * will hold one more reference from
 809                                 * the original value of path->ah (ie
 810                                 * old_ah).
 811                                 */
 812                                ipoib_put_ah(neigh->ah);
 813                        }
 814                        kref_get(&path->ah->ref);
 815                        neigh->ah = path->ah;
 816
 817                        if (ipoib_cm_enabled(dev, neigh->daddr)) {
 818                                if (!ipoib_cm_get(neigh))
 819                                        ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
 820                                                                               path,
 821                                                                               neigh));
 822                                if (!ipoib_cm_get(neigh)) {
 823                                        ipoib_neigh_free(neigh);
 824                                        continue;
 825                                }
 826                        }
 827
 828                        while ((skb = __skb_dequeue(&neigh->queue)))
 829                                __skb_queue_tail(&skqueue, skb);
 830                }
 831                path->ah->valid = 1;
 832        }
 833
 834        path->query = NULL;
 835        complete(&path->done);
 836
 837        spin_unlock_irqrestore(&priv->lock, flags);
 838
 839        if (IS_ERR_OR_NULL(ah))
 840                ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
 841
 842        if (old_ah)
 843                ipoib_put_ah(old_ah);
 844
 845        while ((skb = __skb_dequeue(&skqueue))) {
 846                int ret;
 847                skb->dev = dev;
 848                ret = dev_queue_xmit(skb);
 849                if (ret)
 850                        ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n",
 851                                   __func__, ret);
 852        }
 853}
 854
 855static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path,
 856                          void *gid)
 857{
 858        path->dev = priv->dev;
 859
 860        if (rdma_cap_opa_ah(priv->ca, priv->port))
 861                path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA;
 862        else
 863                path->pathrec.rec_type = SA_PATH_REC_TYPE_IB;
 864
 865        memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid));
 866        path->pathrec.sgid          = priv->local_gid;
 867        path->pathrec.pkey          = cpu_to_be16(priv->pkey);
 868        path->pathrec.numb_path     = 1;
 869        path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 870}
 871
 872static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 873{
 874        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 875        struct ipoib_path *path;
 876
 877        if (!priv->broadcast)
 878                return NULL;
 879
 880        path = kzalloc(sizeof(*path), GFP_ATOMIC);
 881        if (!path)
 882                return NULL;
 883
 884        skb_queue_head_init(&path->queue);
 885
 886        INIT_LIST_HEAD(&path->neigh_list);
 887
 888        init_path_rec(priv, path, gid);
 889
 890        return path;
 891}
 892
 893static int path_rec_start(struct net_device *dev,
 894                          struct ipoib_path *path)
 895{
 896        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 897
 898        ipoib_dbg(priv, "Start path record lookup for %pI6\n",
 899                  path->pathrec.dgid.raw);
 900
 901        init_completion(&path->done);
 902
 903        path->query_id =
 904                ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
 905                                   &path->pathrec,
 906                                   IB_SA_PATH_REC_DGID          |
 907                                   IB_SA_PATH_REC_SGID          |
 908                                   IB_SA_PATH_REC_NUMB_PATH     |
 909                                   IB_SA_PATH_REC_TRAFFIC_CLASS |
 910                                   IB_SA_PATH_REC_PKEY,
 911                                   1000, GFP_ATOMIC,
 912                                   path_rec_completion,
 913                                   path, &path->query);
 914        if (path->query_id < 0) {
 915                ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
 916                path->query = NULL;
 917                complete(&path->done);
 918                return path->query_id;
 919        }
 920
 921        return 0;
 922}
 923
 924static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr,
 925                               struct net_device *dev)
 926{
 927        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 928        struct ipoib_path *path;
 929        unsigned long flags;
 930
 931        spin_lock_irqsave(&priv->lock, flags);
 932
 933        path = __path_find(dev, daddr + 4);
 934        if (!path)
 935                goto out;
 936        if (!path->query)
 937                path_rec_start(dev, path);
 938out:
 939        spin_unlock_irqrestore(&priv->lock, flags);
 940}
 941
 942static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr,
 943                                          struct net_device *dev)
 944{
 945        struct ipoib_dev_priv *priv = ipoib_priv(dev);
 946        struct rdma_netdev *rn = netdev_priv(dev);
 947        struct ipoib_path *path;
 948        struct ipoib_neigh *neigh;
 949        unsigned long flags;
 950
 951        spin_lock_irqsave(&priv->lock, flags);
 952        neigh = ipoib_neigh_alloc(daddr, dev);
 953        if (!neigh) {
 954                spin_unlock_irqrestore(&priv->lock, flags);
 955                ++dev->stats.tx_dropped;
 956                dev_kfree_skb_any(skb);
 957                return NULL;
 958        }
 959
 960        /* To avoid race condition, make sure that the
 961         * neigh will be added only once.
 962         */
 963        if (unlikely(!list_empty(&neigh->list))) {
 964                spin_unlock_irqrestore(&priv->lock, flags);
 965                return neigh;
 966        }
 967
 968        path = __path_find(dev, daddr + 4);
 969        if (!path) {
 970                path = path_rec_create(dev, daddr + 4);
 971                if (!path)
 972                        goto err_path;
 973
 974                __path_add(dev, path);
 975        }
 976
 977        list_add_tail(&neigh->list, &path->neigh_list);
 978
 979        if (path->ah && path->ah->valid) {
 980                kref_get(&path->ah->ref);
 981                neigh->ah = path->ah;
 982
 983                if (ipoib_cm_enabled(dev, neigh->daddr)) {
 984                        if (!ipoib_cm_get(neigh))
 985                                ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
 986                        if (!ipoib_cm_get(neigh)) {
 987                                ipoib_neigh_free(neigh);
 988                                goto err_drop;
 989                        }
 990                        if (skb_queue_len(&neigh->queue) <
 991                            IPOIB_MAX_PATH_REC_QUEUE) {
 992                                push_pseudo_header(skb, neigh->daddr);
 993                                __skb_queue_tail(&neigh->queue, skb);
 994                        } else {
 995                                ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
 996                                           skb_queue_len(&neigh->queue));
 997                                goto err_drop;
 998                        }
 999                } else {
1000                        spin_unlock_irqrestore(&priv->lock, flags);
1001                        path->ah->last_send = rn->send(dev, skb, path->ah->ah,
1002                                                       IPOIB_QPN(daddr));
1003                        ipoib_neigh_put(neigh);
1004                        return NULL;
1005                }
1006        } else {
1007                neigh->ah  = NULL;
1008
1009                if (!path->query && path_rec_start(dev, path))
1010                        goto err_path;
1011                if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1012                        push_pseudo_header(skb, neigh->daddr);
1013                        __skb_queue_tail(&neigh->queue, skb);
1014                } else {
1015                        goto err_drop;
1016                }
1017        }
1018
1019        spin_unlock_irqrestore(&priv->lock, flags);
1020        ipoib_neigh_put(neigh);
1021        return NULL;
1022
1023err_path:
1024        ipoib_neigh_free(neigh);
1025err_drop:
1026        ++dev->stats.tx_dropped;
1027        dev_kfree_skb_any(skb);
1028
1029        spin_unlock_irqrestore(&priv->lock, flags);
1030        ipoib_neigh_put(neigh);
1031
1032        return NULL;
1033}
1034
1035static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
1036                             struct ipoib_pseudo_header *phdr)
1037{
1038        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1039        struct rdma_netdev *rn = netdev_priv(dev);
1040        struct ipoib_path *path;
1041        unsigned long flags;
1042
1043        spin_lock_irqsave(&priv->lock, flags);
1044
1045        /* no broadcast means that all paths are (going to be) not valid */
1046        if (!priv->broadcast)
1047                goto drop_and_unlock;
1048
1049        path = __path_find(dev, phdr->hwaddr + 4);
1050        if (!path || !path->ah || !path->ah->valid) {
1051                if (!path) {
1052                        path = path_rec_create(dev, phdr->hwaddr + 4);
1053                        if (!path)
1054                                goto drop_and_unlock;
1055                        __path_add(dev, path);
1056                } else {
1057                        /*
1058                         * make sure there are no changes in the existing
1059                         * path record
1060                         */
1061                        init_path_rec(priv, path, phdr->hwaddr + 4);
1062                }
1063                if (!path->query && path_rec_start(dev, path)) {
1064                        goto drop_and_unlock;
1065                }
1066
1067                if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1068                        push_pseudo_header(skb, phdr->hwaddr);
1069                        __skb_queue_tail(&path->queue, skb);
1070                        goto unlock;
1071                } else {
1072                        goto drop_and_unlock;
1073                }
1074        }
1075
1076        spin_unlock_irqrestore(&priv->lock, flags);
1077        ipoib_dbg(priv, "Send unicast ARP to %08x\n",
1078                  be32_to_cpu(sa_path_get_dlid(&path->pathrec)));
1079        path->ah->last_send = rn->send(dev, skb, path->ah->ah,
1080                                       IPOIB_QPN(phdr->hwaddr));
1081        return;
1082
1083drop_and_unlock:
1084        ++dev->stats.tx_dropped;
1085        dev_kfree_skb_any(skb);
1086unlock:
1087        spin_unlock_irqrestore(&priv->lock, flags);
1088}
1089
1090static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
1091{
1092        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1093        struct rdma_netdev *rn = netdev_priv(dev);
1094        struct ipoib_neigh *neigh;
1095        struct ipoib_pseudo_header *phdr;
1096        struct ipoib_header *header;
1097        unsigned long flags;
1098
1099        phdr = (struct ipoib_pseudo_header *) skb->data;
1100        skb_pull(skb, sizeof(*phdr));
1101        header = (struct ipoib_header *) skb->data;
1102
1103        if (unlikely(phdr->hwaddr[4] == 0xff)) {
1104                /* multicast, arrange "if" according to probability */
1105                if ((header->proto != htons(ETH_P_IP)) &&
1106                    (header->proto != htons(ETH_P_IPV6)) &&
1107                    (header->proto != htons(ETH_P_ARP)) &&
1108                    (header->proto != htons(ETH_P_RARP)) &&
1109                    (header->proto != htons(ETH_P_TIPC))) {
1110                        /* ethertype not supported by IPoIB */
1111                        ++dev->stats.tx_dropped;
1112                        dev_kfree_skb_any(skb);
1113                        return NETDEV_TX_OK;
1114                }
1115                /* Add in the P_Key for multicast*/
1116                phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
1117                phdr->hwaddr[9] = priv->pkey & 0xff;
1118
1119                neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1120                if (likely(neigh))
1121                        goto send_using_neigh;
1122                ipoib_mcast_send(dev, phdr->hwaddr, skb);
1123                return NETDEV_TX_OK;
1124        }
1125
1126        /* unicast, arrange "switch" according to probability */
1127        switch (header->proto) {
1128        case htons(ETH_P_IP):
1129        case htons(ETH_P_IPV6):
1130        case htons(ETH_P_TIPC):
1131                neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1132                if (unlikely(!neigh)) {
1133                        neigh = neigh_add_path(skb, phdr->hwaddr, dev);
1134                        if (likely(!neigh))
1135                                return NETDEV_TX_OK;
1136                }
1137                break;
1138        case htons(ETH_P_ARP):
1139        case htons(ETH_P_RARP):
1140                /* for unicast ARP and RARP should always perform path find */
1141                unicast_arp_send(skb, dev, phdr);
1142                return NETDEV_TX_OK;
1143        default:
1144                /* ethertype not supported by IPoIB */
1145                ++dev->stats.tx_dropped;
1146                dev_kfree_skb_any(skb);
1147                return NETDEV_TX_OK;
1148        }
1149
1150send_using_neigh:
1151        /* note we now hold a ref to neigh */
1152        if (ipoib_cm_get(neigh)) {
1153                if (ipoib_cm_up(neigh)) {
1154                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
1155                        goto unref;
1156                }
1157        } else if (neigh->ah && neigh->ah->valid) {
1158                neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
1159                                                IPOIB_QPN(phdr->hwaddr));
1160                goto unref;
1161        } else if (neigh->ah) {
1162                neigh_refresh_path(neigh, phdr->hwaddr, dev);
1163        }
1164
1165        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1166                push_pseudo_header(skb, phdr->hwaddr);
1167                spin_lock_irqsave(&priv->lock, flags);
1168                __skb_queue_tail(&neigh->queue, skb);
1169                spin_unlock_irqrestore(&priv->lock, flags);
1170        } else {
1171                ++dev->stats.tx_dropped;
1172                dev_kfree_skb_any(skb);
1173        }
1174
1175unref:
1176        ipoib_neigh_put(neigh);
1177
1178        return NETDEV_TX_OK;
1179}
1180
1181static void ipoib_timeout(struct net_device *dev, unsigned int txqueue)
1182{
1183        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1184
1185        ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
1186                   jiffies_to_msecs(jiffies - dev_trans_start(dev)));
1187        ipoib_warn(priv,
1188                   "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n",
1189                   netif_queue_stopped(dev), priv->tx_head, priv->tx_tail,
1190                   priv->global_tx_head, priv->global_tx_tail);
1191
1192        /* XXX reset QP, etc. */
1193}
1194
1195static int ipoib_hard_header(struct sk_buff *skb,
1196                             struct net_device *dev,
1197                             unsigned short type,
1198                             const void *daddr,
1199                             const void *saddr,
1200                             unsigned int len)
1201{
1202        struct ipoib_header *header;
1203
1204        header = skb_push(skb, sizeof(*header));
1205
1206        header->proto = htons(type);
1207        header->reserved = 0;
1208
1209        /*
1210         * we don't rely on dst_entry structure,  always stuff the
1211         * destination address into skb hard header so we can figure out where
1212         * to send the packet later.
1213         */
1214        push_pseudo_header(skb, daddr);
1215
1216        return IPOIB_HARD_LEN;
1217}
1218
1219static void ipoib_set_mcast_list(struct net_device *dev)
1220{
1221        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1222
1223        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
1224                ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
1225                return;
1226        }
1227
1228        queue_work(priv->wq, &priv->restart_task);
1229}
1230
1231static int ipoib_get_iflink(const struct net_device *dev)
1232{
1233        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1234
1235        /* parent interface */
1236        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
1237                return dev->ifindex;
1238
1239        /* child/vlan interface */
1240        return priv->parent->ifindex;
1241}
1242
1243static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
1244{
1245        /*
1246         * Use only the address parts that contributes to spreading
1247         * The subnet prefix is not used as one can not connect to
1248         * same remote port (GUID) using the same remote QPN via two
1249         * different subnets.
1250         */
1251         /* qpn octets[1:4) & port GUID octets[12:20) */
1252        u32 *d32 = (u32 *) daddr;
1253        u32 hv;
1254
1255        hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
1256        return hv & htbl->mask;
1257}
1258
1259struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
1260{
1261        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1262        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1263        struct ipoib_neigh_hash *htbl;
1264        struct ipoib_neigh *neigh = NULL;
1265        u32 hash_val;
1266
1267        rcu_read_lock_bh();
1268
1269        htbl = rcu_dereference_bh(ntbl->htbl);
1270
1271        if (!htbl)
1272                goto out_unlock;
1273
1274        hash_val = ipoib_addr_hash(htbl, daddr);
1275        for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
1276             neigh != NULL;
1277             neigh = rcu_dereference_bh(neigh->hnext)) {
1278                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1279                        /* found, take one ref on behalf of the caller */
1280                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
1281                                /* deleted */
1282                                neigh = NULL;
1283                                goto out_unlock;
1284                        }
1285
1286                        if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
1287                                neigh->alive = jiffies;
1288                        goto out_unlock;
1289                }
1290        }
1291
1292out_unlock:
1293        rcu_read_unlock_bh();
1294        return neigh;
1295}
1296
1297static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
1298{
1299        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1300        struct ipoib_neigh_hash *htbl;
1301        unsigned long neigh_obsolete;
1302        unsigned long dt;
1303        unsigned long flags;
1304        int i;
1305        LIST_HEAD(remove_list);
1306
1307        spin_lock_irqsave(&priv->lock, flags);
1308
1309        htbl = rcu_dereference_protected(ntbl->htbl,
1310                                         lockdep_is_held(&priv->lock));
1311
1312        if (!htbl)
1313                goto out_unlock;
1314
1315        /* neigh is obsolete if it was idle for two GC periods */
1316        dt = 2 * arp_tbl.gc_interval;
1317        neigh_obsolete = jiffies - dt;
1318
1319        for (i = 0; i < htbl->size; i++) {
1320                struct ipoib_neigh *neigh;
1321                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1322
1323                while ((neigh = rcu_dereference_protected(*np,
1324                                                          lockdep_is_held(&priv->lock))) != NULL) {
1325                        /* was the neigh idle for two GC periods */
1326                        if (time_after(neigh_obsolete, neigh->alive)) {
1327
1328                                ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
1329
1330                                rcu_assign_pointer(*np,
1331                                                   rcu_dereference_protected(neigh->hnext,
1332                                                                             lockdep_is_held(&priv->lock)));
1333                                /* remove from path/mc list */
1334                                list_del_init(&neigh->list);
1335                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1336                        } else {
1337                                np = &neigh->hnext;
1338                        }
1339
1340                }
1341        }
1342
1343out_unlock:
1344        spin_unlock_irqrestore(&priv->lock, flags);
1345        ipoib_mcast_remove_list(&remove_list);
1346}
1347
1348static void ipoib_reap_neigh(struct work_struct *work)
1349{
1350        struct ipoib_dev_priv *priv =
1351                container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
1352
1353        __ipoib_reap_neigh(priv);
1354
1355        queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1356                           arp_tbl.gc_interval);
1357}
1358
1359
1360static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
1361                                      struct net_device *dev)
1362{
1363        struct ipoib_neigh *neigh;
1364
1365        neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC);
1366        if (!neigh)
1367                return NULL;
1368
1369        neigh->dev = dev;
1370        memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
1371        skb_queue_head_init(&neigh->queue);
1372        INIT_LIST_HEAD(&neigh->list);
1373        ipoib_cm_set(neigh, NULL);
1374        /* one ref on behalf of the caller */
1375        atomic_set(&neigh->refcnt, 1);
1376
1377        return neigh;
1378}
1379
1380struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
1381                                      struct net_device *dev)
1382{
1383        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1384        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1385        struct ipoib_neigh_hash *htbl;
1386        struct ipoib_neigh *neigh;
1387        u32 hash_val;
1388
1389        htbl = rcu_dereference_protected(ntbl->htbl,
1390                                         lockdep_is_held(&priv->lock));
1391        if (!htbl) {
1392                neigh = NULL;
1393                goto out_unlock;
1394        }
1395
1396        /* need to add a new neigh, but maybe some other thread succeeded?
1397         * recalc hash, maybe hash resize took place so we do a search
1398         */
1399        hash_val = ipoib_addr_hash(htbl, daddr);
1400        for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
1401                                               lockdep_is_held(&priv->lock));
1402             neigh != NULL;
1403             neigh = rcu_dereference_protected(neigh->hnext,
1404                                               lockdep_is_held(&priv->lock))) {
1405                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1406                        /* found, take one ref on behalf of the caller */
1407                        if (!atomic_inc_not_zero(&neigh->refcnt)) {
1408                                /* deleted */
1409                                neigh = NULL;
1410                                break;
1411                        }
1412                        neigh->alive = jiffies;
1413                        goto out_unlock;
1414                }
1415        }
1416
1417        neigh = ipoib_neigh_ctor(daddr, dev);
1418        if (!neigh)
1419                goto out_unlock;
1420
1421        /* one ref on behalf of the hash table */
1422        atomic_inc(&neigh->refcnt);
1423        neigh->alive = jiffies;
1424        /* put in hash */
1425        rcu_assign_pointer(neigh->hnext,
1426                           rcu_dereference_protected(htbl->buckets[hash_val],
1427                                                     lockdep_is_held(&priv->lock)));
1428        rcu_assign_pointer(htbl->buckets[hash_val], neigh);
1429        atomic_inc(&ntbl->entries);
1430
1431out_unlock:
1432
1433        return neigh;
1434}
1435
1436void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1437{
1438        /* neigh reference count was dropprd to zero */
1439        struct net_device *dev = neigh->dev;
1440        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1441        struct sk_buff *skb;
1442        if (neigh->ah)
1443                ipoib_put_ah(neigh->ah);
1444        while ((skb = __skb_dequeue(&neigh->queue))) {
1445                ++dev->stats.tx_dropped;
1446                dev_kfree_skb_any(skb);
1447        }
1448        if (ipoib_cm_get(neigh))
1449                ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1450        ipoib_dbg(ipoib_priv(dev),
1451                  "neigh free for %06x %pI6\n",
1452                  IPOIB_QPN(neigh->daddr),
1453                  neigh->daddr + 4);
1454        kfree(neigh);
1455        if (atomic_dec_and_test(&priv->ntbl.entries)) {
1456                if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
1457                        complete(&priv->ntbl.flushed);
1458        }
1459}
1460
1461static void ipoib_neigh_reclaim(struct rcu_head *rp)
1462{
1463        /* Called as a result of removal from hash table */
1464        struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
1465        /* note TX context may hold another ref */
1466        ipoib_neigh_put(neigh);
1467}
1468
1469void ipoib_neigh_free(struct ipoib_neigh *neigh)
1470{
1471        struct net_device *dev = neigh->dev;
1472        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1473        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1474        struct ipoib_neigh_hash *htbl;
1475        struct ipoib_neigh __rcu **np;
1476        struct ipoib_neigh *n;
1477        u32 hash_val;
1478
1479        htbl = rcu_dereference_protected(ntbl->htbl,
1480                                        lockdep_is_held(&priv->lock));
1481        if (!htbl)
1482                return;
1483
1484        hash_val = ipoib_addr_hash(htbl, neigh->daddr);
1485        np = &htbl->buckets[hash_val];
1486        for (n = rcu_dereference_protected(*np,
1487                                            lockdep_is_held(&priv->lock));
1488             n != NULL;
1489             n = rcu_dereference_protected(*np,
1490                                        lockdep_is_held(&priv->lock))) {
1491                if (n == neigh) {
1492                        /* found */
1493                        rcu_assign_pointer(*np,
1494                                           rcu_dereference_protected(neigh->hnext,
1495                                                                     lockdep_is_held(&priv->lock)));
1496                        /* remove from parent list */
1497                        list_del_init(&neigh->list);
1498                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1499                        return;
1500                } else {
1501                        np = &n->hnext;
1502                }
1503        }
1504}
1505
1506static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1507{
1508        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1509        struct ipoib_neigh_hash *htbl;
1510        struct ipoib_neigh __rcu **buckets;
1511        u32 size;
1512
1513        clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1514        ntbl->htbl = NULL;
1515        htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
1516        if (!htbl)
1517                return -ENOMEM;
1518        size = roundup_pow_of_two(arp_tbl.gc_thresh3);
1519        buckets = kvcalloc(size, sizeof(*buckets), GFP_KERNEL);
1520        if (!buckets) {
1521                kfree(htbl);
1522                return -ENOMEM;
1523        }
1524        htbl->size = size;
1525        htbl->mask = (size - 1);
1526        htbl->buckets = buckets;
1527        RCU_INIT_POINTER(ntbl->htbl, htbl);
1528        htbl->ntbl = ntbl;
1529        atomic_set(&ntbl->entries, 0);
1530
1531        /* start garbage collection */
1532        queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1533                           arp_tbl.gc_interval);
1534
1535        return 0;
1536}
1537
1538static void neigh_hash_free_rcu(struct rcu_head *head)
1539{
1540        struct ipoib_neigh_hash *htbl = container_of(head,
1541                                                    struct ipoib_neigh_hash,
1542                                                    rcu);
1543        struct ipoib_neigh __rcu **buckets = htbl->buckets;
1544        struct ipoib_neigh_table *ntbl = htbl->ntbl;
1545
1546        kvfree(buckets);
1547        kfree(htbl);
1548        complete(&ntbl->deleted);
1549}
1550
1551void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
1552{
1553        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1554        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1555        struct ipoib_neigh_hash *htbl;
1556        unsigned long flags;
1557        int i;
1558
1559        /* remove all neigh connected to a given path or mcast */
1560        spin_lock_irqsave(&priv->lock, flags);
1561
1562        htbl = rcu_dereference_protected(ntbl->htbl,
1563                                         lockdep_is_held(&priv->lock));
1564
1565        if (!htbl)
1566                goto out_unlock;
1567
1568        for (i = 0; i < htbl->size; i++) {
1569                struct ipoib_neigh *neigh;
1570                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1571
1572                while ((neigh = rcu_dereference_protected(*np,
1573                                                          lockdep_is_held(&priv->lock))) != NULL) {
1574                        /* delete neighs belong to this parent */
1575                        if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
1576                                rcu_assign_pointer(*np,
1577                                                   rcu_dereference_protected(neigh->hnext,
1578                                                                             lockdep_is_held(&priv->lock)));
1579                                /* remove from parent list */
1580                                list_del_init(&neigh->list);
1581                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1582                        } else {
1583                                np = &neigh->hnext;
1584                        }
1585
1586                }
1587        }
1588out_unlock:
1589        spin_unlock_irqrestore(&priv->lock, flags);
1590}
1591
1592static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
1593{
1594        struct ipoib_neigh_table *ntbl = &priv->ntbl;
1595        struct ipoib_neigh_hash *htbl;
1596        unsigned long flags;
1597        int i, wait_flushed = 0;
1598
1599        init_completion(&priv->ntbl.flushed);
1600        set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1601
1602        spin_lock_irqsave(&priv->lock, flags);
1603
1604        htbl = rcu_dereference_protected(ntbl->htbl,
1605                                        lockdep_is_held(&priv->lock));
1606        if (!htbl)
1607                goto out_unlock;
1608
1609        wait_flushed = atomic_read(&priv->ntbl.entries);
1610        if (!wait_flushed)
1611                goto free_htbl;
1612
1613        for (i = 0; i < htbl->size; i++) {
1614                struct ipoib_neigh *neigh;
1615                struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1616
1617                while ((neigh = rcu_dereference_protected(*np,
1618                                       lockdep_is_held(&priv->lock))) != NULL) {
1619                        rcu_assign_pointer(*np,
1620                                           rcu_dereference_protected(neigh->hnext,
1621                                                                     lockdep_is_held(&priv->lock)));
1622                        /* remove from path/mc list */
1623                        list_del_init(&neigh->list);
1624                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1625                }
1626        }
1627
1628free_htbl:
1629        rcu_assign_pointer(ntbl->htbl, NULL);
1630        call_rcu(&htbl->rcu, neigh_hash_free_rcu);
1631
1632out_unlock:
1633        spin_unlock_irqrestore(&priv->lock, flags);
1634        if (wait_flushed)
1635                wait_for_completion(&priv->ntbl.flushed);
1636}
1637
1638static void ipoib_neigh_hash_uninit(struct net_device *dev)
1639{
1640        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1641
1642        ipoib_dbg(priv, "%s\n", __func__);
1643        init_completion(&priv->ntbl.deleted);
1644
1645        cancel_delayed_work_sync(&priv->neigh_reap_task);
1646
1647        ipoib_flush_neighs(priv);
1648
1649        wait_for_completion(&priv->ntbl.deleted);
1650}
1651
1652static void ipoib_napi_add(struct net_device *dev)
1653{
1654        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1655
1656        netif_napi_add(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC);
1657        netif_napi_add(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE);
1658}
1659
1660static void ipoib_napi_del(struct net_device *dev)
1661{
1662        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1663
1664        netif_napi_del(&priv->recv_napi);
1665        netif_napi_del(&priv->send_napi);
1666}
1667
1668static void ipoib_dev_uninit_default(struct net_device *dev)
1669{
1670        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1671
1672        ipoib_transport_dev_cleanup(dev);
1673
1674        ipoib_napi_del(dev);
1675
1676        ipoib_cm_dev_cleanup(dev);
1677
1678        kfree(priv->rx_ring);
1679        vfree(priv->tx_ring);
1680
1681        priv->rx_ring = NULL;
1682        priv->tx_ring = NULL;
1683}
1684
1685static int ipoib_dev_init_default(struct net_device *dev)
1686{
1687        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1688
1689        ipoib_napi_add(dev);
1690
1691        /* Allocate RX/TX "rings" to hold queued skbs */
1692        priv->rx_ring = kcalloc(ipoib_recvq_size,
1693                                       sizeof(*priv->rx_ring),
1694                                       GFP_KERNEL);
1695        if (!priv->rx_ring)
1696                goto out;
1697
1698        priv->tx_ring = vzalloc(array_size(ipoib_sendq_size,
1699                                           sizeof(*priv->tx_ring)));
1700        if (!priv->tx_ring) {
1701                pr_warn("%s: failed to allocate TX ring (%d entries)\n",
1702                        priv->ca->name, ipoib_sendq_size);
1703                goto out_rx_ring_cleanup;
1704        }
1705
1706        /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */
1707
1708        if (ipoib_transport_dev_init(dev, priv->ca)) {
1709                pr_warn("%s: ipoib_transport_dev_init failed\n",
1710                        priv->ca->name);
1711                goto out_tx_ring_cleanup;
1712        }
1713
1714        /* after qp created set dev address */
1715        priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
1716        priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
1717        priv->dev->dev_addr[3] = (priv->qp->qp_num) & 0xff;
1718
1719        return 0;
1720
1721out_tx_ring_cleanup:
1722        vfree(priv->tx_ring);
1723
1724out_rx_ring_cleanup:
1725        kfree(priv->rx_ring);
1726
1727out:
1728        ipoib_napi_del(dev);
1729        return -ENOMEM;
1730}
1731
1732static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
1733                       int cmd)
1734{
1735        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1736
1737        if (!priv->rn_ops->ndo_do_ioctl)
1738                return -EOPNOTSUPP;
1739
1740        return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd);
1741}
1742
1743static int ipoib_dev_init(struct net_device *dev)
1744{
1745        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1746        int ret = -ENOMEM;
1747
1748        priv->qp = NULL;
1749
1750        /*
1751         * the various IPoIB tasks assume they will never race against
1752         * themselves, so always use a single thread workqueue
1753         */
1754        priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM);
1755        if (!priv->wq) {
1756                pr_warn("%s: failed to allocate device WQ\n", dev->name);
1757                goto out;
1758        }
1759
1760        /* create pd, which used both for control and datapath*/
1761        priv->pd = ib_alloc_pd(priv->ca, 0);
1762        if (IS_ERR(priv->pd)) {
1763                pr_warn("%s: failed to allocate PD\n", priv->ca->name);
1764                goto clean_wq;
1765        }
1766
1767        ret = priv->rn_ops->ndo_init(dev);
1768        if (ret) {
1769                pr_warn("%s failed to init HW resource\n", dev->name);
1770                goto out_free_pd;
1771        }
1772
1773        ret = ipoib_neigh_hash_init(priv);
1774        if (ret) {
1775                pr_warn("%s failed to init neigh hash\n", dev->name);
1776                goto out_dev_uninit;
1777        }
1778
1779        if (dev->flags & IFF_UP) {
1780                if (ipoib_ib_dev_open(dev)) {
1781                        pr_warn("%s failed to open device\n", dev->name);
1782                        ret = -ENODEV;
1783                        goto out_hash_uninit;
1784                }
1785        }
1786
1787        return 0;
1788
1789out_hash_uninit:
1790        ipoib_neigh_hash_uninit(dev);
1791
1792out_dev_uninit:
1793        ipoib_ib_dev_cleanup(dev);
1794
1795out_free_pd:
1796        if (priv->pd) {
1797                ib_dealloc_pd(priv->pd);
1798                priv->pd = NULL;
1799        }
1800
1801clean_wq:
1802        if (priv->wq) {
1803                destroy_workqueue(priv->wq);
1804                priv->wq = NULL;
1805        }
1806
1807out:
1808        return ret;
1809}
1810
1811/*
1812 * This must be called before doing an unregister_netdev on a parent device to
1813 * shutdown the IB event handler.
1814 */
1815static void ipoib_parent_unregister_pre(struct net_device *ndev)
1816{
1817        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
1818
1819        /*
1820         * ipoib_set_mac checks netif_running before pushing work, clearing
1821         * running ensures the it will not add more work.
1822         */
1823        rtnl_lock();
1824        dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
1825        rtnl_unlock();
1826
1827        /* ipoib_event() cannot be running once this returns */
1828        ib_unregister_event_handler(&priv->event_handler);
1829
1830        /*
1831         * Work on the queue grabs the rtnl lock, so this cannot be done while
1832         * also holding it.
1833         */
1834        flush_workqueue(ipoib_workqueue);
1835}
1836
1837static void ipoib_set_dev_features(struct ipoib_dev_priv *priv)
1838{
1839        priv->hca_caps = priv->ca->attrs.device_cap_flags;
1840
1841        if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
1842                priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
1843
1844                if (priv->hca_caps & IB_DEVICE_UD_TSO)
1845                        priv->dev->hw_features |= NETIF_F_TSO;
1846
1847                priv->dev->features |= priv->dev->hw_features;
1848        }
1849}
1850
1851static int ipoib_parent_init(struct net_device *ndev)
1852{
1853        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
1854        struct ib_port_attr attr;
1855        int result;
1856
1857        result = ib_query_port(priv->ca, priv->port, &attr);
1858        if (result) {
1859                pr_warn("%s: ib_query_port %d failed\n", priv->ca->name,
1860                        priv->port);
1861                return result;
1862        }
1863        priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr);
1864
1865        result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
1866        if (result) {
1867                pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n",
1868                        priv->ca->name, priv->port, result);
1869                return result;
1870        }
1871
1872        result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid);
1873        if (result) {
1874                pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n",
1875                        priv->ca->name, priv->port, result);
1876                return result;
1877        }
1878        memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw,
1879               sizeof(union ib_gid));
1880
1881        SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
1882        priv->dev->dev_port = priv->port - 1;
1883        /* Let's set this one too for backwards compatibility. */
1884        priv->dev->dev_id = priv->port - 1;
1885
1886        return 0;
1887}
1888
1889static void ipoib_child_init(struct net_device *ndev)
1890{
1891        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
1892        struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
1893
1894        priv->max_ib_mtu = ppriv->max_ib_mtu;
1895        set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
1896        if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN))
1897                memcpy(&priv->local_gid, priv->dev->dev_addr + 4,
1898                       sizeof(priv->local_gid));
1899        else {
1900                memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr,
1901                       INFINIBAND_ALEN);
1902                memcpy(&priv->local_gid, &ppriv->local_gid,
1903                       sizeof(priv->local_gid));
1904        }
1905}
1906
1907static int ipoib_ndo_init(struct net_device *ndev)
1908{
1909        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
1910        int rc;
1911        struct rdma_netdev *rn = netdev_priv(ndev);
1912
1913        if (priv->parent) {
1914                ipoib_child_init(ndev);
1915        } else {
1916                rc = ipoib_parent_init(ndev);
1917                if (rc)
1918                        return rc;
1919        }
1920
1921        /* MTU will be reset when mcast join happens */
1922        ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
1923        priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
1924        rn->mtu = priv->mcast_mtu;
1925        ndev->max_mtu = IPOIB_CM_MTU;
1926
1927        ndev->neigh_priv_len = sizeof(struct ipoib_neigh);
1928
1929        /*
1930         * Set the full membership bit, so that we join the right
1931         * broadcast group, etc.
1932         */
1933        priv->pkey |= 0x8000;
1934
1935        ndev->broadcast[8] = priv->pkey >> 8;
1936        ndev->broadcast[9] = priv->pkey & 0xff;
1937        set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
1938
1939        ipoib_set_dev_features(priv);
1940
1941        rc = ipoib_dev_init(ndev);
1942        if (rc) {
1943                pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n",
1944                        priv->ca->name, priv->dev->name, priv->port, rc);
1945                return rc;
1946        }
1947
1948        if (priv->parent) {
1949                struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
1950
1951                dev_hold(priv->parent);
1952
1953                down_write(&ppriv->vlan_rwsem);
1954                list_add_tail(&priv->list, &ppriv->child_intfs);
1955                up_write(&ppriv->vlan_rwsem);
1956        }
1957
1958        return 0;
1959}
1960
1961static void ipoib_ndo_uninit(struct net_device *dev)
1962{
1963        struct ipoib_dev_priv *priv = ipoib_priv(dev);
1964
1965        ASSERT_RTNL();
1966
1967        /*
1968         * ipoib_remove_one guarantees the children are removed before the
1969         * parent, and that is the only place where a parent can be removed.
1970         */
1971        WARN_ON(!list_empty(&priv->child_intfs));
1972
1973        if (priv->parent) {
1974                struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
1975
1976                down_write(&ppriv->vlan_rwsem);
1977                list_del(&priv->list);
1978                up_write(&ppriv->vlan_rwsem);
1979        }
1980
1981        ipoib_neigh_hash_uninit(dev);
1982
1983        ipoib_ib_dev_cleanup(dev);
1984
1985        /* no more works over the priv->wq */
1986        if (priv->wq) {
1987                /* See ipoib_mcast_carrier_on_task() */
1988                WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags));
1989                flush_workqueue(priv->wq);
1990                destroy_workqueue(priv->wq);
1991                priv->wq = NULL;
1992        }
1993
1994        if (priv->parent)
1995                dev_put(priv->parent);
1996}
1997
1998static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
1999{
2000        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2001
2002        return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
2003}
2004
2005static int ipoib_get_vf_config(struct net_device *dev, int vf,
2006                               struct ifla_vf_info *ivf)
2007{
2008        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2009        int err;
2010
2011        err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
2012        if (err)
2013                return err;
2014
2015        ivf->vf = vf;
2016        memcpy(ivf->mac, dev->dev_addr, dev->addr_len);
2017
2018        return 0;
2019}
2020
2021static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
2022{
2023        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2024
2025        if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID)
2026                return -EINVAL;
2027
2028        return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
2029}
2030
2031static int ipoib_get_vf_guid(struct net_device *dev, int vf,
2032                             struct ifla_vf_guid *node_guid,
2033                             struct ifla_vf_guid *port_guid)
2034{
2035        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2036
2037        return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid);
2038}
2039
2040static int ipoib_get_vf_stats(struct net_device *dev, int vf,
2041                              struct ifla_vf_stats *vf_stats)
2042{
2043        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2044
2045        return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
2046}
2047
2048static const struct header_ops ipoib_header_ops = {
2049        .create = ipoib_hard_header,
2050};
2051
2052static const struct net_device_ops ipoib_netdev_ops_pf = {
2053        .ndo_init                = ipoib_ndo_init,
2054        .ndo_uninit              = ipoib_ndo_uninit,
2055        .ndo_open                = ipoib_open,
2056        .ndo_stop                = ipoib_stop,
2057        .ndo_change_mtu          = ipoib_change_mtu,
2058        .ndo_fix_features        = ipoib_fix_features,
2059        .ndo_start_xmit          = ipoib_start_xmit,
2060        .ndo_tx_timeout          = ipoib_timeout,
2061        .ndo_set_rx_mode         = ipoib_set_mcast_list,
2062        .ndo_get_iflink          = ipoib_get_iflink,
2063        .ndo_set_vf_link_state   = ipoib_set_vf_link_state,
2064        .ndo_get_vf_config       = ipoib_get_vf_config,
2065        .ndo_get_vf_stats        = ipoib_get_vf_stats,
2066        .ndo_get_vf_guid         = ipoib_get_vf_guid,
2067        .ndo_set_vf_guid         = ipoib_set_vf_guid,
2068        .ndo_set_mac_address     = ipoib_set_mac,
2069        .ndo_get_stats64         = ipoib_get_stats,
2070        .ndo_do_ioctl            = ipoib_ioctl,
2071};
2072
2073static const struct net_device_ops ipoib_netdev_ops_vf = {
2074        .ndo_init                = ipoib_ndo_init,
2075        .ndo_uninit              = ipoib_ndo_uninit,
2076        .ndo_open                = ipoib_open,
2077        .ndo_stop                = ipoib_stop,
2078        .ndo_change_mtu          = ipoib_change_mtu,
2079        .ndo_fix_features        = ipoib_fix_features,
2080        .ndo_start_xmit          = ipoib_start_xmit,
2081        .ndo_tx_timeout          = ipoib_timeout,
2082        .ndo_set_rx_mode         = ipoib_set_mcast_list,
2083        .ndo_get_iflink          = ipoib_get_iflink,
2084        .ndo_get_stats64         = ipoib_get_stats,
2085        .ndo_do_ioctl            = ipoib_ioctl,
2086};
2087
2088static const struct net_device_ops ipoib_netdev_default_pf = {
2089        .ndo_init                = ipoib_dev_init_default,
2090        .ndo_uninit              = ipoib_dev_uninit_default,
2091        .ndo_open                = ipoib_ib_dev_open_default,
2092        .ndo_stop                = ipoib_ib_dev_stop_default,
2093};
2094
2095void ipoib_setup_common(struct net_device *dev)
2096{
2097        dev->header_ops          = &ipoib_header_ops;
2098        dev->netdev_ops          = &ipoib_netdev_default_pf;
2099
2100        ipoib_set_ethtool_ops(dev);
2101
2102        dev->watchdog_timeo      = HZ;
2103
2104        dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
2105
2106        dev->hard_header_len     = IPOIB_HARD_LEN;
2107        dev->addr_len            = INFINIBAND_ALEN;
2108        dev->type                = ARPHRD_INFINIBAND;
2109        dev->tx_queue_len        = ipoib_sendq_size * 2;
2110        dev->features            = (NETIF_F_VLAN_CHALLENGED     |
2111                                    NETIF_F_HIGHDMA);
2112        netif_keep_dst(dev);
2113
2114        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
2115
2116        /*
2117         * unregister_netdev always frees the netdev, we use this mode
2118         * consistently to unify all the various unregister paths, including
2119         * those connected to rtnl_link_ops which require it.
2120         */
2121        dev->needs_free_netdev = true;
2122}
2123
2124static void ipoib_build_priv(struct net_device *dev)
2125{
2126        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2127
2128        priv->dev = dev;
2129        spin_lock_init(&priv->lock);
2130        init_rwsem(&priv->vlan_rwsem);
2131        mutex_init(&priv->mcast_mutex);
2132
2133        INIT_LIST_HEAD(&priv->path_list);
2134        INIT_LIST_HEAD(&priv->child_intfs);
2135        INIT_LIST_HEAD(&priv->dead_ahs);
2136        INIT_LIST_HEAD(&priv->multicast_list);
2137
2138        INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
2139        INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
2140        INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
2141        INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
2142        INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
2143        INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
2144        INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
2145        INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
2146}
2147
2148static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port,
2149                                             const char *name)
2150{
2151        struct net_device *dev;
2152
2153        dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
2154                                NET_NAME_UNKNOWN, ipoib_setup_common);
2155        if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP)
2156                return dev;
2157
2158        dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN,
2159                           ipoib_setup_common);
2160        if (!dev)
2161                return ERR_PTR(-ENOMEM);
2162        return dev;
2163}
2164
2165int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name,
2166                    struct net_device *dev)
2167{
2168        struct rdma_netdev *rn = netdev_priv(dev);
2169        struct ipoib_dev_priv *priv;
2170        int rc;
2171
2172        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2173        if (!priv)
2174                return -ENOMEM;
2175
2176        priv->ca = hca;
2177        priv->port = port;
2178
2179        rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
2180                              NET_NAME_UNKNOWN, ipoib_setup_common, dev);
2181        if (rc) {
2182                if (rc != -EOPNOTSUPP)
2183                        goto out;
2184
2185                rn->send = ipoib_send;
2186                rn->attach_mcast = ipoib_mcast_attach;
2187                rn->detach_mcast = ipoib_mcast_detach;
2188                rn->hca = hca;
2189        }
2190
2191        priv->rn_ops = dev->netdev_ops;
2192
2193        if (hca->attrs.device_cap_flags & IB_DEVICE_VIRTUAL_FUNCTION)
2194                dev->netdev_ops = &ipoib_netdev_ops_vf;
2195        else
2196                dev->netdev_ops = &ipoib_netdev_ops_pf;
2197
2198        rn->clnt_priv = priv;
2199        /*
2200         * Only the child register_netdev flows can handle priv_destructor
2201         * being set, so we force it to NULL here and handle manually until it
2202         * is safe to turn on.
2203         */
2204        priv->next_priv_destructor = dev->priv_destructor;
2205        dev->priv_destructor = NULL;
2206
2207        ipoib_build_priv(dev);
2208
2209        return 0;
2210
2211out:
2212        kfree(priv);
2213        return rc;
2214}
2215
2216struct net_device *ipoib_intf_alloc(struct ib_device *hca, u8 port,
2217                                    const char *name)
2218{
2219        struct net_device *dev;
2220        int rc;
2221
2222        dev = ipoib_alloc_netdev(hca, port, name);
2223        if (IS_ERR(dev))
2224                return dev;
2225
2226        rc = ipoib_intf_init(hca, port, name, dev);
2227        if (rc) {
2228                free_netdev(dev);
2229                return ERR_PTR(rc);
2230        }
2231
2232        /*
2233         * Upon success the caller must ensure ipoib_intf_free is called or
2234         * register_netdevice succeed'd and priv_destructor is set to
2235         * ipoib_intf_free.
2236         */
2237        return dev;
2238}
2239
2240void ipoib_intf_free(struct net_device *dev)
2241{
2242        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2243        struct rdma_netdev *rn = netdev_priv(dev);
2244
2245        dev->priv_destructor = priv->next_priv_destructor;
2246        if (dev->priv_destructor)
2247                dev->priv_destructor(dev);
2248
2249        /*
2250         * There are some error flows around register_netdev failing that may
2251         * attempt to call priv_destructor twice, prevent that from happening.
2252         */
2253        dev->priv_destructor = NULL;
2254
2255        /* unregister/destroy is very complicated. Make bugs more obvious. */
2256        rn->clnt_priv = NULL;
2257
2258        kfree(priv);
2259}
2260
2261static ssize_t show_pkey(struct device *dev,
2262                         struct device_attribute *attr, char *buf)
2263{
2264        struct net_device *ndev = to_net_dev(dev);
2265        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
2266
2267        return sysfs_emit(buf, "0x%04x\n", priv->pkey);
2268}
2269static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
2270
2271static ssize_t show_umcast(struct device *dev,
2272                           struct device_attribute *attr, char *buf)
2273{
2274        struct net_device *ndev = to_net_dev(dev);
2275        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
2276
2277        return sysfs_emit(buf, "%d\n",
2278                          test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
2279}
2280
2281void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
2282{
2283        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
2284
2285        if (umcast_val > 0) {
2286                set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
2287                ipoib_warn(priv, "ignoring multicast groups joined directly "
2288                                "by userspace\n");
2289        } else
2290                clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
2291}
2292
2293static ssize_t set_umcast(struct device *dev,
2294                          struct device_attribute *attr,
2295                          const char *buf, size_t count)
2296{
2297        unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
2298
2299        ipoib_set_umcast(to_net_dev(dev), umcast_val);
2300
2301        return count;
2302}
2303static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
2304
2305int ipoib_add_umcast_attr(struct net_device *dev)
2306{
2307        return device_create_file(&dev->dev, &dev_attr_umcast);
2308}
2309
2310static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
2311{
2312        struct ipoib_dev_priv *child_priv;
2313        struct net_device *netdev = priv->dev;
2314
2315        netif_addr_lock_bh(netdev);
2316
2317        memcpy(&priv->local_gid.global.interface_id,
2318               &gid->global.interface_id,
2319               sizeof(gid->global.interface_id));
2320        memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
2321        clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
2322
2323        netif_addr_unlock_bh(netdev);
2324
2325        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
2326                down_read(&priv->vlan_rwsem);
2327                list_for_each_entry(child_priv, &priv->child_intfs, list)
2328                        set_base_guid(child_priv, gid);
2329                up_read(&priv->vlan_rwsem);
2330        }
2331}
2332
2333static int ipoib_check_lladdr(struct net_device *dev,
2334                              struct sockaddr_storage *ss)
2335{
2336        union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
2337        int ret = 0;
2338
2339        netif_addr_lock_bh(dev);
2340
2341        /* Make sure the QPN, reserved and subnet prefix match the current
2342         * lladdr, it also makes sure the lladdr is unicast.
2343         */
2344        if (memcmp(dev->dev_addr, ss->__data,
2345                   4 + sizeof(gid->global.subnet_prefix)) ||
2346            gid->global.interface_id == 0)
2347                ret = -EINVAL;
2348
2349        netif_addr_unlock_bh(dev);
2350
2351        return ret;
2352}
2353
2354static int ipoib_set_mac(struct net_device *dev, void *addr)
2355{
2356        struct ipoib_dev_priv *priv = ipoib_priv(dev);
2357        struct sockaddr_storage *ss = addr;
2358        int ret;
2359
2360        if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
2361                return -EBUSY;
2362
2363        ret = ipoib_check_lladdr(dev, ss);
2364        if (ret)
2365                return ret;
2366
2367        set_base_guid(priv, (union ib_gid *)(ss->__data + 4));
2368
2369        queue_work(ipoib_workqueue, &priv->flush_light);
2370
2371        return 0;
2372}
2373
2374static ssize_t create_child(struct device *dev,
2375                            struct device_attribute *attr,
2376                            const char *buf, size_t count)
2377{
2378        int pkey;
2379        int ret;
2380
2381        if (sscanf(buf, "%i", &pkey) != 1)
2382                return -EINVAL;
2383
2384        if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
2385                return -EINVAL;
2386
2387        ret = ipoib_vlan_add(to_net_dev(dev), pkey);
2388
2389        return ret ? ret : count;
2390}
2391static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
2392
2393static ssize_t delete_child(struct device *dev,
2394                            struct device_attribute *attr,
2395                            const char *buf, size_t count)
2396{
2397        int pkey;
2398        int ret;
2399
2400        if (sscanf(buf, "%i", &pkey) != 1)
2401                return -EINVAL;
2402
2403        if (pkey < 0 || pkey > 0xffff)
2404                return -EINVAL;
2405
2406        ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
2407
2408        return ret ? ret : count;
2409
2410}
2411static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
2412
2413int ipoib_add_pkey_attr(struct net_device *dev)
2414{
2415        return device_create_file(&dev->dev, &dev_attr_pkey);
2416}
2417
2418/*
2419 * We erroneously exposed the iface's port number in the dev_id
2420 * sysfs field long after dev_port was introduced for that purpose[1],
2421 * and we need to stop everyone from relying on that.
2422 * Let's overload the shower routine for the dev_id file here
2423 * to gently bring the issue up.
2424 *
2425 * [1] https://www.spinics.net/lists/netdev/msg272123.html
2426 */
2427static ssize_t dev_id_show(struct device *dev,
2428                           struct device_attribute *attr, char *buf)
2429{
2430        struct net_device *ndev = to_net_dev(dev);
2431
2432        /*
2433         * ndev->dev_port will be equal to 0 in old kernel prior to commit
2434         * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface
2435         * port numbers") Zero was chosen as special case for user space
2436         * applications to fallback and query dev_id to check if it has
2437         * different value or not.
2438         *
2439         * Don't print warning in such scenario.
2440         *
2441         * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358
2442         */
2443        if (ndev->dev_port && ndev->dev_id == ndev->dev_port)
2444                netdev_info_once(ndev,
2445                        "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
2446                        current->comm);
2447
2448        return sysfs_emit(buf, "%#x\n", ndev->dev_id);
2449}
2450static DEVICE_ATTR_RO(dev_id);
2451
2452static int ipoib_intercept_dev_id_attr(struct net_device *dev)
2453{
2454        device_remove_file(&dev->dev, &dev_attr_dev_id);
2455        return device_create_file(&dev->dev, &dev_attr_dev_id);
2456}
2457
2458static struct net_device *ipoib_add_port(const char *format,
2459                                         struct ib_device *hca, u8 port)
2460{
2461        struct rtnl_link_ops *ops = ipoib_get_link_ops();
2462        struct rdma_netdev_alloc_params params;
2463        struct ipoib_dev_priv *priv;
2464        struct net_device *ndev;
2465        int result;
2466
2467        ndev = ipoib_intf_alloc(hca, port, format);
2468        if (IS_ERR(ndev)) {
2469                pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port,
2470                        PTR_ERR(ndev));
2471                return ndev;
2472        }
2473        priv = ipoib_priv(ndev);
2474
2475        INIT_IB_EVENT_HANDLER(&priv->event_handler,
2476                              priv->ca, ipoib_event);
2477        ib_register_event_handler(&priv->event_handler);
2478
2479        /* call event handler to ensure pkey in sync */
2480        queue_work(ipoib_workqueue, &priv->flush_heavy);
2481
2482        ndev->rtnl_link_ops = ipoib_get_link_ops();
2483
2484        result = register_netdev(ndev);
2485        if (result) {
2486                pr_warn("%s: couldn't register ipoib port %d; error %d\n",
2487                        hca->name, port, result);
2488
2489                ipoib_parent_unregister_pre(ndev);
2490                ipoib_intf_free(ndev);
2491                free_netdev(ndev);
2492
2493                return ERR_PTR(result);
2494        }
2495
2496        if (hca->ops.rdma_netdev_get_params) {
2497                int rc = hca->ops.rdma_netdev_get_params(hca, port,
2498                                                     RDMA_NETDEV_IPOIB,
2499                                                     &params);
2500
2501                if (!rc && ops->priv_size < params.sizeof_priv)
2502                        ops->priv_size = params.sizeof_priv;
2503        }
2504        /*
2505         * We cannot set priv_destructor before register_netdev because we
2506         * need priv to be always valid during the error flow to execute
2507         * ipoib_parent_unregister_pre(). Instead handle it manually and only
2508         * enter priv_destructor mode once we are completely registered.
2509         */
2510        ndev->priv_destructor = ipoib_intf_free;
2511
2512        if (ipoib_intercept_dev_id_attr(ndev))
2513                goto sysfs_failed;
2514        if (ipoib_cm_add_mode_attr(ndev))
2515                goto sysfs_failed;
2516        if (ipoib_add_pkey_attr(ndev))
2517                goto sysfs_failed;
2518        if (ipoib_add_umcast_attr(ndev))
2519                goto sysfs_failed;
2520        if (device_create_file(&ndev->dev, &dev_attr_create_child))
2521                goto sysfs_failed;
2522        if (device_create_file(&ndev->dev, &dev_attr_delete_child))
2523                goto sysfs_failed;
2524
2525        return ndev;
2526
2527sysfs_failed:
2528        ipoib_parent_unregister_pre(ndev);
2529        unregister_netdev(ndev);
2530        return ERR_PTR(-ENOMEM);
2531}
2532
2533static int ipoib_add_one(struct ib_device *device)
2534{
2535        struct list_head *dev_list;
2536        struct net_device *dev;
2537        struct ipoib_dev_priv *priv;
2538        unsigned int p;
2539        int count = 0;
2540
2541        dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
2542        if (!dev_list)
2543                return -ENOMEM;
2544
2545        INIT_LIST_HEAD(dev_list);
2546
2547        rdma_for_each_port (device, p) {
2548                if (!rdma_protocol_ib(device, p))
2549                        continue;
2550                dev = ipoib_add_port("ib%d", device, p);
2551                if (!IS_ERR(dev)) {
2552                        priv = ipoib_priv(dev);
2553                        list_add_tail(&priv->list, dev_list);
2554                        count++;
2555                }
2556        }
2557
2558        if (!count) {
2559                kfree(dev_list);
2560                return -EOPNOTSUPP;
2561        }
2562
2563        ib_set_client_data(device, &ipoib_client, dev_list);
2564        return 0;
2565}
2566
2567static void ipoib_remove_one(struct ib_device *device, void *client_data)
2568{
2569        struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
2570        struct list_head *dev_list = client_data;
2571
2572        list_for_each_entry_safe(priv, tmp, dev_list, list) {
2573                LIST_HEAD(head);
2574                ipoib_parent_unregister_pre(priv->dev);
2575
2576                rtnl_lock();
2577
2578                list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs,
2579                                         list)
2580                        unregister_netdevice_queue(cpriv->dev, &head);
2581                unregister_netdevice_queue(priv->dev, &head);
2582                unregister_netdevice_many(&head);
2583
2584                rtnl_unlock();
2585        }
2586
2587        kfree(dev_list);
2588}
2589
#if defined(CONFIG_INFINIBAND_IPOIB_DEBUG)
/*
 * Netdevice notifier that forwards events to ipoib_netdev_event().
 * Only built when IPoIB debug support is configured.
 */
static struct notifier_block ipoib_netdev_notifier = {
	.notifier_call	= ipoib_netdev_event,
};
#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
2595
2596static int __init ipoib_init_module(void)
2597{
2598        int ret;
2599
2600        ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
2601        ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
2602        ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
2603
2604        ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
2605        ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
2606        ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
2607#ifdef CONFIG_INFINIBAND_IPOIB_CM
2608        ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
2609        ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0);
2610#endif
2611
2612        /*
2613         * When copying small received packets, we only copy from the
2614         * linear data part of the SKB, so we rely on this condition.
2615         */
2616        BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
2617
2618        ipoib_register_debugfs();
2619
2620        /*
2621         * We create a global workqueue here that is used for all flush
2622         * operations.  However, if you attempt to flush a workqueue
2623         * from a task on that same workqueue, it deadlocks the system.
2624         * We want to be able to flush the tasks associated with a
2625         * specific net device, so we also create a workqueue for each
2626         * netdevice.  We queue up the tasks for that device only on
2627         * its private workqueue, and we only queue up flush events
2628         * on our global flush workqueue.  This avoids the deadlocks.
2629         */
2630        ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0);
2631        if (!ipoib_workqueue) {
2632                ret = -ENOMEM;
2633                goto err_fs;
2634        }
2635
2636        ib_sa_register_client(&ipoib_sa_client);
2637
2638        ret = ib_register_client(&ipoib_client);
2639        if (ret)
2640                goto err_sa;
2641
2642        ret = ipoib_netlink_init();
2643        if (ret)
2644                goto err_client;
2645
2646#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
2647        register_netdevice_notifier(&ipoib_netdev_notifier);
2648#endif
2649        return 0;
2650
2651err_client:
2652        ib_unregister_client(&ipoib_client);
2653
2654err_sa:
2655        ib_sa_unregister_client(&ipoib_sa_client);
2656        destroy_workqueue(ipoib_workqueue);
2657
2658err_fs:
2659        ipoib_unregister_debugfs();
2660
2661        return ret;
2662}
2663
2664static void __exit ipoib_cleanup_module(void)
2665{
2666#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
2667        unregister_netdevice_notifier(&ipoib_netdev_notifier);
2668#endif
2669        ipoib_netlink_fini();
2670        ib_unregister_client(&ipoib_client);
2671        ib_sa_unregister_client(&ipoib_sa_client);
2672        ipoib_unregister_debugfs();
2673        destroy_workqueue(ipoib_workqueue);
2674}
2675
/* Hook the init and cleanup routines up as the module's entry and exit. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);
2678