linux/drivers/infiniband/core/device.c
   1/*
   2 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
   3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34#include <linux/module.h>
  35#include <linux/string.h>
  36#include <linux/errno.h>
  37#include <linux/kernel.h>
  38#include <linux/slab.h>
  39#include <linux/init.h>
  40#include <linux/netdevice.h>
  41#include <net/net_namespace.h>
  42#include <linux/security.h>
  43#include <linux/notifier.h>
  44#include <linux/hashtable.h>
  45#include <rdma/rdma_netlink.h>
  46#include <rdma/ib_addr.h>
  47#include <rdma/ib_cache.h>
  48#include <rdma/rdma_counter.h>
  49
  50#include "core_priv.h"
  51#include "restrack.h"
  52
  53MODULE_AUTHOR("Roland Dreier");
  54MODULE_DESCRIPTION("core kernel InfiniBand API");
  55MODULE_LICENSE("Dual BSD/GPL");
  56
  57struct workqueue_struct *ib_comp_wq;
  58struct workqueue_struct *ib_comp_unbound_wq;
  59struct workqueue_struct *ib_wq;
  60EXPORT_SYMBOL_GPL(ib_wq);
  61
  62/*
  63 * Each of the three rwsem locks (devices, clients, client_data) protects the
  64 * xarray of the same name. Specifically it allows the caller to assert that
  65 * the MARK will/will not be changing under the lock, and for devices and
  66 * clients, that the value in the xarray is still a valid pointer. Change of
  67 * the MARK is linked to the object state, so holding the lock and testing the
  68 * MARK also asserts that the contained object is in a certain state.
  69 *
  70 * This is used to build a two stage register/unregister flow where objects
  71 * can continue to be in the xarray even though they are still in progress to
  72 * register/unregister.
  73 *
  74 * The xarray itself provides additional locking, and restartable iteration,
  75 * which is also relied on.
  76 *
  77 * Locks should not be nested, with the exception of client_data, which is
  78 * allowed to nest under the read side of the other two locks.
  79 *
  80 * The devices_rwsem also protects the device name list; any change or
  81 * assignment of device name must also hold the write side to guarantee unique
  82 * names.
  83 */
  84
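    /*
     * Illustrative sketch of the permitted nesting (client_data_rwsem may
     * only nest under the read side of the other two rwsems); this mirrors
     * what ib_device_rename() below actually does:
     *
     *      down_read(&devices_rwsem);
     *      down_read(&ibdev->client_data_rwsem);
     *      ... inspect marks / client data ...
     *      up_read(&ibdev->client_data_rwsem);
     *      up_read(&devices_rwsem);
     *
     * devices_rwsem and clients_rwsem are never nested within each other.
     */
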
  85/*
  86 * devices contains devices that have had their names assigned. The
  87 * devices may not be registered. Users that care about the registration
  88 * status need to call ib_device_try_get() on the device to ensure it is
  89 * registered, and keep it registered, for the required duration.
  90 *
  91 */
  92static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
  93static DECLARE_RWSEM(devices_rwsem);
  94#define DEVICE_REGISTERED XA_MARK_1
  95
  96static u32 highest_client_id;
  97#define CLIENT_REGISTERED XA_MARK_1
  98static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
  99static DECLARE_RWSEM(clients_rwsem);
 100
 101static void ib_client_put(struct ib_client *client)
 102{
 103        if (refcount_dec_and_test(&client->uses))
 104                complete(&client->uses_zero);
 105}
 106
 107/*
 108 * If client_data is registered then the corresponding client must also still
 109 * be registered.
 110 */
 111#define CLIENT_DATA_REGISTERED XA_MARK_1
 112
 113unsigned int rdma_dev_net_id;
 114
 115/*
 116 * A list of net namespaces is maintained in an xarray. This is necessary
 117 * because we can't get the locking right using the existing net ns list. We
 118 * would require an init_net callback after the list is updated.
 119 */
 120static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
 121/*
 122 * rwsem to protect accessing the rdma_nets xarray entries.
 123 */
 124static DECLARE_RWSEM(rdma_nets_rwsem);
 125
 126bool ib_devices_shared_netns = true;
 127module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
 128MODULE_PARM_DESC(netns_mode,
 129                 "Share device among net namespaces; default=1 (shared)");
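
    /*
     * Illustrative note: the mode is normally chosen at module load time,
     * e.g. "modprobe ib_core netns_mode=0". The 0444 permissions above make
     * the parameter read-only via sysfs; runtime changes are done through
     * the RDMA netlink interface, which ends up in rdma_compatdev_set()
     * below.
     */
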
 130/**
 131 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 132 *                           from a specified net namespace or not.
 133 * @dev:        Pointer to the rdma device which needs to be checked
 134 * @net:        Pointer to the net namespace for which access is to be checked
 135 *
 136 * When the rdma device is in shared mode, the net namespace is ignored and
 137 * access is granted from any namespace.
 138 *
 139 * When the rdma device is exclusive to a net namespace, the device's net
 140 * namespace is checked against the specified one, and access is granted
 141 * only when the two namespaces are the same.
 142 */
 143bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
 144{
 145        return (ib_devices_shared_netns ||
 146                net_eq(read_pnet(&dev->coredev.rdma_net), net));
 147}
 148EXPORT_SYMBOL(rdma_dev_access_netns);
 149
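    /*
     * Illustrative use of the helper above; the same check is performed by
     * ib_device_get_by_index() further down:
     *
     *      if (!rdma_dev_access_netns(device, net))
     *              return -ENODEV;     /- hypothetical error choice -/
     */
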
 150/*
 151 * xarray has this behavior where it won't iterate over NULL values stored in
 152 * allocated arrays.  So we need our own iterator to see all values stored in
 153 * the array. This does the same thing as xa_for_each except that it also
 154 * returns NULL-valued entries when the xarray is an allocating one. It is
 155 * simplified to only work on simple xarrays.
 156 */
 157static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
 158                             xa_mark_t filter)
 159{
 160        XA_STATE(xas, xa, *indexp);
 161        void *entry;
 162
 163        rcu_read_lock();
 164        do {
 165                entry = xas_find_marked(&xas, ULONG_MAX, filter);
 166                if (xa_is_zero(entry))
 167                        break;
 168        } while (xas_retry(&xas, entry));
 169        rcu_read_unlock();
 170
 171        if (entry) {
 172                *indexp = xas.xa_index;
 173                if (xa_is_zero(entry))
 174                        return NULL;
 175                return entry;
 176        }
 177        return XA_ERROR(-ENOENT);
 178}
 179#define xan_for_each_marked(xa, index, entry, filter)                          \
 180        for (index = 0, entry = xan_find_marked(xa, &(index), filter);         \
 181             !xa_is_err(entry);                                                \
 182             (index)++, entry = xan_find_marked(xa, &(index), filter))
 183
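    /*
     * Illustrative use of the iterator above; this is the pattern used by
     * ib_device_rename() later in this file:
     *
     *      unsigned long index;
     *      void *entry;
     *
     *      xan_for_each_marked(&ibdev->client_data, index, entry,
     *                          CLIENT_DATA_REGISTERED) {
     *              ... entry may be NULL for reserved slots ...
     *      }
     */
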
 184/* RCU hash table mapping netdevice pointers to struct ib_port_data */
 185static DEFINE_SPINLOCK(ndev_hash_lock);
 186static DECLARE_HASHTABLE(ndev_hash, 5);
 187
 188static void free_netdevs(struct ib_device *ib_dev);
 189static void ib_unregister_work(struct work_struct *work);
 190static void __ib_unregister_device(struct ib_device *device);
 191static int ib_security_change(struct notifier_block *nb, unsigned long event,
 192                              void *lsm_data);
 193static void ib_policy_change_task(struct work_struct *work);
 194static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
 195
 196static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
 197                           struct va_format *vaf)
 198{
 199        if (ibdev && ibdev->dev.parent)
 200                dev_printk_emit(level[1] - '0',
 201                                ibdev->dev.parent,
 202                                "%s %s %s: %pV",
 203                                dev_driver_string(ibdev->dev.parent),
 204                                dev_name(ibdev->dev.parent),
 205                                dev_name(&ibdev->dev),
 206                                vaf);
 207        else if (ibdev)
 208                printk("%s%s: %pV",
 209                       level, dev_name(&ibdev->dev), vaf);
 210        else
 211                printk("%s(NULL ib_device): %pV", level, vaf);
 212}
 213
 214void ibdev_printk(const char *level, const struct ib_device *ibdev,
 215                  const char *format, ...)
 216{
 217        struct va_format vaf;
 218        va_list args;
 219
 220        va_start(args, format);
 221
 222        vaf.fmt = format;
 223        vaf.va = &args;
 224
 225        __ibdev_printk(level, ibdev, &vaf);
 226
 227        va_end(args);
 228}
 229EXPORT_SYMBOL(ibdev_printk);
 230
 231#define define_ibdev_printk_level(func, level)                  \
 232void func(const struct ib_device *ibdev, const char *fmt, ...)  \
 233{                                                               \
 234        struct va_format vaf;                                   \
 235        va_list args;                                           \
 236                                                                \
 237        va_start(args, fmt);                                    \
 238                                                                \
 239        vaf.fmt = fmt;                                          \
 240        vaf.va = &args;                                         \
 241                                                                \
 242        __ibdev_printk(level, ibdev, &vaf);                     \
 243                                                                \
 244        va_end(args);                                           \
 245}                                                               \
 246EXPORT_SYMBOL(func);
 247
 248define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
 249define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
 250define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
 251define_ibdev_printk_level(ibdev_err, KERN_ERR);
 252define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
 253define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
 254define_ibdev_printk_level(ibdev_info, KERN_INFO);
 255
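    /*
     * Illustrative use of the helpers generated above (the message text is
     * made up):
     *
     *      ibdev_warn(ibdev, "port %u: link state changed\n", port);
     *
     * When a parent device exists the output is prefixed with the parent
     * driver and device names, as implemented in __ibdev_printk().
     */
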
 256static struct notifier_block ibdev_lsm_nb = {
 257        .notifier_call = ib_security_change,
 258};
 259
 260static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
 261                                 struct net *net);
 262
 263/* Pointer to the RCU head at the start of the ib_port_data array */
 264struct ib_port_data_rcu {
 265        struct rcu_head rcu_head;
 266        struct ib_port_data pdata[];
 267};
 268
 269static void ib_device_check_mandatory(struct ib_device *device)
 270{
 271#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
 272        static const struct {
 273                size_t offset;
 274                char  *name;
 275        } mandatory_table[] = {
 276                IB_MANDATORY_FUNC(query_device),
 277                IB_MANDATORY_FUNC(query_port),
 278                IB_MANDATORY_FUNC(query_pkey),
 279                IB_MANDATORY_FUNC(alloc_pd),
 280                IB_MANDATORY_FUNC(dealloc_pd),
 281                IB_MANDATORY_FUNC(create_qp),
 282                IB_MANDATORY_FUNC(modify_qp),
 283                IB_MANDATORY_FUNC(destroy_qp),
 284                IB_MANDATORY_FUNC(post_send),
 285                IB_MANDATORY_FUNC(post_recv),
 286                IB_MANDATORY_FUNC(create_cq),
 287                IB_MANDATORY_FUNC(destroy_cq),
 288                IB_MANDATORY_FUNC(poll_cq),
 289                IB_MANDATORY_FUNC(req_notify_cq),
 290                IB_MANDATORY_FUNC(get_dma_mr),
 291                IB_MANDATORY_FUNC(dereg_mr),
 292                IB_MANDATORY_FUNC(get_port_immutable)
 293        };
 294        int i;
 295
 296        device->kverbs_provider = true;
 297        for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
 298                if (!*(void **) ((void *) &device->ops +
 299                                 mandatory_table[i].offset)) {
 300                        device->kverbs_provider = false;
 301                        break;
 302                }
 303        }
 304}
 305
 306/*
 307 * The caller must perform ib_device_put() to release the device reference
 308 * when ib_device_get_by_index() returns a valid device pointer.
 309 */
 310struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
 311{
 312        struct ib_device *device;
 313
 314        down_read(&devices_rwsem);
 315        device = xa_load(&devices, index);
 316        if (device) {
 317                if (!rdma_dev_access_netns(device, net)) {
 318                        device = NULL;
 319                        goto out;
 320                }
 321
 322                if (!ib_device_try_get(device))
 323                        device = NULL;
 324        }
 325out:
 326        up_read(&devices_rwsem);
 327        return device;
 328}
 329
 330/**
 331 * ib_device_put - Release IB device reference
 332 * @device: device whose reference is to be released
 333 *
 334 * ib_device_put() releases the reference to the IB device to allow it to be
 335 * unregistered and eventually freed.
 336 */
 337void ib_device_put(struct ib_device *device)
 338{
 339        if (refcount_dec_and_test(&device->refcount))
 340                complete(&device->unreg_completion);
 341}
 342EXPORT_SYMBOL(ib_device_put);
 343
 344static struct ib_device *__ib_device_get_by_name(const char *name)
 345{
 346        struct ib_device *device;
 347        unsigned long index;
 348
 349        xa_for_each (&devices, index, device)
 350                if (!strcmp(name, dev_name(&device->dev)))
 351                        return device;
 352
 353        return NULL;
 354}
 355
 356/**
 357 * ib_device_get_by_name - Find an IB device by name
 358 * @name: The name to look for
 359 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 360 *
 361 * Find and hold an ib_device by its name. The caller must call
 362 * ib_device_put() on the returned pointer.
 363 */
 364struct ib_device *ib_device_get_by_name(const char *name,
 365                                        enum rdma_driver_id driver_id)
 366{
 367        struct ib_device *device;
 368
 369        down_read(&devices_rwsem);
 370        device = __ib_device_get_by_name(name);
 371        if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
 372            device->ops.driver_id != driver_id)
 373                device = NULL;
 374
 375        if (device) {
 376                if (!ib_device_try_get(device))
 377                        device = NULL;
 378        }
 379        up_read(&devices_rwsem);
 380        return device;
 381}
 382EXPORT_SYMBOL(ib_device_get_by_name);
 383
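    /*
     * Illustrative lookup/put pairing for the API above (the device name
     * "mlx5_0" is hypothetical):
     *
     *      struct ib_device *dev;
     *
     *      dev = ib_device_get_by_name("mlx5_0", RDMA_DRIVER_UNKNOWN);
     *      if (dev) {
     *              ... use dev ...
     *              ib_device_put(dev);
     *      }
     */
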
 384static int rename_compat_devs(struct ib_device *device)
 385{
 386        struct ib_core_device *cdev;
 387        unsigned long index;
 388        int ret = 0;
 389
 390        mutex_lock(&device->compat_devs_mutex);
 391        xa_for_each (&device->compat_devs, index, cdev) {
 392                ret = device_rename(&cdev->dev, dev_name(&device->dev));
 393                if (ret) {
 394                        dev_warn(&cdev->dev,
 395                                 "Failed to rename compatdev to new name %s\n",
 396                                 dev_name(&device->dev));
 397                        break;
 398                }
 399        }
 400        mutex_unlock(&device->compat_devs_mutex);
 401        return ret;
 402}
 403
 404int ib_device_rename(struct ib_device *ibdev, const char *name)
 405{
 406        unsigned long index;
 407        void *client_data;
 408        int ret;
 409
 410        down_write(&devices_rwsem);
 411        if (!strcmp(name, dev_name(&ibdev->dev))) {
 412                up_write(&devices_rwsem);
 413                return 0;
 414        }
 415
 416        if (__ib_device_get_by_name(name)) {
 417                up_write(&devices_rwsem);
 418                return -EEXIST;
 419        }
 420
 421        ret = device_rename(&ibdev->dev, name);
 422        if (ret) {
 423                up_write(&devices_rwsem);
 424                return ret;
 425        }
 426
 427        strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
 428        ret = rename_compat_devs(ibdev);
 429
 430        downgrade_write(&devices_rwsem);
 431        down_read(&ibdev->client_data_rwsem);
 432        xan_for_each_marked(&ibdev->client_data, index, client_data,
 433                            CLIENT_DATA_REGISTERED) {
 434                struct ib_client *client = xa_load(&clients, index);
 435
 436                if (!client || !client->rename)
 437                        continue;
 438
 439                client->rename(ibdev, client_data);
 440        }
 441        up_read(&ibdev->client_data_rwsem);
 442        up_read(&devices_rwsem);
 443        return 0;
 444}
 445
 446int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
 447{
 448        if (use_dim > 1)
 449                return -EINVAL;
 450        ibdev->use_cq_dim = use_dim;
 451
 452        return 0;
 453}
 454
 455static int alloc_name(struct ib_device *ibdev, const char *name)
 456{
 457        struct ib_device *device;
 458        unsigned long index;
 459        struct ida inuse;
 460        int rc;
 461        int i;
 462
 463        lockdep_assert_held_write(&devices_rwsem);
 464        ida_init(&inuse);
 465        xa_for_each (&devices, index, device) {
 466                char buf[IB_DEVICE_NAME_MAX];
 467
 468                if (sscanf(dev_name(&device->dev), name, &i) != 1)
 469                        continue;
 470                if (i < 0 || i >= INT_MAX)
 471                        continue;
 472                snprintf(buf, sizeof buf, name, i);
 473                if (strcmp(buf, dev_name(&device->dev)) != 0)
 474                        continue;
 475
 476                rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
 477                if (rc < 0)
 478                        goto out;
 479        }
 480
 481        rc = ida_alloc(&inuse, GFP_KERNEL);
 482        if (rc < 0)
 483                goto out;
 484
 485        rc = dev_set_name(&ibdev->dev, name, rc);
 486out:
 487        ida_destroy(&inuse);
 488        return rc;
 489}
 490
 491static void ib_device_release(struct device *device)
 492{
 493        struct ib_device *dev = container_of(device, struct ib_device, dev);
 494
 495        free_netdevs(dev);
 496        WARN_ON(refcount_read(&dev->refcount));
 497        if (dev->port_data) {
 498                ib_cache_release_one(dev);
 499                ib_security_release_port_pkey_list(dev);
 500                rdma_counter_release(dev);
 501                kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
 502                                       pdata[0]),
 503                          rcu_head);
 504        }
 505
 506        mutex_destroy(&dev->unregistration_lock);
 507        mutex_destroy(&dev->compat_devs_mutex);
 508
 509        xa_destroy(&dev->compat_devs);
 510        xa_destroy(&dev->client_data);
 511        kfree_rcu(dev, rcu_head);
 512}
 513
 514static int ib_device_uevent(struct device *device,
 515                            struct kobj_uevent_env *env)
 516{
 517        if (add_uevent_var(env, "NAME=%s", dev_name(device)))
 518                return -ENOMEM;
 519
 520        /*
 521         * It would be nice to pass the node GUID with the event...
 522         */
 523
 524        return 0;
 525}
 526
 527static const void *net_namespace(struct device *d)
 528{
 529        struct ib_core_device *coredev =
 530                        container_of(d, struct ib_core_device, dev);
 531
 532        return read_pnet(&coredev->rdma_net);
 533}
 534
 535static struct class ib_class = {
 536        .name    = "infiniband",
 537        .dev_release = ib_device_release,
 538        .dev_uevent = ib_device_uevent,
 539        .ns_type = &net_ns_type_operations,
 540        .namespace = net_namespace,
 541};
 542
 543static void rdma_init_coredev(struct ib_core_device *coredev,
 544                              struct ib_device *dev, struct net *net)
 545{
 546        /* This BUILD_BUG_ON is intended to catch a layout change in the
 547         * union of ib_core_device and device.
 548         * dev must be the first element because both ib_core and the
 549         * provider drivers rely on it. Adding anything in ib_core_device
 550         * before device will break this assumption.
 551         */
 552        BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
 553                     offsetof(struct ib_device, dev));
 554
 555        coredev->dev.class = &ib_class;
 556        coredev->dev.groups = dev->groups;
 557        device_initialize(&coredev->dev);
 558        coredev->owner = dev;
 559        INIT_LIST_HEAD(&coredev->port_list);
 560        write_pnet(&coredev->rdma_net, net);
 561}
 562
 563/**
 564 * _ib_alloc_device - allocate an IB device struct
 565 * @size: size of the structure to allocate
 566 *
 567 * Low-level drivers should use ib_alloc_device() to allocate &struct
 568 * ib_device.  @size is the size of the structure to be allocated,
 569 * including any private data used by the low-level driver.
 570 * ib_dealloc_device() must be used to free structures allocated with
 571 * ib_alloc_device().
 572 */
 573struct ib_device *_ib_alloc_device(size_t size)
 574{
 575        struct ib_device *device;
 576
 577        if (WARN_ON(size < sizeof(struct ib_device)))
 578                return NULL;
 579
 580        device = kzalloc(size, GFP_KERNEL);
 581        if (!device)
 582                return NULL;
 583
 584        if (rdma_restrack_init(device)) {
 585                kfree(device);
 586                return NULL;
 587        }
 588
 589        device->groups[0] = &ib_dev_attr_group;
 590        rdma_init_coredev(&device->coredev, device, &init_net);
 591
 592        INIT_LIST_HEAD(&device->event_handler_list);
 593        spin_lock_init(&device->event_handler_lock);
 594        mutex_init(&device->unregistration_lock);
 595        /*
 596         * client_data needs to be an allocating xarray because we don't want our
 597         * mark to be destroyed if the user stores NULL in the client data.
 598         */
 599        xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
 600        init_rwsem(&device->client_data_rwsem);
 601        xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
 602        mutex_init(&device->compat_devs_mutex);
 603        init_completion(&device->unreg_completion);
 604        INIT_WORK(&device->unregistration_work, ib_unregister_work);
 605
 606        return device;
 607}
 608EXPORT_SYMBOL(_ib_alloc_device);
 609
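    /*
     * Drivers normally do not call _ib_alloc_device() directly; they use the
     * ib_alloc_device() macro so that their private structure embeds the
     * ib_device. Illustrative sketch (struct my_dev is hypothetical; the
     * macro expects the ib_device member to be placed first):
     *
     *      struct my_dev {
     *              struct ib_device ibdev;
     *              ...
     *      };
     *
     *      struct my_dev *mdev = ib_alloc_device(my_dev, ibdev);
     *
     *      if (!mdev)
     *              return -ENOMEM;
     */
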
 610/**
 611 * ib_dealloc_device - free an IB device struct
 612 * @device: structure to free
 613 *
 614 * Free a structure allocated with ib_alloc_device().
 615 */
 616void ib_dealloc_device(struct ib_device *device)
 617{
 618        if (device->ops.dealloc_driver)
 619                device->ops.dealloc_driver(device);
 620
 621        /*
 622         * ib_unregister_driver() requires all devices to remain in the xarray
 623         * while their ops are callable. The last op we call is dealloc_driver
 624         * above.  This is needed to create a fence on op callbacks prior to
 625         * allowing the driver module to unload.
 626         */
 627        down_write(&devices_rwsem);
 628        if (xa_load(&devices, device->index) == device)
 629                xa_erase(&devices, device->index);
 630        up_write(&devices_rwsem);
 631
 632        /* Expedite releasing netdev references */
 633        free_netdevs(device);
 634
 635        WARN_ON(!xa_empty(&device->compat_devs));
 636        WARN_ON(!xa_empty(&device->client_data));
 637        WARN_ON(refcount_read(&device->refcount));
 638        rdma_restrack_clean(device);
 639        /* Balances with device_initialize */
 640        put_device(&device->dev);
 641}
 642EXPORT_SYMBOL(ib_dealloc_device);
 643
 644/*
 645 * add_client_context() and remove_client_context() must be safe against
 646 * parallel calls on the same device - registration/unregistration of both the
 647 * device and client can be occurring in parallel.
 648 *
 649 * The routines need to act as a fence; callers must not return until the add
 650 * or remove is fully completed.
 651 */
 652static int add_client_context(struct ib_device *device,
 653                              struct ib_client *client)
 654{
 655        int ret = 0;
 656
 657        if (!device->kverbs_provider && !client->no_kverbs_req)
 658                return 0;
 659
 660        down_write(&device->client_data_rwsem);
 661        /*
 662         * So long as the client is registered hold both the client and device
 663         * unregistration locks.
 664         */
 665        if (!refcount_inc_not_zero(&client->uses))
 666                goto out_unlock;
 667        refcount_inc(&device->refcount);
 668
 669        /*
 670         * Another caller to add_client_context got here first and has already
 671         * completely initialized the context.
 672         */
 673        if (xa_get_mark(&device->client_data, client->client_id,
 674                    CLIENT_DATA_REGISTERED))
 675                goto out;
 676
 677        ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
 678                              GFP_KERNEL));
 679        if (ret)
 680                goto out;
 681        downgrade_write(&device->client_data_rwsem);
 682        if (client->add)
 683                client->add(device);
 684
 685        /* Readers shall not see a client until add has been completed */
 686        xa_set_mark(&device->client_data, client->client_id,
 687                    CLIENT_DATA_REGISTERED);
 688        up_read(&device->client_data_rwsem);
 689        return 0;
 690
 691out:
 692        ib_device_put(device);
 693        ib_client_put(client);
 694out_unlock:
 695        up_write(&device->client_data_rwsem);
 696        return ret;
 697}
 698
 699static void remove_client_context(struct ib_device *device,
 700                                  unsigned int client_id)
 701{
 702        struct ib_client *client;
 703        void *client_data;
 704
 705        down_write(&device->client_data_rwsem);
 706        if (!xa_get_mark(&device->client_data, client_id,
 707                         CLIENT_DATA_REGISTERED)) {
 708                up_write(&device->client_data_rwsem);
 709                return;
 710        }
 711        client_data = xa_load(&device->client_data, client_id);
 712        xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
 713        client = xa_load(&clients, client_id);
 714        up_write(&device->client_data_rwsem);
 715
 716        /*
 717         * Notice we cannot be holding any exclusive locks when calling the
 718         * remove callback as the remove callback can recurse back into any
 719         * public functions in this module and thus try for any locks those
 720         * functions take.
 721         *
 722         * For this reason clients and drivers should not call the
 723         * unregistration functions while holding any locks.
 724         */
 725        if (client->remove)
 726                client->remove(device, client_data);
 727
 728        xa_erase(&device->client_data, client_id);
 729        ib_device_put(device);
 730        ib_client_put(client);
 731}
 732
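    /*
     * For reference, an illustrative minimal client as seen from the client
     * side (the names and the per-device state are hypothetical). The add and
     * remove callbacks are driven by add_client_context() and
     * remove_client_context() above, under the fencing rules they describe:
     *
     *      static void my_add_one(struct ib_device *device)
     *      {
     *              ... allocate per-device state, then publish it with
     *              ib_set_client_data(device, &my_client, state) ...
     *      }
     *
     *      static void my_remove_one(struct ib_device *device, void *client_data)
     *      {
     *              ... tear down whatever was stored in client_data ...
     *      }
     *
     *      static struct ib_client my_client = {
     *              .name   = "my_client",
     *              .add    = my_add_one,
     *              .remove = my_remove_one,
     *      };
     *
     * registered once at module init with ib_register_client(&my_client) and
     * torn down with ib_unregister_client(&my_client).
     */
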
 733static int alloc_port_data(struct ib_device *device)
 734{
 735        struct ib_port_data_rcu *pdata_rcu;
 736        unsigned int port;
 737
 738        if (device->port_data)
 739                return 0;
 740
 741        /* This can only be called once the physical port range is defined */
 742        if (WARN_ON(!device->phys_port_cnt))
 743                return -EINVAL;
 744
 745        /*
 746         * device->port_data is indexed directly by the port number to make
 747         * access to this data as efficient as possible.
 748         *
 749         * Therefore port_data is declared as a 1 based array with potential
 750         * empty slots at the beginning.
 751         */
 752        pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
 753                                        rdma_end_port(device) + 1),
 754                            GFP_KERNEL);
 755        if (!pdata_rcu)
 756                return -ENOMEM;
 757        /*
 758         * The rcu_head is put in front of the port data array and the stored
 759         * pointer is adjusted since we never need to see that member until
 760         * kfree_rcu.
 761         */
 762        device->port_data = pdata_rcu->pdata;
 763
 764        rdma_for_each_port (device, port) {
 765                struct ib_port_data *pdata = &device->port_data[port];
 766
 767                pdata->ib_dev = device;
 768                spin_lock_init(&pdata->pkey_list_lock);
 769                INIT_LIST_HEAD(&pdata->pkey_list);
 770                spin_lock_init(&pdata->netdev_lock);
 771                INIT_HLIST_NODE(&pdata->ndev_hash_link);
 772        }
 773        return 0;
 774}
 775
 776static int verify_immutable(const struct ib_device *dev, u8 port)
 777{
 778        return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
 779                            rdma_max_mad_size(dev, port) != 0);
 780}
 781
 782static int setup_port_data(struct ib_device *device)
 783{
 784        unsigned int port;
 785        int ret;
 786
 787        ret = alloc_port_data(device);
 788        if (ret)
 789                return ret;
 790
 791        rdma_for_each_port (device, port) {
 792                struct ib_port_data *pdata = &device->port_data[port];
 793
 794                ret = device->ops.get_port_immutable(device, port,
 795                                                     &pdata->immutable);
 796                if (ret)
 797                        return ret;
 798
 799                if (verify_immutable(device, port))
 800                        return -EINVAL;
 801        }
 802        return 0;
 803}
 804
 805void ib_get_device_fw_str(struct ib_device *dev, char *str)
 806{
 807        if (dev->ops.get_dev_fw_str)
 808                dev->ops.get_dev_fw_str(dev, str);
 809        else
 810                str[0] = '\0';
 811}
 812EXPORT_SYMBOL(ib_get_device_fw_str);
 813
 814static void ib_policy_change_task(struct work_struct *work)
 815{
 816        struct ib_device *dev;
 817        unsigned long index;
 818
 819        down_read(&devices_rwsem);
 820        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
 821                unsigned int i;
 822
 823                rdma_for_each_port (dev, i) {
 824                        u64 sp;
 825                        int ret = ib_get_cached_subnet_prefix(dev,
 826                                                              i,
 827                                                              &sp);
 828
 829                        WARN_ONCE(ret,
 830                                  "ib_get_cached_subnet_prefix err: %d, this should never happen here\n",
 831                                  ret);
 832                        if (!ret)
 833                                ib_security_cache_change(dev, i, sp);
 834                }
 835        }
 836        up_read(&devices_rwsem);
 837}
 838
 839static int ib_security_change(struct notifier_block *nb, unsigned long event,
 840                              void *lsm_data)
 841{
 842        if (event != LSM_POLICY_CHANGE)
 843                return NOTIFY_DONE;
 844
 845        schedule_work(&ib_policy_change_work);
 846        ib_mad_agent_security_change();
 847
 848        return NOTIFY_OK;
 849}
 850
 851static void compatdev_release(struct device *dev)
 852{
 853        struct ib_core_device *cdev =
 854                container_of(dev, struct ib_core_device, dev);
 855
 856        kfree(cdev);
 857}
 858
 859static int add_one_compat_dev(struct ib_device *device,
 860                              struct rdma_dev_net *rnet)
 861{
 862        struct ib_core_device *cdev;
 863        int ret;
 864
 865        lockdep_assert_held(&rdma_nets_rwsem);
 866        if (!ib_devices_shared_netns)
 867                return 0;
 868
 869        /*
 870         * Create and add a compat device in all namespaces other than the one
 871         * it is currently bound to.
 872         */
 873        if (net_eq(read_pnet(&rnet->net),
 874                   read_pnet(&device->coredev.rdma_net)))
 875                return 0;
 876
 877        /*
 878         * The first of rdma_dev_init_net() or ib_register_device() to take the
 879         * compat_devs_mutex wins and gets to add the device. Others will wait
 880         * for completion here.
 881         */
 882        mutex_lock(&device->compat_devs_mutex);
 883        cdev = xa_load(&device->compat_devs, rnet->id);
 884        if (cdev) {
 885                ret = 0;
 886                goto done;
 887        }
 888        ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
 889        if (ret)
 890                goto done;
 891
 892        cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
 893        if (!cdev) {
 894                ret = -ENOMEM;
 895                goto cdev_err;
 896        }
 897
 898        cdev->dev.parent = device->dev.parent;
 899        rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
 900        cdev->dev.release = compatdev_release;
 901        dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
 902
 903        ret = device_add(&cdev->dev);
 904        if (ret)
 905                goto add_err;
 906        ret = ib_setup_port_attrs(cdev);
 907        if (ret)
 908                goto port_err;
 909
 910        ret = xa_err(xa_store(&device->compat_devs, rnet->id,
 911                              cdev, GFP_KERNEL));
 912        if (ret)
 913                goto insert_err;
 914
 915        mutex_unlock(&device->compat_devs_mutex);
 916        return 0;
 917
 918insert_err:
 919        ib_free_port_attrs(cdev);
 920port_err:
 921        device_del(&cdev->dev);
 922add_err:
 923        put_device(&cdev->dev);
 924cdev_err:
 925        xa_release(&device->compat_devs, rnet->id);
 926done:
 927        mutex_unlock(&device->compat_devs_mutex);
 928        return ret;
 929}
 930
 931static void remove_one_compat_dev(struct ib_device *device, u32 id)
 932{
 933        struct ib_core_device *cdev;
 934
 935        mutex_lock(&device->compat_devs_mutex);
 936        cdev = xa_erase(&device->compat_devs, id);
 937        mutex_unlock(&device->compat_devs_mutex);
 938        if (cdev) {
 939                ib_free_port_attrs(cdev);
 940                device_del(&cdev->dev);
 941                put_device(&cdev->dev);
 942        }
 943}
 944
 945static void remove_compat_devs(struct ib_device *device)
 946{
 947        struct ib_core_device *cdev;
 948        unsigned long index;
 949
 950        xa_for_each (&device->compat_devs, index, cdev)
 951                remove_one_compat_dev(device, index);
 952}
 953
 954static int add_compat_devs(struct ib_device *device)
 955{
 956        struct rdma_dev_net *rnet;
 957        unsigned long index;
 958        int ret = 0;
 959
 960        lockdep_assert_held(&devices_rwsem);
 961
 962        down_read(&rdma_nets_rwsem);
 963        xa_for_each (&rdma_nets, index, rnet) {
 964                ret = add_one_compat_dev(device, rnet);
 965                if (ret)
 966                        break;
 967        }
 968        up_read(&rdma_nets_rwsem);
 969        return ret;
 970}
 971
 972static void remove_all_compat_devs(void)
 973{
 974        struct ib_core_device *cdev;
 975        struct ib_device *dev;
 976        unsigned long index;
 977
 978        down_read(&devices_rwsem);
 979        xa_for_each (&devices, index, dev) {
 980                unsigned long c_index = 0;
 981
 982                /* Hold rdma_nets_rwsem so that any other thread modifying this
 983                 * system param can sync with this thread.
 984                 */
 985                down_read(&rdma_nets_rwsem);
 986                xa_for_each (&dev->compat_devs, c_index, cdev)
 987                        remove_one_compat_dev(dev, c_index);
 988                up_read(&rdma_nets_rwsem);
 989        }
 990        up_read(&devices_rwsem);
 991}
 992
 993static int add_all_compat_devs(void)
 994{
 995        struct rdma_dev_net *rnet;
 996        struct ib_device *dev;
 997        unsigned long index;
 998        int ret = 0;
 999
1000        down_read(&devices_rwsem);
1001        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
1002                unsigned long net_index = 0;
1003
1004                /* Hold rdma_nets_rwsem so that any other thread modifying this
1005                 * system param can sync with this thread.
1006                 */
1007                down_read(&rdma_nets_rwsem);
1008                xa_for_each (&rdma_nets, net_index, rnet) {
1009                        ret = add_one_compat_dev(dev, rnet);
1010                        if (ret)
1011                                break;
1012                }
1013                up_read(&rdma_nets_rwsem);
1014        }
1015        up_read(&devices_rwsem);
1016        if (ret)
1017                remove_all_compat_devs();
1018        return ret;
1019}
1020
1021int rdma_compatdev_set(u8 enable)
1022{
1023        struct rdma_dev_net *rnet;
1024        unsigned long index;
1025        int ret = 0;
1026
1027        down_write(&rdma_nets_rwsem);
1028        if (ib_devices_shared_netns == enable) {
1029                up_write(&rdma_nets_rwsem);
1030                return 0;
1031        }
1032
1033        /* Enabling/disabling of compat devices is not supported
1034         * when net namespaces other than the default init_net exist.
1035         */
1036        xa_for_each (&rdma_nets, index, rnet) {
1037                ret++;
1038                break;
1039        }
1040        if (!ret)
1041                ib_devices_shared_netns = enable;
1042        up_write(&rdma_nets_rwsem);
1043        if (ret)
1044                return -EBUSY;
1045
1046        if (enable)
1047                ret = add_all_compat_devs();
1048        else
1049                remove_all_compat_devs();
1050        return ret;
1051}
1052
1053static void rdma_dev_exit_net(struct net *net)
1054{
1055        struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
1056        struct ib_device *dev;
1057        unsigned long index;
1058        int ret;
1059
1060        down_write(&rdma_nets_rwsem);
1061        /*
1062         * Prevent the ID from being re-used and hide the id from xa_for_each.
1063         */
1064        ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
1065        WARN_ON(ret);
1066        up_write(&rdma_nets_rwsem);
1067
1068        down_read(&devices_rwsem);
1069        xa_for_each (&devices, index, dev) {
1070                get_device(&dev->dev);
1071                /*
1072         * Release the devices_rwsem so that the potentially blocking
1073         * device_del() doesn't hold the devices_rwsem for too long.
1074                 */
1075                up_read(&devices_rwsem);
1076
1077                remove_one_compat_dev(dev, rnet->id);
1078
1079                /*
1080                 * If the real device is in the NS then move it back to init.
1081                 */
1082                rdma_dev_change_netns(dev, net, &init_net);
1083
1084                put_device(&dev->dev);
1085                down_read(&devices_rwsem);
1086        }
1087        up_read(&devices_rwsem);
1088
1089        rdma_nl_net_exit(rnet);
1090        xa_erase(&rdma_nets, rnet->id);
1091}
1092
1093static __net_init int rdma_dev_init_net(struct net *net)
1094{
1095        struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
1096        unsigned long index;
1097        struct ib_device *dev;
1098        int ret;
1099
1100        write_pnet(&rnet->net, net);
1101
1102        ret = rdma_nl_net_init(rnet);
1103        if (ret)
1104                return ret;
1105
1106        /* No need to create any compat devices in default init_net. */
1107        if (net_eq(net, &init_net))
1108                return 0;
1109
1110        ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
1111        if (ret) {
1112                rdma_nl_net_exit(rnet);
1113                return ret;
1114        }
1115
1116        down_read(&devices_rwsem);
1117        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
1118                /* Hold rdma_nets_rwsem so that a netlink command cannot change
1119                 * system configuration for device sharing mode.
1120                 */
1121                down_read(&rdma_nets_rwsem);
1122                ret = add_one_compat_dev(dev, rnet);
1123                up_read(&rdma_nets_rwsem);
1124                if (ret)
1125                        break;
1126        }
1127        up_read(&devices_rwsem);
1128
1129        if (ret)
1130                rdma_dev_exit_net(net);
1131
1132        return ret;
1133}
1134
1135/*
1136 * Assign the unique string device name and the unique device index. This is
1137 * undone by ib_dealloc_device.
1138 */
1139static int assign_name(struct ib_device *device, const char *name)
1140{
1141        static u32 last_id;
1142        int ret;
1143
1144        down_write(&devices_rwsem);
1145        /* Assign a unique name to the device */
1146        if (strchr(name, '%'))
1147                ret = alloc_name(device, name);
1148        else
1149                ret = dev_set_name(&device->dev, name);
1150        if (ret)
1151                goto out;
1152
1153        if (__ib_device_get_by_name(dev_name(&device->dev))) {
1154                ret = -ENFILE;
1155                goto out;
1156        }
1157        strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
1158
1159        ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
1160                        &last_id, GFP_KERNEL);
1161        if (ret > 0)
1162                ret = 0;
1163
1164out:
1165        up_write(&devices_rwsem);
1166        return ret;
1167}
1168
1169static void setup_dma_device(struct ib_device *device)
1170{
1171        struct device *parent = device->dev.parent;
1172
1173        WARN_ON_ONCE(device->dma_device);
1174        if (device->dev.dma_ops) {
1175                /*
1176                 * The caller provided custom DMA operations. Copy the
1177                 * DMA-related fields that are used by e.g. dma_alloc_coherent()
1178                 * into device->dev.
1179                 */
1180                device->dma_device = &device->dev;
1181                if (!device->dev.dma_mask) {
1182                        if (parent)
1183                                device->dev.dma_mask = parent->dma_mask;
1184                        else
1185                                WARN_ON_ONCE(true);
1186                }
1187                if (!device->dev.coherent_dma_mask) {
1188                        if (parent)
1189                                device->dev.coherent_dma_mask =
1190                                        parent->coherent_dma_mask;
1191                        else
1192                                WARN_ON_ONCE(true);
1193                }
1194        } else {
1195                /*
1196                 * The caller did not provide custom DMA operations. Use the
1197                 * DMA mapping operations of the parent device.
1198                 */
1199                WARN_ON_ONCE(!parent);
1200                device->dma_device = parent;
1201        }
1202        /* Setup default max segment size for all IB devices */
1203        dma_set_max_seg_size(device->dma_device, SZ_2G);
1204
1205}
1206
1207/*
1208 * setup_device() allocates memory and sets up data that requires calling the
1209 * device ops, this is the only reason these actions are not done during
1210 * ib_alloc_device. It is undone by ib_dealloc_device().
1211 */
1212static int setup_device(struct ib_device *device)
1213{
1214        struct ib_udata uhw = {.outlen = 0, .inlen = 0};
1215        int ret;
1216
1217        setup_dma_device(device);
1218        ib_device_check_mandatory(device);
1219
1220        ret = setup_port_data(device);
1221        if (ret) {
1222                dev_warn(&device->dev, "Couldn't create per-port data\n");
1223                return ret;
1224        }
1225
1226        memset(&device->attrs, 0, sizeof(device->attrs));
1227        ret = device->ops.query_device(device, &device->attrs, &uhw);
1228        if (ret) {
1229                dev_warn(&device->dev,
1230                         "Couldn't query the device attributes\n");
1231                return ret;
1232        }
1233
1234        return 0;
1235}
1236
1237static void disable_device(struct ib_device *device)
1238{
1239        u32 cid;
1240
1241        WARN_ON(!refcount_read(&device->refcount));
1242
1243        down_write(&devices_rwsem);
1244        xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
1245        up_write(&devices_rwsem);
1246
1247        /*
1248         * Remove clients in LIFO order, see assign_client_id. This could be
1249         * more efficient if xarray learns to reverse iterate. Since no new
1250         * clients can be added to this ib_device past this point we only need
1251         * the maximum possible client_id value here.
1252         */
1253        down_read(&clients_rwsem);
1254        cid = highest_client_id;
1255        up_read(&clients_rwsem);
1256        while (cid) {
1257                cid--;
1258                remove_client_context(device, cid);
1259        }
1260
1261        /* Pairs with refcount_set in enable_device */
1262        ib_device_put(device);
1263        wait_for_completion(&device->unreg_completion);
1264
1265        /*
1266         * compat devices must be removed after the device refcount drops to zero.
1267         * Otherwise rdma_dev_init_net() may add more compat devices after the
1268         * compat devices were removed and before the device is disabled.
1269         */
1270        remove_compat_devs(device);
1271}
1272
1273/*
1274 * An enabled device is visible to all clients and to all the public facing
1275 * APIs that return a device pointer. This always returns with a new get, even
1276 * if it fails.
1277 */
1278static int enable_device_and_get(struct ib_device *device)
1279{
1280        struct ib_client *client;
1281        unsigned long index;
1282        int ret = 0;
1283
1284        /*
1285         * One ref belongs to the xa and the other belongs to this
1286         * thread. This is needed to guard against parallel unregistration.
1287         */
1288        refcount_set(&device->refcount, 2);
1289        down_write(&devices_rwsem);
1290        xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
1291
1292        /*
1293         * By using downgrade_write() we ensure that no other thread can clear
1294         * DEVICE_REGISTERED while we are completing the client setup.
1295         */
1296        downgrade_write(&devices_rwsem);
1297
1298        if (device->ops.enable_driver) {
1299                ret = device->ops.enable_driver(device);
1300                if (ret)
1301                        goto out;
1302        }
1303
1304        down_read(&clients_rwsem);
1305        xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
1306                ret = add_client_context(device, client);
1307                if (ret)
1308                        break;
1309        }
1310        up_read(&clients_rwsem);
1311        if (!ret)
1312                ret = add_compat_devs(device);
1313out:
1314        up_read(&devices_rwsem);
1315        return ret;
1316}
1317
1318/**
1319 * ib_register_device - Register an IB device with IB core
1320 * @device: Device to register
     * @name: Unique string device name. This may include a '%' which will
     *        cause a unique index to be added to the passed device name.
1321 *
1322 * Low-level drivers use ib_register_device() to register their
1323 * devices with the IB core.  All registered clients will receive a
1324 * callback for each device that is added. @device must be allocated
1325 * with ib_alloc_device().
1326 *
1327 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
1328 * asynchronously then the device pointer may be freed as soon as this
1329 * function returns.
1330 */
1331int ib_register_device(struct ib_device *device, const char *name)
1332{
1333        int ret;
1334
1335        ret = assign_name(device, name);
1336        if (ret)
1337                return ret;
1338
1339        ret = setup_device(device);
1340        if (ret)
1341                return ret;
1342
1343        ret = ib_cache_setup_one(device);
1344        if (ret) {
1345                dev_warn(&device->dev,
1346                         "Couldn't set up InfiniBand P_Key/GID cache\n");
1347                return ret;
1348        }
1349
1350        ib_device_register_rdmacg(device);
1351
1352        rdma_counter_init(device);
1353
1354        /*
1355         * Ensure that the ADD uevent is not fired because it
1356         * is too early and the device is not initialized yet.
1357         */
1358        dev_set_uevent_suppress(&device->dev, true);
1359        ret = device_add(&device->dev);
1360        if (ret)
1361                goto cg_cleanup;
1362
1363        ret = ib_device_register_sysfs(device);
1364        if (ret) {
1365                dev_warn(&device->dev,
1366                         "Couldn't register device with driver model\n");
1367                goto dev_cleanup;
1368        }
1369
1370        ret = enable_device_and_get(device);
1371        dev_set_uevent_suppress(&device->dev, false);
1372        /* Mark for userspace that device is ready */
1373        kobject_uevent(&device->dev.kobj, KOBJ_ADD);
1374        if (ret) {
1375                void (*dealloc_fn)(struct ib_device *);
1376
1377                /*
1378                 * If we hit this error flow then we don't want to
1379                 * automatically dealloc the device since the caller is
1380                 * expected to call ib_dealloc_device() after
1381                 * ib_register_device() fails. This is tricky due to the
1382                 * possibility for a parallel unregistration along with this
1383                 * error flow. Since we have a refcount here we know any
1384                 * parallel flow is stopped in disable_device and will see the
1385                 * NULL pointers, causing the responsibility to
1386                 * ib_dealloc_device() to revert back to this thread.
1387                 */
1388                dealloc_fn = device->ops.dealloc_driver;
1389                device->ops.dealloc_driver = NULL;
1390                ib_device_put(device);
1391                __ib_unregister_device(device);
1392                device->ops.dealloc_driver = dealloc_fn;
1393                return ret;
1394        }
1395        ib_device_put(device);
1396
1397        return 0;
1398
1399dev_cleanup:
1400        device_del(&device->dev);
1401cg_cleanup:
1402        dev_set_uevent_suppress(&device->dev, false);
1403        ib_device_unregister_rdmacg(device);
1404        ib_cache_cleanup_one(device);
1405        return ret;
1406}
1407EXPORT_SYMBOL(ib_register_device);
1408
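    /*
     * Illustrative driver-side registration sequence for the API above
     * (my_dev, my_dev_ops and the "mydev%d" name template are hypothetical):
     *
     *      struct my_dev *mdev = ib_alloc_device(my_dev, ibdev);
     *
     *      if (!mdev)
     *              return -ENOMEM;
     *      ib_set_device_ops(&mdev->ibdev, &my_dev_ops);
     *      mdev->ibdev.phys_port_cnt = 1;
     *      ... set up ports, parent device, etc. ...
     *      ret = ib_register_device(&mdev->ibdev, "mydev%d");
     *      if (ret)
     *              ib_dealloc_device(&mdev->ibdev);
     *
     * As the error-flow comment above notes, a failed ib_register_device()
     * leaves the caller responsible for calling ib_dealloc_device().
     */
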
1409/* Callers must hold a get on the device. */
1410static void __ib_unregister_device(struct ib_device *ib_dev)
1411{
1412        /*
1413         * We have a registration lock so that all the calls to unregister are
1414         * fully fenced; once any unregister returns the device is truly
1415         * unregistered even if multiple callers are unregistering it at the
1416         * same time. This also interacts with the registration flow and
1417         * provides sane semantics if register and unregister are racing.
1418         */
1419        mutex_lock(&ib_dev->unregistration_lock);
1420        if (!refcount_read(&ib_dev->refcount))
1421                goto out;
1422
1423        disable_device(ib_dev);
1424
1425        /* Expedite removing unregistered pointers from the hash table */
1426        free_netdevs(ib_dev);
1427
1428        ib_device_unregister_sysfs(ib_dev);
1429        device_del(&ib_dev->dev);
1430        ib_device_unregister_rdmacg(ib_dev);
1431        ib_cache_cleanup_one(ib_dev);
1432
1433        /*
1434         * Drivers using the new flow may not call ib_dealloc_device except
1435         * in error unwind prior to registration success.
1436         */
1437        if (ib_dev->ops.dealloc_driver) {
1438                WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
1439                ib_dealloc_device(ib_dev);
1440        }
1441out:
1442        mutex_unlock(&ib_dev->unregistration_lock);
1443}
1444
1445/**
1446 * ib_unregister_device - Unregister an IB device
1447 * @ib_dev: The device to unregister
1448 *
1449 * Unregister an IB device.  All clients will receive a remove callback.
1450 *
1451 * Callers should call this routine only once, and protect against races with
1452 * registration. Typically it should only be called as part of a remove
1453 * callback in an implementation of driver core's struct device_driver and
1454 * related.
1455 *
1456 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
1457 * this function.
1458 */
1459void ib_unregister_device(struct ib_device *ib_dev)
1460{
1461        get_device(&ib_dev->dev);
1462        __ib_unregister_device(ib_dev);
1463        put_device(&ib_dev->dev);
1464}
1465EXPORT_SYMBOL(ib_unregister_device);
1466
1467/**
1468 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
1469 * @ib_dev: The device to unregister
1470 *
1471 * This is the same as ib_unregister_device(), except it includes an internal
1472 * ib_device_put() that should match a 'get' obtained by the caller.
1473 *
1474 * It is safe to call this routine concurrently from multiple threads while
1475 * holding the 'get'. When the function returns the device is fully
1476 * unregistered.
1477 *
1478 * Drivers using this flow MUST use the driver_unregister callback to clean up
1479 * their resources associated with the device and dealloc it.
1480 */
1481void ib_unregister_device_and_put(struct ib_device *ib_dev)
1482{
1483        WARN_ON(!ib_dev->ops.dealloc_driver);
1484        get_device(&ib_dev->dev);
1485        ib_device_put(ib_dev);
1486        __ib_unregister_device(ib_dev);
1487        put_device(&ib_dev->dev);
1488}
1489EXPORT_SYMBOL(ib_unregister_device_and_put);
1490
1491/**
1492 * ib_unregister_driver - Unregister all IB devices for a driver
1493 * @driver_id: The driver to unregister
1494 *
1495 * This implements a fence for device unregistration. It only returns once all
1496 * devices associated with the driver_id have fully completed their
1497 * unregistration and returned from ib_unregister_device*().
1498 *
1499 * If devices are not yet unregistered, it goes ahead and starts unregistering
1500 * them.
1501 *
1502 * This does not block creation of new devices with the given driver_id, that
1503 * is the responsibility of the caller.
1504 */
1505void ib_unregister_driver(enum rdma_driver_id driver_id)
1506{
1507        struct ib_device *ib_dev;
1508        unsigned long index;
1509
1510        down_read(&devices_rwsem);
1511        xa_for_each (&devices, index, ib_dev) {
1512                if (ib_dev->ops.driver_id != driver_id)
1513                        continue;
1514
1515                get_device(&ib_dev->dev);
1516                up_read(&devices_rwsem);
1517
1518                WARN_ON(!ib_dev->ops.dealloc_driver);
1519                __ib_unregister_device(ib_dev);
1520
1521                put_device(&ib_dev->dev);
1522                down_read(&devices_rwsem);
1523        }
1524        up_read(&devices_rwsem);
1525}
1526EXPORT_SYMBOL(ib_unregister_driver);
1527
1528static void ib_unregister_work(struct work_struct *work)
1529{
1530        struct ib_device *ib_dev =
1531                container_of(work, struct ib_device, unregistration_work);
1532
1533        __ib_unregister_device(ib_dev);
1534        put_device(&ib_dev->dev);
1535}
1536
1537/**
1538 * ib_unregister_device_queued - Unregister a device using a work queue
1539 * @ib_dev: The device to unregister
1540 *
1541 * This schedules an asynchronous unregistration using a WQ for the device. A
1542 * driver should use this to avoid holding locks while doing unregistration,
1543 * such as holding the RTNL lock.
1544 *
1545 * Drivers using this API must use ib_unregister_driver before module unload
1546 * to ensure that all scheduled unregistrations have completed.
1547 */
1548void ib_unregister_device_queued(struct ib_device *ib_dev)
1549{
1550        WARN_ON(!refcount_read(&ib_dev->refcount));
1551        WARN_ON(!ib_dev->ops.dealloc_driver);
1552        get_device(&ib_dev->dev);
1553        if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
1554                put_device(&ib_dev->dev);
1555}
1556EXPORT_SYMBOL(ib_unregister_device_queued);
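
/*
 * Illustrative sketch, not part of this file: the pattern used by drivers
 * that tear a device down from a netdev notifier while holding the RTNL
 * lock, then fence everything at module unload. The demo_* names are
 * hypothetical and RDMA_DRIVER_RXE merely stands in for the driver's own id.
 */
static __maybe_unused void demo_netdev_going_down(struct ib_device *ibdev)
{
	/* Called under rtnl_lock(); defer the blocking part to the WQ. */
	ib_unregister_device_queued(ibdev);
}

static __maybe_unused void demo_module_exit(void)
{
	/* Wait for every queued unregistration to fully complete. */
	ib_unregister_driver(RDMA_DRIVER_RXE);
}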
1557
1558/*
1559 * The caller must pass in a device with the struct device kref held (via
1560 * get_device()) and the registration refcount released (via ib_device_put()).
1561 * If the device is in cur_net and still registered then it is moved into net.
1562 */
1563static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
1564                                 struct net *net)
1565{
1566        int ret2 = -EINVAL;
1567        int ret;
1568
1569        mutex_lock(&device->unregistration_lock);
1570
1571        /*
1572          * If the device is not held via ib_device_get() or the
1573          * unregistration_lock is not held, the namespace can be changed or
1574          * the device can be unregistered. Check again under the lock.
1575         */
1576        if (refcount_read(&device->refcount) == 0 ||
1577            !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
1578                ret = -ENODEV;
1579                goto out;
1580        }
1581
1582        kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
1583        disable_device(device);
1584
1585        /*
1586         * At this point no one can be using the device, so it is safe to
1587         * change the namespace.
1588         */
1589        write_pnet(&device->coredev.rdma_net, net);
1590
1591        down_read(&devices_rwsem);
1592        /*
1593         * Currently rdma devices are system wide unique. So the device name
1594         * is guaranteed free in the new namespace. Publish the new namespace
1595         * at the sysfs level.
1596         */
1597        ret = device_rename(&device->dev, dev_name(&device->dev));
1598        up_read(&devices_rwsem);
1599        if (ret) {
1600                dev_warn(&device->dev,
1601                         "%s: Couldn't rename device after namespace change\n",
1602                         __func__);
1603                /* Try and put things back and re-enable the device */
1604                write_pnet(&device->coredev.rdma_net, cur_net);
1605        }
1606
1607        ret2 = enable_device_and_get(device);
1608        if (ret2) {
1609                /*
1610                 * This shouldn't really happen, but if it does, let the user
1611                  * retry at a later point. So don't disable the device.
1612                 */
1613                dev_warn(&device->dev,
1614                         "%s: Couldn't re-enable device after namespace change\n",
1615                         __func__);
1616        }
1617        kobject_uevent(&device->dev.kobj, KOBJ_ADD);
1618
1619        ib_device_put(device);
1620out:
1621        mutex_unlock(&device->unregistration_lock);
1622        if (ret)
1623                return ret;
1624        return ret2;
1625}
1626
1627int ib_device_set_netns_put(struct sk_buff *skb,
1628                            struct ib_device *dev, u32 ns_fd)
1629{
1630        struct net *net;
1631        int ret;
1632
1633        net = get_net_ns_by_fd(ns_fd);
1634        if (IS_ERR(net)) {
1635                ret = PTR_ERR(net);
1636                goto net_err;
1637        }
1638
1639        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
1640                ret = -EPERM;
1641                goto ns_err;
1642        }
1643
1644        /*
1645         * Currently supported only for those providers which support
1646         * disassociation and don't do port specific sysfs init. Once a
1647         * port_cleanup infrastructure is implemented, this limitation will be
1648         * removed.
1649         */
1650        if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
1651            ib_devices_shared_netns) {
1652                ret = -EOPNOTSUPP;
1653                goto ns_err;
1654        }
1655
1656        get_device(&dev->dev);
1657        ib_device_put(dev);
1658        ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
1659        put_device(&dev->dev);
1660
1661        put_net(net);
1662        return ret;
1663
1664ns_err:
1665        put_net(net);
1666net_err:
1667        ib_device_put(dev);
1668        return ret;
1669}
1670
1671static struct pernet_operations rdma_dev_net_ops = {
1672        .init = rdma_dev_init_net,
1673        .exit = rdma_dev_exit_net,
1674        .id = &rdma_dev_net_id,
1675        .size = sizeof(struct rdma_dev_net),
1676};
1677
1678static int assign_client_id(struct ib_client *client)
1679{
1680        int ret;
1681
1682        down_write(&clients_rwsem);
1683        /*
1684         * The add/remove callbacks must be called in FIFO/LIFO order. To
1685         * achieve this we assign client_ids so they are sorted in
1686         * registration order.
1687         */
1688        client->client_id = highest_client_id;
1689        ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
1690        if (ret)
1691                goto out;
1692
1693        highest_client_id++;
1694        xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
1695
1696out:
1697        up_write(&clients_rwsem);
1698        return ret;
1699}
1700
1701static void remove_client_id(struct ib_client *client)
1702{
1703        down_write(&clients_rwsem);
1704        xa_erase(&clients, client->client_id);
1705        for (; highest_client_id; highest_client_id--)
1706                if (xa_load(&clients, highest_client_id - 1))
1707                        break;
1708        up_write(&clients_rwsem);
1709}
1710
1711/**
1712 * ib_register_client - Register an IB client
1713 * @client:Client to register
1714 *
1715 * Upper level users of the IB drivers can use ib_register_client() to
1716 * register callbacks for IB device addition and removal.  When an IB
1717 * device is added, each registered client's add method will be called
1718 * (in the order the clients were registered), and when a device is
1719 * removed, each client's remove method will be called (in the reverse
1720 * order that clients were registered).  In addition, when
1721 * ib_register_client() is called, the client will receive an add
1722 * callback for all devices already registered.
1723 */
1724int ib_register_client(struct ib_client *client)
1725{
1726        struct ib_device *device;
1727        unsigned long index;
1728        int ret;
1729
1730        refcount_set(&client->uses, 1);
1731        init_completion(&client->uses_zero);
1732        ret = assign_client_id(client);
1733        if (ret)
1734                return ret;
1735
1736        down_read(&devices_rwsem);
1737        xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
1738                ret = add_client_context(device, client);
1739                if (ret) {
1740                        up_read(&devices_rwsem);
1741                        ib_unregister_client(client);
1742                        return ret;
1743                }
1744        }
1745        up_read(&devices_rwsem);
1746        return 0;
1747}
1748EXPORT_SYMBOL(ib_register_client);
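
/*
 * Illustrative sketch, not part of this file: a minimal consumer of
 * ib_register_client(). The demo_* names are hypothetical, and the
 * callbacks assume the void-returning add()/remove() signatures that
 * struct ib_client uses in this tree.
 */
static struct ib_client demo_client;

struct demo_ctx {
	struct ib_device *device;
};

static void demo_add_one(struct ib_device *device)
{
	struct demo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return;
	ctx->device = device;
	/* Stash per-device state for later ib_get_client_data() lookups. */
	ib_set_client_data(device, &demo_client, ctx);
}

static void demo_remove_one(struct ib_device *device, void *client_data)
{
	/* Runs once per device at client or device unregistration. */
	kfree(client_data);
}

static struct ib_client demo_client = {
	.name	= "demo",
	.add	= demo_add_one,
	.remove	= demo_remove_one,
};
/* A module would pair ib_register_client(&demo_client) in its init with
 * ib_unregister_client(&demo_client) in its exit.
 */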
1749
1750/**
1751 * ib_unregister_client - Unregister an IB client
1752 * @client:Client to unregister
1753 *
1754 * Upper level users use ib_unregister_client() to remove their client
1755 * registration.  When ib_unregister_client() is called, the client
1756 * will receive a remove callback for each IB device still registered.
1757 *
1758 * This is a full fence: once it returns, no client callbacks will be called
1759 * or still be running in another thread.
1760 */
1761void ib_unregister_client(struct ib_client *client)
1762{
1763        struct ib_device *device;
1764        unsigned long index;
1765
1766        down_write(&clients_rwsem);
1767        ib_client_put(client);
1768        xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
1769        up_write(&clients_rwsem);
1770
1771        /* We do not want to have locks while calling client->remove() */
1772        rcu_read_lock();
1773        xa_for_each (&devices, index, device) {
1774                if (!ib_device_try_get(device))
1775                        continue;
1776                rcu_read_unlock();
1777
1778                remove_client_context(device, client->client_id);
1779
1780                ib_device_put(device);
1781                rcu_read_lock();
1782        }
1783        rcu_read_unlock();
1784
1785        /*
1786         * remove_client_context() is not a fence; it can return even though a
1787         * removal is ongoing. Wait until all removals are completed.
1788         */
1789        wait_for_completion(&client->uses_zero);
1790        remove_client_id(client);
1791}
1792EXPORT_SYMBOL(ib_unregister_client);
1793
1794static int __ib_get_global_client_nl_info(const char *client_name,
1795                                          struct ib_client_nl_info *res)
1796{
1797        struct ib_client *client;
1798        unsigned long index;
1799        int ret = -ENOENT;
1800
1801        down_read(&clients_rwsem);
1802        xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
1803                if (strcmp(client->name, client_name) != 0)
1804                        continue;
1805                if (!client->get_global_nl_info) {
1806                        ret = -EOPNOTSUPP;
1807                        break;
1808                }
1809                ret = client->get_global_nl_info(res);
1810                if (WARN_ON(ret == -ENOENT))
1811                        ret = -EINVAL;
1812                if (!ret && res->cdev)
1813                        get_device(res->cdev);
1814                break;
1815        }
1816        up_read(&clients_rwsem);
1817        return ret;
1818}
1819
1820static int __ib_get_client_nl_info(struct ib_device *ibdev,
1821                                   const char *client_name,
1822                                   struct ib_client_nl_info *res)
1823{
1824        unsigned long index;
1825        void *client_data;
1826        int ret = -ENOENT;
1827
1828        down_read(&ibdev->client_data_rwsem);
1829        xan_for_each_marked (&ibdev->client_data, index, client_data,
1830                             CLIENT_DATA_REGISTERED) {
1831                struct ib_client *client = xa_load(&clients, index);
1832
1833                if (!client || strcmp(client->name, client_name) != 0)
1834                        continue;
1835                if (!client->get_nl_info) {
1836                        ret = -EOPNOTSUPP;
1837                        break;
1838                }
1839                ret = client->get_nl_info(ibdev, client_data, res);
1840                if (WARN_ON(ret == -ENOENT))
1841                        ret = -EINVAL;
1842
1843                /*
1844                 * The cdev is guaranteed valid as long as we are inside the
1845                 * client_data_rwsem as remove_one can't be called. Keep it
1846                 * valid for the caller.
1847                 */
1848                if (!ret && res->cdev)
1849                        get_device(res->cdev);
1850                break;
1851        }
1852        up_read(&ibdev->client_data_rwsem);
1853
1854        return ret;
1855}
1856
1857/**
1858 * ib_get_client_nl_info - Fetch the nl_info from a client
1859 * @ibdev: IB device
1860 * @client_name: Name of the client
1861 * @res: Result of the query
1862 */
1863int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
1864                          struct ib_client_nl_info *res)
1865{
1866        int ret;
1867
1868        if (ibdev)
1869                ret = __ib_get_client_nl_info(ibdev, client_name, res);
1870        else
1871                ret = __ib_get_global_client_nl_info(client_name, res);
1872#ifdef CONFIG_MODULES
1873        if (ret == -ENOENT) {
1874                request_module("rdma-client-%s", client_name);
1875                if (ibdev)
1876                        ret = __ib_get_client_nl_info(ibdev, client_name, res);
1877                else
1878                        ret = __ib_get_global_client_nl_info(client_name, res);
1879        }
1880#endif
1881        if (ret) {
1882                if (ret == -ENOENT)
1883                        return -EOPNOTSUPP;
1884                return ret;
1885        }
1886
1887        if (WARN_ON(!res->cdev))
1888                return -EINVAL;
1889        return 0;
1890}
1891
1892/**
1893 * ib_set_client_data - Set IB client context
1894 * @device:Device to set context for
1895 * @client:Client to set context for
1896 * @data:Context to set
1897 *
1898 * ib_set_client_data() sets client context data that can be retrieved with
1899 * ib_get_client_data(). This can only be called while the client is
1900 * registered to the device, once the ib_client remove() callback returns this
1901 * cannot be called.
1902 */
1903void ib_set_client_data(struct ib_device *device, struct ib_client *client,
1904                        void *data)
1905{
1906        void *rc;
1907
1908        if (WARN_ON(IS_ERR(data)))
1909                data = NULL;
1910
1911        rc = xa_store(&device->client_data, client->client_id, data,
1912                      GFP_KERNEL);
1913        WARN_ON(xa_is_err(rc));
1914}
1915EXPORT_SYMBOL(ib_set_client_data);
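
/*
 * Illustrative sketch, not part of this file: retrieving the context stored
 * by the hypothetical demo client shown after ib_register_client() above.
 * This is only valid between the client's add() and remove() callbacks.
 */
static __maybe_unused struct demo_ctx *demo_lookup_ctx(struct ib_device *device)
{
	return ib_get_client_data(device, &demo_client);
}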
1916
1917/**
1918 * ib_register_event_handler - Register an IB event handler
1919 * @event_handler:Handler to register
1920 *
1921 * ib_register_event_handler() registers an event handler that will be
1922 * called back when asynchronous IB events occur (as defined in
1923 * chapter 11 of the InfiniBand Architecture Specification).  This
1924 * callback may occur in interrupt context.
1925 */
1926void ib_register_event_handler(struct ib_event_handler *event_handler)
1927{
1928        unsigned long flags;
1929
1930        spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1931        list_add_tail(&event_handler->list,
1932                      &event_handler->device->event_handler_list);
1933        spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1934}
1935EXPORT_SYMBOL(ib_register_event_handler);
1936
1937/**
1938 * ib_unregister_event_handler - Unregister an event handler
1939 * @event_handler:Handler to unregister
1940 *
1941 * Unregister an event handler registered with
1942 * ib_register_event_handler().
1943 */
1944void ib_unregister_event_handler(struct ib_event_handler *event_handler)
1945{
1946        unsigned long flags;
1947
1948        spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
1949        list_del(&event_handler->list);
1950        spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
1951}
1952EXPORT_SYMBOL(ib_unregister_event_handler);
1953
1954/**
1955 * ib_dispatch_event - Dispatch an asynchronous event
1956 * @event:Event to dispatch
1957 *
1958 * Low-level drivers must call ib_dispatch_event() to dispatch the
1959 * event to all registered event handlers when an asynchronous event
1960 * occurs.
1961 */
1962void ib_dispatch_event(struct ib_event *event)
1963{
1964        unsigned long flags;
1965        struct ib_event_handler *handler;
1966
1967        spin_lock_irqsave(&event->device->event_handler_lock, flags);
1968
1969        list_for_each_entry(handler, &event->device->event_handler_list, list)
1970                handler->handler(handler, event);
1971
1972        spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
1973}
1974EXPORT_SYMBOL(ib_dispatch_event);
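
/*
 * Illustrative sketch, not part of this file: a consumer-side asynchronous
 * event handler. demo_* names are hypothetical. Handlers run under the
 * event_handler_lock spinlock and may be invoked from interrupt context,
 * so they must not sleep.
 */
static void demo_event_handler(struct ib_event_handler *handler,
			       struct ib_event *event)
{
	if (event->event == IB_EVENT_PORT_ACTIVE ||
	    event->event == IB_EVENT_PORT_ERR)
		pr_info("%s: port %u changed state\n",
			dev_name(&event->device->dev),
			event->element.port_num);
}

static __maybe_unused void demo_watch_events(struct ib_device *device,
					     struct ib_event_handler *handler)
{
	INIT_IB_EVENT_HANDLER(handler, device, demo_event_handler);
	ib_register_event_handler(handler);
	/* Matched later by ib_unregister_event_handler(handler). */
}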
1975
1976static int iw_query_port(struct ib_device *device,
1977                           u8 port_num,
1978                           struct ib_port_attr *port_attr)
1979{
1980        struct in_device *inetdev;
1981        struct net_device *netdev;
1982        int err;
1983
1984        memset(port_attr, 0, sizeof(*port_attr));
1985
1986        netdev = ib_device_get_netdev(device, port_num);
1987        if (!netdev)
1988                return -ENODEV;
1989
1990        port_attr->max_mtu = IB_MTU_4096;
1991        port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
1992
1993        if (!netif_carrier_ok(netdev)) {
1994                port_attr->state = IB_PORT_DOWN;
1995                port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
1996        } else {
1997                rcu_read_lock();
1998                inetdev = __in_dev_get_rcu(netdev);
1999
2000                if (inetdev && inetdev->ifa_list) {
2001                        port_attr->state = IB_PORT_ACTIVE;
2002                        port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
2003                } else {
2004                        port_attr->state = IB_PORT_INIT;
2005                        port_attr->phys_state =
2006                                IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
2007                }
2008
2009                rcu_read_unlock();
2010        }
2011
2012        dev_put(netdev);
2013        err = device->ops.query_port(device, port_num, port_attr);
2014        if (err)
2015                return err;
2016
2017        return 0;
2018}
2019
2020static int __ib_query_port(struct ib_device *device,
2021                           u8 port_num,
2022                           struct ib_port_attr *port_attr)
2023{
2024        union ib_gid gid = {};
2025        int err;
2026
2027        memset(port_attr, 0, sizeof(*port_attr));
2028
2029        err = device->ops.query_port(device, port_num, port_attr);
2030        if (err || port_attr->subnet_prefix)
2031                return err;
2032
2033        if (rdma_port_get_link_layer(device, port_num) !=
2034            IB_LINK_LAYER_INFINIBAND)
2035                return 0;
2036
2037        err = device->ops.query_gid(device, port_num, 0, &gid);
2038        if (err)
2039                return err;
2040
2041        port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
2042        return 0;
2043}
2044
2045/**
2046 * ib_query_port - Query IB port attributes
2047 * @device:Device to query
2048 * @port_num:Port number to query
2049 * @port_attr:Port attributes
2050 *
2051 * ib_query_port() returns the attributes of a port through the
2052 * @port_attr pointer.
2053 */
2054int ib_query_port(struct ib_device *device,
2055                  u8 port_num,
2056                  struct ib_port_attr *port_attr)
2057{
2058        if (!rdma_is_port_valid(device, port_num))
2059                return -EINVAL;
2060
2061        if (rdma_protocol_iwarp(device, port_num))
2062                return iw_query_port(device, port_num, port_attr);
2063        else
2064                return __ib_query_port(device, port_num, port_attr);
2065}
2066EXPORT_SYMBOL(ib_query_port);
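
/*
 * Illustrative sketch, not part of this file: a typical ib_query_port()
 * caller checking whether a port is usable. demo_port_is_active() is a
 * hypothetical helper.
 */
static __maybe_unused bool demo_port_is_active(struct ib_device *device,
					       u8 port_num)
{
	struct ib_port_attr attr;

	if (ib_query_port(device, port_num, &attr))
		return false;

	return attr.state == IB_PORT_ACTIVE;
}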
2067
2068static void add_ndev_hash(struct ib_port_data *pdata)
2069{
2070        unsigned long flags;
2071
2072        might_sleep();
2073
2074        spin_lock_irqsave(&ndev_hash_lock, flags);
2075        if (hash_hashed(&pdata->ndev_hash_link)) {
2076                hash_del_rcu(&pdata->ndev_hash_link);
2077                spin_unlock_irqrestore(&ndev_hash_lock, flags);
2078                /*
2079                  * We cannot do hash_add_rcu() after a hash_del_rcu() until an RCU
2080                  * grace period has elapsed.
2081                 */
2082                synchronize_rcu();
2083                spin_lock_irqsave(&ndev_hash_lock, flags);
2084        }
2085        if (pdata->netdev)
2086                hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
2087                             (uintptr_t)pdata->netdev);
2088        spin_unlock_irqrestore(&ndev_hash_lock, flags);
2089}
2090
2091/**
2092 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
2093 * @ib_dev: Device to modify
2094 * @ndev: net_device to affiliate, may be NULL
2095 * @port: IB port the net_device is connected to
2096 *
2097 * Drivers should use this to link the ib_device to a netdev so the netdev
2098 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
2099 * affiliated with any port.
2100 *
2101 * The caller must ensure that the given ndev is not unregistered or
2102 * unregistering, and that either the ib_device is unregistered or
2103 * ib_device_set_netdev() is called with NULL when the ndev sends a
2104 * NETDEV_UNREGISTER event.
2105 */
2106int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
2107                         unsigned int port)
2108{
2109        struct net_device *old_ndev;
2110        struct ib_port_data *pdata;
2111        unsigned long flags;
2112        int ret;
2113
2114        /*
2115         * Drivers wish to call this before ib_register_device(), so we have to
2116         * set up the port data early.
2117         */
2118        ret = alloc_port_data(ib_dev);
2119        if (ret)
2120                return ret;
2121
2122        if (!rdma_is_port_valid(ib_dev, port))
2123                return -EINVAL;
2124
2125        pdata = &ib_dev->port_data[port];
2126        spin_lock_irqsave(&pdata->netdev_lock, flags);
2127        old_ndev = rcu_dereference_protected(
2128                pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2129        if (old_ndev == ndev) {
2130                spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2131                return 0;
2132        }
2133
2134        if (ndev)
2135                dev_hold(ndev);
2136        rcu_assign_pointer(pdata->netdev, ndev);
2137        spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2138
2139        add_ndev_hash(pdata);
2140        if (old_ndev)
2141                dev_put(old_ndev);
2142
2143        return 0;
2144}
2145EXPORT_SYMBOL(ib_device_set_netdev);
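
/*
 * Illustrative sketch, not part of this file: how a hypothetical RoCE
 * provider might wire its netdev to port 1 at probe time and clear the
 * association when the netdev sends NETDEV_UNREGISTER, as required above.
 */
static __maybe_unused int demo_link_netdev(struct ib_device *ibdev,
					   struct net_device *ndev)
{
	return ib_device_set_netdev(ibdev, ndev, 1);
}

static __maybe_unused void demo_unlink_netdev(struct ib_device *ibdev)
{
	/* Drops the reference the ib_device held on the going-away netdev. */
	ib_device_set_netdev(ibdev, NULL, 1);
}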
2146
2147static void free_netdevs(struct ib_device *ib_dev)
2148{
2149        unsigned long flags;
2150        unsigned int port;
2151
2152        if (!ib_dev->port_data)
2153                return;
2154
2155        rdma_for_each_port (ib_dev, port) {
2156                struct ib_port_data *pdata = &ib_dev->port_data[port];
2157                struct net_device *ndev;
2158
2159                spin_lock_irqsave(&pdata->netdev_lock, flags);
2160                ndev = rcu_dereference_protected(
2161                        pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2162                if (ndev) {
2163                        spin_lock(&ndev_hash_lock);
2164                        hash_del_rcu(&pdata->ndev_hash_link);
2165                        spin_unlock(&ndev_hash_lock);
2166
2167                        /*
2168                         * If this is the last dev_put there is still a
2169                         * synchronize_rcu before the netdev is kfreed, so we
2170                         * can continue to rely on unlocked pointer
2171                         * comparisons after the put
2172                         */
2173                        rcu_assign_pointer(pdata->netdev, NULL);
2174                        dev_put(ndev);
2175                }
2176                spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2177        }
2178}
2179
2180struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
2181                                        unsigned int port)
2182{
2183        struct ib_port_data *pdata;
2184        struct net_device *res;
2185
2186        if (!rdma_is_port_valid(ib_dev, port))
2187                return NULL;
2188
2189        pdata = &ib_dev->port_data[port];
2190
2191        /*
2192         * New drivers should use ib_device_set_netdev() not the legacy
2193         * get_netdev().
2194         */
2195        if (ib_dev->ops.get_netdev)
2196                res = ib_dev->ops.get_netdev(ib_dev, port);
2197        else {
2198                spin_lock(&pdata->netdev_lock);
2199                res = rcu_dereference_protected(
2200                        pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2201                if (res)
2202                        dev_hold(res);
2203                spin_unlock(&pdata->netdev_lock);
2204        }
2205
2206        /*
2207          * If we are starting to unregister, expedite things by preventing
2208         * propagation of an unregistering netdev.
2209         */
2210        if (res && res->reg_state != NETREG_REGISTERED) {
2211                dev_put(res);
2212                return NULL;
2213        }
2214
2215        return res;
2216}
2217
2218/**
2219 * ib_device_get_by_netdev - Find an IB device associated with a netdev
2220 * @ndev: netdev to locate
2221 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
2222 *
2223 * Find and hold an ib_device that is associated with a netdev via
2224 * ib_device_set_netdev(). The caller must call ib_device_put() on the
2225 * returned pointer.
2226 */
2227struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
2228                                          enum rdma_driver_id driver_id)
2229{
2230        struct ib_device *res = NULL;
2231        struct ib_port_data *cur;
2232
2233        rcu_read_lock();
2234        hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
2235                                    (uintptr_t)ndev) {
2236                if (rcu_access_pointer(cur->netdev) == ndev &&
2237                    (driver_id == RDMA_DRIVER_UNKNOWN ||
2238                     cur->ib_dev->ops.driver_id == driver_id) &&
2239                    ib_device_try_get(cur->ib_dev)) {
2240                        res = cur->ib_dev;
2241                        break;
2242                }
2243        }
2244        rcu_read_unlock();
2245
2246        return res;
2247}
2248EXPORT_SYMBOL(ib_device_get_by_netdev);
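
/*
 * Illustrative sketch, not part of this file: resolving a netdev to the
 * ib_device affiliated with it and releasing the reference afterwards.
 * demo_report_ibdev() is a hypothetical helper.
 */
static __maybe_unused void demo_report_ibdev(struct net_device *ndev)
{
	struct ib_device *ibdev;

	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (!ibdev)
		return;

	pr_info("%s is backed by RDMA device %s\n", ndev->name,
		dev_name(&ibdev->dev));
	ib_device_put(ibdev);
}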
2249
2250/**
2251 * ib_enum_roce_netdev - enumerate all RoCE ports
2252 * @ib_dev: IB device we want to query
2253 * @filter: Should we call the callback?
2254 * @filter_cookie: Cookie passed to filter
2255 * @cb: Callback to call for each found RoCE port
2256 * @cookie: Cookie passed back to the callback
2257 *
2258 * Enumerates all of the physical RoCE ports of ib_dev that are related
2259 * to a netdevice and calls cb() on each port for which the filter()
2260 * function returns non-zero.
2261 */
2262void ib_enum_roce_netdev(struct ib_device *ib_dev,
2263                         roce_netdev_filter filter,
2264                         void *filter_cookie,
2265                         roce_netdev_callback cb,
2266                         void *cookie)
2267{
2268        unsigned int port;
2269
2270        rdma_for_each_port (ib_dev, port)
2271                if (rdma_protocol_roce(ib_dev, port)) {
2272                        struct net_device *idev =
2273                                ib_device_get_netdev(ib_dev, port);
2274
2275                        if (filter(ib_dev, port, idev, filter_cookie))
2276                                cb(ib_dev, port, idev, cookie);
2277
2278                        if (idev)
2279                                dev_put(idev);
2280                }
2281}
2282
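/*
 * Illustrative sketch, not part of this file: a filter/callback pair for
 * ib_enum_roce_netdev(). demo_* names are hypothetical; the signatures
 * assume the roce_netdev_filter and roce_netdev_callback typedefs of this
 * tree. A caller would invoke ib_enum_roce_netdev(ibdev,
 * demo_port_has_netdev, NULL, demo_print_port, NULL).
 */
static __maybe_unused int demo_port_has_netdev(struct ib_device *device,
					       u8 port_num,
					       struct net_device *netdev,
					       void *cookie)
{
	/* Only visit ports that currently have an affiliated netdev. */
	return netdev != NULL;
}

static __maybe_unused void demo_print_port(struct ib_device *device,
					   u8 port_num,
					   struct net_device *netdev,
					   void *cookie)
{
	pr_info("%s port %u -> %s\n", dev_name(&device->dev), port_num,
		netdev->name);
}
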
2283/**
2284 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
2285 * @filter: Should we call the callback?
2286 * @filter_cookie: Cookie passed to filter
2287 * @cb: Callback to call for each found RoCE port
2288 * @cookie: Cookie passed back to the callback
2289 *
2290 * Enumerates all RoCE devices' physical ports that are related
2291 * to a netdevice and calls cb() on each port for which the filter()
2292 * function returns non-zero.
2293 */
2294void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
2295                              void *filter_cookie,
2296                              roce_netdev_callback cb,
2297                              void *cookie)
2298{
2299        struct ib_device *dev;
2300        unsigned long index;
2301
2302        down_read(&devices_rwsem);
2303        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
2304                ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
2305        up_read(&devices_rwsem);
2306}
2307
2308/**
2309 * ib_enum_all_devs - enumerate all ib_devices
2310 * @nldev_cb: Callback to call for each found ib_device
2311 *
2312 * Enumerates all ib_devices and calls callback() on each device.
2313 */
2314int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
2315                     struct netlink_callback *cb)
2316{
2317        unsigned long index;
2318        struct ib_device *dev;
2319        unsigned int idx = 0;
2320        int ret = 0;
2321
2322        down_read(&devices_rwsem);
2323        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
2324                if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
2325                        continue;
2326
2327                ret = nldev_cb(dev, skb, cb, idx);
2328                if (ret)
2329                        break;
2330                idx++;
2331        }
2332        up_read(&devices_rwsem);
2333        return ret;
2334}
2335
2336/**
2337 * ib_query_pkey - Get P_Key table entry
2338 * @device:Device to query
2339 * @port_num:Port number to query
2340 * @index:P_Key table index to query
2341 * @pkey:Returned P_Key
2342 *
2343 * ib_query_pkey() fetches the specified P_Key table entry.
2344 */
2345int ib_query_pkey(struct ib_device *device,
2346                  u8 port_num, u16 index, u16 *pkey)
2347{
2348        if (!rdma_is_port_valid(device, port_num))
2349                return -EINVAL;
2350
2351        return device->ops.query_pkey(device, port_num, index, pkey);
2352}
2353EXPORT_SYMBOL(ib_query_pkey);
2354
2355/**
2356 * ib_modify_device - Change IB device attributes
2357 * @device:Device to modify
2358 * @device_modify_mask:Mask of attributes to change
2359 * @device_modify:New attribute values
2360 *
2361 * ib_modify_device() changes a device's attributes as specified by
2362 * the @device_modify_mask and @device_modify structure.
2363 */
2364int ib_modify_device(struct ib_device *device,
2365                     int device_modify_mask,
2366                     struct ib_device_modify *device_modify)
2367{
2368        if (!device->ops.modify_device)
2369                return -ENOSYS;
2370
2371        return device->ops.modify_device(device, device_modify_mask,
2372                                         device_modify);
2373}
2374EXPORT_SYMBOL(ib_modify_device);
2375
2376/**
2377 * ib_modify_port - Modifies the attributes for the specified port.
2378 * @device: The device to modify.
2379 * @port_num: The number of the port to modify.
2380 * @port_modify_mask: Mask used to specify which attributes of the port
2381 *   to change.
2382 * @port_modify: New attribute values for the port.
2383 *
2384 * ib_modify_port() changes a port's attributes as specified by the
2385 * @port_modify_mask and @port_modify structure.
2386 */
2387int ib_modify_port(struct ib_device *device,
2388                   u8 port_num, int port_modify_mask,
2389                   struct ib_port_modify *port_modify)
2390{
2391        int rc;
2392
2393        if (!rdma_is_port_valid(device, port_num))
2394                return -EINVAL;
2395
2396        if (device->ops.modify_port)
2397                rc = device->ops.modify_port(device, port_num,
2398                                             port_modify_mask,
2399                                             port_modify);
2400        else
2401                rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
2402        return rc;
2403}
2404EXPORT_SYMBOL(ib_modify_port);
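
/*
 * Illustrative sketch, not part of this file: setting a port capability bit
 * the way target-side ULPs do when they need device management support.
 * demo_enable_dev_mgmt() is a hypothetical helper.
 */
static __maybe_unused int demo_enable_dev_mgmt(struct ib_device *device,
					       u8 port_num)
{
	struct ib_port_modify pm = {
		.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP,
	};

	return ib_modify_port(device, port_num, 0, &pm);
}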
2405
2406/**
2407 * ib_find_gid - Returns the port number and GID table index where
2408 *   a specified GID value occurs. It searches only the IB link layer.
2409 * @device: The device to query.
2410 * @gid: The GID value to search for.
2411 * @port_num: The port number of the device where the GID value was found.
2412 * @index: The index into the GID table where the GID was found.  This
2413 *   parameter may be NULL.
2414 */
2415int ib_find_gid(struct ib_device *device, union ib_gid *gid,
2416                u8 *port_num, u16 *index)
2417{
2418        union ib_gid tmp_gid;
2419        unsigned int port;
2420        int ret, i;
2421
2422        rdma_for_each_port (device, port) {
2423                if (!rdma_protocol_ib(device, port))
2424                        continue;
2425
2426                for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
2427                     ++i) {
2428                        ret = rdma_query_gid(device, port, i, &tmp_gid);
2429                        if (ret)
2430                                return ret;
2431                        if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
2432                                *port_num = port;
2433                                if (index)
2434                                        *index = i;
2435                                return 0;
2436                        }
2437                }
2438        }
2439
2440        return -ENOENT;
2441}
2442EXPORT_SYMBOL(ib_find_gid);
2443
2444/**
2445 * ib_find_pkey - Returns the PKey table index where a specified
2446 *   PKey value occurs.
2447 * @device: The device to query.
2448 * @port_num: The port number of the device to search for the PKey.
2449 * @pkey: The PKey value to search for.
2450 * @index: The index into the PKey table where the PKey was found.
2451 */
2452int ib_find_pkey(struct ib_device *device,
2453                 u8 port_num, u16 pkey, u16 *index)
2454{
2455        int ret, i;
2456        u16 tmp_pkey;
2457        int partial_ix = -1;
2458
2459        for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
2460             ++i) {
2461                ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
2462                if (ret)
2463                        return ret;
2464                if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
2465                         /* If there is a full-member pkey, take it. */
2466                        if (tmp_pkey & 0x8000) {
2467                                *index = i;
2468                                return 0;
2469                        }
2470                        if (partial_ix < 0)
2471                                partial_ix = i;
2472                }
2473        }
2474
2475         /* No full-member pkey; if a limited-member one exists, take it. */
2476        if (partial_ix >= 0) {
2477                *index = partial_ix;
2478                return 0;
2479        }
2480        return -ENOENT;
2481}
2482EXPORT_SYMBOL(ib_find_pkey);
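
/*
 * Illustrative sketch, not part of this file: looking up the table index of
 * the default full-membership P_Key (0xffff), as ULPs commonly do.
 * demo_default_pkey_index() is a hypothetical helper.
 */
static __maybe_unused int demo_default_pkey_index(struct ib_device *device,
						  u8 port_num, u16 *index)
{
	/* Prefers the full-member entry, falling back to a limited one. */
	return ib_find_pkey(device, port_num, 0xffff, index);
}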
2483
2484/**
2485 * ib_get_net_dev_by_params() - Return the appropriate net_dev
2486 * for a received CM request
2487 * @dev:        An RDMA device on which the request has been received.
2488 * @port:       Port number on the RDMA device.
2489 * @pkey:       The Pkey the request came on.
2490 * @gid:        A GID that the net_dev uses to communicate.
2491 * @addr:       Contains the IP address that the request specified as its
2492 *              destination.
2493 *
2494 */
2495struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
2496                                            u8 port,
2497                                            u16 pkey,
2498                                            const union ib_gid *gid,
2499                                            const struct sockaddr *addr)
2500{
2501        struct net_device *net_dev = NULL;
2502        unsigned long index;
2503        void *client_data;
2504
2505        if (!rdma_protocol_ib(dev, port))
2506                return NULL;
2507
2508        /*
2509         * Holding the read side guarantees that the client will not become
2510         * unregistered while we are calling get_net_dev_by_params()
2511         */
2512        down_read(&dev->client_data_rwsem);
2513        xan_for_each_marked (&dev->client_data, index, client_data,
2514                             CLIENT_DATA_REGISTERED) {
2515                struct ib_client *client = xa_load(&clients, index);
2516
2517                if (!client || !client->get_net_dev_by_params)
2518                        continue;
2519
2520                net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
2521                                                        addr, client_data);
2522                if (net_dev)
2523                        break;
2524        }
2525        up_read(&dev->client_data_rwsem);
2526
2527        return net_dev;
2528}
2529EXPORT_SYMBOL(ib_get_net_dev_by_params);
2530
2531void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
2532{
2533        struct ib_device_ops *dev_ops = &dev->ops;
2534#define SET_DEVICE_OP(ptr, name)                                               \
2535        do {                                                                   \
2536                if (ops->name)                                                 \
2537                        if (!((ptr)->name))                                    \
2538                                (ptr)->name = ops->name;                       \
2539        } while (0)
2540
2541#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
2542
2543        if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
2544                WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
2545                        dev_ops->driver_id != ops->driver_id);
2546                dev_ops->driver_id = ops->driver_id;
2547        }
2548        if (ops->owner) {
2549                WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
2550                dev_ops->owner = ops->owner;
2551        }
2552        if (ops->uverbs_abi_ver)
2553                dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
2554
2555        dev_ops->uverbs_no_driver_id_binding |=
2556                ops->uverbs_no_driver_id_binding;
2557
2558        SET_DEVICE_OP(dev_ops, add_gid);
2559        SET_DEVICE_OP(dev_ops, advise_mr);
2560        SET_DEVICE_OP(dev_ops, alloc_dm);
2561        SET_DEVICE_OP(dev_ops, alloc_fmr);
2562        SET_DEVICE_OP(dev_ops, alloc_hw_stats);
2563        SET_DEVICE_OP(dev_ops, alloc_mr);
2564        SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
2565        SET_DEVICE_OP(dev_ops, alloc_mw);
2566        SET_DEVICE_OP(dev_ops, alloc_pd);
2567        SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
2568        SET_DEVICE_OP(dev_ops, alloc_ucontext);
2569        SET_DEVICE_OP(dev_ops, alloc_xrcd);
2570        SET_DEVICE_OP(dev_ops, attach_mcast);
2571        SET_DEVICE_OP(dev_ops, check_mr_status);
2572        SET_DEVICE_OP(dev_ops, counter_alloc_stats);
2573        SET_DEVICE_OP(dev_ops, counter_bind_qp);
2574        SET_DEVICE_OP(dev_ops, counter_dealloc);
2575        SET_DEVICE_OP(dev_ops, counter_unbind_qp);
2576        SET_DEVICE_OP(dev_ops, counter_update_stats);
2577        SET_DEVICE_OP(dev_ops, create_ah);
2578        SET_DEVICE_OP(dev_ops, create_counters);
2579        SET_DEVICE_OP(dev_ops, create_cq);
2580        SET_DEVICE_OP(dev_ops, create_flow);
2581        SET_DEVICE_OP(dev_ops, create_flow_action_esp);
2582        SET_DEVICE_OP(dev_ops, create_qp);
2583        SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
2584        SET_DEVICE_OP(dev_ops, create_srq);
2585        SET_DEVICE_OP(dev_ops, create_wq);
2586        SET_DEVICE_OP(dev_ops, dealloc_dm);
2587        SET_DEVICE_OP(dev_ops, dealloc_driver);
2588        SET_DEVICE_OP(dev_ops, dealloc_fmr);
2589        SET_DEVICE_OP(dev_ops, dealloc_mw);
2590        SET_DEVICE_OP(dev_ops, dealloc_pd);
2591        SET_DEVICE_OP(dev_ops, dealloc_ucontext);
2592        SET_DEVICE_OP(dev_ops, dealloc_xrcd);
2593        SET_DEVICE_OP(dev_ops, del_gid);
2594        SET_DEVICE_OP(dev_ops, dereg_mr);
2595        SET_DEVICE_OP(dev_ops, destroy_ah);
2596        SET_DEVICE_OP(dev_ops, destroy_counters);
2597        SET_DEVICE_OP(dev_ops, destroy_cq);
2598        SET_DEVICE_OP(dev_ops, destroy_flow);
2599        SET_DEVICE_OP(dev_ops, destroy_flow_action);
2600        SET_DEVICE_OP(dev_ops, destroy_qp);
2601        SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
2602        SET_DEVICE_OP(dev_ops, destroy_srq);
2603        SET_DEVICE_OP(dev_ops, destroy_wq);
2604        SET_DEVICE_OP(dev_ops, detach_mcast);
2605        SET_DEVICE_OP(dev_ops, disassociate_ucontext);
2606        SET_DEVICE_OP(dev_ops, drain_rq);
2607        SET_DEVICE_OP(dev_ops, drain_sq);
2608        SET_DEVICE_OP(dev_ops, enable_driver);
2609        SET_DEVICE_OP(dev_ops, fill_res_entry);
2610        SET_DEVICE_OP(dev_ops, get_dev_fw_str);
2611        SET_DEVICE_OP(dev_ops, get_dma_mr);
2612        SET_DEVICE_OP(dev_ops, get_hw_stats);
2613        SET_DEVICE_OP(dev_ops, get_link_layer);
2614        SET_DEVICE_OP(dev_ops, get_netdev);
2615        SET_DEVICE_OP(dev_ops, get_port_immutable);
2616        SET_DEVICE_OP(dev_ops, get_vector_affinity);
2617        SET_DEVICE_OP(dev_ops, get_vf_config);
2618        SET_DEVICE_OP(dev_ops, get_vf_stats);
2619        SET_DEVICE_OP(dev_ops, init_port);
2620        SET_DEVICE_OP(dev_ops, invalidate_range);
2621        SET_DEVICE_OP(dev_ops, iw_accept);
2622        SET_DEVICE_OP(dev_ops, iw_add_ref);
2623        SET_DEVICE_OP(dev_ops, iw_connect);
2624        SET_DEVICE_OP(dev_ops, iw_create_listen);
2625        SET_DEVICE_OP(dev_ops, iw_destroy_listen);
2626        SET_DEVICE_OP(dev_ops, iw_get_qp);
2627        SET_DEVICE_OP(dev_ops, iw_reject);
2628        SET_DEVICE_OP(dev_ops, iw_rem_ref);
2629        SET_DEVICE_OP(dev_ops, map_mr_sg);
2630        SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
2631        SET_DEVICE_OP(dev_ops, map_phys_fmr);
2632        SET_DEVICE_OP(dev_ops, mmap);
2633        SET_DEVICE_OP(dev_ops, modify_ah);
2634        SET_DEVICE_OP(dev_ops, modify_cq);
2635        SET_DEVICE_OP(dev_ops, modify_device);
2636        SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
2637        SET_DEVICE_OP(dev_ops, modify_port);
2638        SET_DEVICE_OP(dev_ops, modify_qp);
2639        SET_DEVICE_OP(dev_ops, modify_srq);
2640        SET_DEVICE_OP(dev_ops, modify_wq);
2641        SET_DEVICE_OP(dev_ops, peek_cq);
2642        SET_DEVICE_OP(dev_ops, poll_cq);
2643        SET_DEVICE_OP(dev_ops, post_recv);
2644        SET_DEVICE_OP(dev_ops, post_send);
2645        SET_DEVICE_OP(dev_ops, post_srq_recv);
2646        SET_DEVICE_OP(dev_ops, process_mad);
2647        SET_DEVICE_OP(dev_ops, query_ah);
2648        SET_DEVICE_OP(dev_ops, query_device);
2649        SET_DEVICE_OP(dev_ops, query_gid);
2650        SET_DEVICE_OP(dev_ops, query_pkey);
2651        SET_DEVICE_OP(dev_ops, query_port);
2652        SET_DEVICE_OP(dev_ops, query_qp);
2653        SET_DEVICE_OP(dev_ops, query_srq);
2654        SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
2655        SET_DEVICE_OP(dev_ops, read_counters);
2656        SET_DEVICE_OP(dev_ops, reg_dm_mr);
2657        SET_DEVICE_OP(dev_ops, reg_user_mr);
2658        SET_DEVICE_OP(dev_ops, req_ncomp_notif);
2659        SET_DEVICE_OP(dev_ops, req_notify_cq);
2660        SET_DEVICE_OP(dev_ops, rereg_user_mr);
2661        SET_DEVICE_OP(dev_ops, resize_cq);
2662        SET_DEVICE_OP(dev_ops, set_vf_guid);
2663        SET_DEVICE_OP(dev_ops, set_vf_link_state);
2664        SET_DEVICE_OP(dev_ops, unmap_fmr);
2665
2666        SET_OBJ_SIZE(dev_ops, ib_ah);
2667        SET_OBJ_SIZE(dev_ops, ib_cq);
2668        SET_OBJ_SIZE(dev_ops, ib_pd);
2669        SET_OBJ_SIZE(dev_ops, ib_srq);
2670        SET_OBJ_SIZE(dev_ops, ib_ucontext);
2671}
2672EXPORT_SYMBOL(ib_set_device_ops);
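
/*
 * Illustrative sketch, not part of this file: how a hypothetical provider
 * fills a const ops table and merges it with ib_set_device_ops() before
 * registering the device. All demo_* names are made up, RDMA_DRIVER_RXE is
 * only a placeholder id, and a real driver sets many more ops (including an
 * alloc_pd()/dealloc_pd() pair to go with the ib_pd object size below).
 */
struct demo_pd {
	struct ib_pd ibpd;	/* must be first for INIT_RDMA_OBJ_SIZE() */
};

static int demo_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index,
			   u16 *pkey)
{
	*pkey = 0xffff;		/* single default full-member P_Key */
	return 0;
}

static const struct ib_device_ops demo_dev_ops = {
	.owner = THIS_MODULE,
	.driver_id = RDMA_DRIVER_RXE,	/* placeholder, see above */
	.uverbs_abi_ver = 1,
	.query_pkey = demo_query_pkey,
	INIT_RDMA_OBJ_SIZE(ib_pd, demo_pd, ibpd),
};

static __maybe_unused void demo_init_device(struct ib_device *ibdev)
{
	/* Ops already set in ibdev->ops are left untouched by the merge. */
	ib_set_device_ops(ibdev, &demo_dev_ops);
}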
2673
2674static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
2675        [RDMA_NL_LS_OP_RESOLVE] = {
2676                .doit = ib_nl_handle_resolve_resp,
2677                .flags = RDMA_NL_ADMIN_PERM,
2678        },
2679        [RDMA_NL_LS_OP_SET_TIMEOUT] = {
2680                .doit = ib_nl_handle_set_timeout,
2681                .flags = RDMA_NL_ADMIN_PERM,
2682        },
2683        [RDMA_NL_LS_OP_IP_RESOLVE] = {
2684                .doit = ib_nl_handle_ip_res_resp,
2685                .flags = RDMA_NL_ADMIN_PERM,
2686        },
2687};
2688
2689static int __init ib_core_init(void)
2690{
2691        int ret;
2692
2693        ib_wq = alloc_workqueue("infiniband", 0, 0);
2694        if (!ib_wq)
2695                return -ENOMEM;
2696
2697        ib_comp_wq = alloc_workqueue("ib-comp-wq",
2698                        WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
2699        if (!ib_comp_wq) {
2700                ret = -ENOMEM;
2701                goto err;
2702        }
2703
2704        ib_comp_unbound_wq =
2705                alloc_workqueue("ib-comp-unb-wq",
2706                                WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
2707                                WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
2708        if (!ib_comp_unbound_wq) {
2709                ret = -ENOMEM;
2710                goto err_comp;
2711        }
2712
2713        ret = class_register(&ib_class);
2714        if (ret) {
2715                pr_warn("Couldn't create InfiniBand device class\n");
2716                goto err_comp_unbound;
2717        }
2718
2719        rdma_nl_init();
2720
2721        ret = addr_init();
2722        if (ret) {
2723                 pr_warn("Couldn't init IB address resolution\n");
2724                goto err_ibnl;
2725        }
2726
2727        ret = ib_mad_init();
2728        if (ret) {
2729                pr_warn("Couldn't init IB MAD\n");
2730                goto err_addr;
2731        }
2732
2733        ret = ib_sa_init();
2734        if (ret) {
2735                pr_warn("Couldn't init SA\n");
2736                goto err_mad;
2737        }
2738
2739        ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
2740        if (ret) {
2741                pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
2742                goto err_sa;
2743        }
2744
2745        ret = register_pernet_device(&rdma_dev_net_ops);
2746        if (ret) {
2747                pr_warn("Couldn't init compat dev. ret %d\n", ret);
2748                goto err_compat;
2749        }
2750
2751        nldev_init();
2752        rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
2753        roce_gid_mgmt_init();
2754
2755        return 0;
2756
2757err_compat:
2758        unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
2759err_sa:
2760        ib_sa_cleanup();
2761err_mad:
2762        ib_mad_cleanup();
2763err_addr:
2764        addr_cleanup();
2765err_ibnl:
2766        class_unregister(&ib_class);
2767err_comp_unbound:
2768        destroy_workqueue(ib_comp_unbound_wq);
2769err_comp:
2770        destroy_workqueue(ib_comp_wq);
2771err:
2772        destroy_workqueue(ib_wq);
2773        return ret;
2774}
2775
2776static void __exit ib_core_cleanup(void)
2777{
2778        roce_gid_mgmt_cleanup();
2779        nldev_exit();
2780        rdma_nl_unregister(RDMA_NL_LS);
2781        unregister_pernet_device(&rdma_dev_net_ops);
2782        unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
2783        ib_sa_cleanup();
2784        ib_mad_cleanup();
2785        addr_cleanup();
2786        rdma_nl_exit();
2787        class_unregister(&ib_class);
2788        destroy_workqueue(ib_comp_unbound_wq);
2789        destroy_workqueue(ib_comp_wq);
2790        /* Make sure that any pending umem accounting work is done. */
2791        destroy_workqueue(ib_wq);
2792        flush_workqueue(system_unbound_wq);
2793        WARN_ON(!xa_empty(&clients));
2794        WARN_ON(!xa_empty(&devices));
2795}
2796
2797MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
2798
2799/* ib core relies on netdev stack to first register net_ns_type_operations
2800 * ns kobject type before ib_core initialization.
2801 */
2802fs_initcall(ib_core_init);
2803module_exit(ib_core_cleanup);
2804