linux/drivers/infiniband/sw/siw/siw_main.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2
   3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
   4/* Copyright (c) 2008-2019, IBM Corporation */
   5
   6#include <linux/init.h>
   7#include <linux/errno.h>
   8#include <linux/netdevice.h>
   9#include <linux/inetdevice.h>
  10#include <net/net_namespace.h>
  11#include <linux/rtnetlink.h>
  12#include <linux/if_arp.h>
  13#include <linux/list.h>
  14#include <linux/kernel.h>
  15#include <linux/sched.h>
  16#include <linux/module.h>
  17#include <linux/dma-mapping.h>
  18
  19#include <net/addrconf.h>
  20#include <rdma/ib_verbs.h>
  21#include <rdma/ib_user_verbs.h>
  22#include <rdma/rdma_netlink.h>
  23#include <linux/kthread.h>
  24
  25#include "siw.h"
  26#include "siw_verbs.h"
  27
  28MODULE_AUTHOR("Bernard Metzler");
  29MODULE_DESCRIPTION("Software iWARP Driver");
  30MODULE_LICENSE("Dual BSD/GPL");
  31
  32/* Transmit from user buffer, if possible */
  33const bool zcopy_tx = true;
  34
  35/* Restrict usage of GSO, if hardware peer iwarp is unable to process
  36 * large packets. try_gso = true lets siw try to use local GSO,
  37 * if peer agrees.  Not using GSO severely limits siw maximum tx bandwidth.
  38 */
  39const bool try_gso;
  40
  41/* Attach siw also with loopback devices (see siw_dev_qualified()) */
  42const bool loopback_enabled = true;
  43
  44/* If true, we try to negotiate CRC on; module load then fails when
  45 * the crc32c algorithm cannot be allocated (see siw_init_module()).
  45 */
  45const bool mpa_crc_required;
  46
  47/* MPA CRC on/off enforced */
  48const bool mpa_crc_strict;
  49
  50/* Control TCP_NODELAY socket option */
  51const bool siw_tcp_nagle;
  52
  53/* Select MPA version to be used during connection setup */
  54u_char mpa_version = MPA_REVISION_2;
  55
  56/* Selects MPA P2P mode (additional handshake during connection
  57 * setup), if true.
  58 */
  59const bool peer_to_peer;
  60
    /* TX kthreads, indexed by CPU number; NULL where no thread was started */
  61struct task_struct *siw_tx_thread[NR_CPUS];
    /* crc32c transform allocated in siw_init_module(); NULL if unavailable */
  62struct crypto_shash *siw_crypto_shash;
  63
  64static int siw_device_register(struct siw_device *sdev, const char *name)
  65{
  66        struct ib_device *base_dev = &sdev->base_dev;
  67        static int dev_id = 1;
  68        int rv;
  69
  70        sdev->vendor_part_id = dev_id++;
  71
  72        rv = ib_register_device(base_dev, name, NULL);
  73        if (rv) {
  74                pr_warn("siw: device registration error %d\n", rv);
  75                return rv;
  76        }
  77
  78        siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr);
  79
  80        return 0;
  81}
  82
  83static void siw_device_cleanup(struct ib_device *base_dev)
  84{
  85        struct siw_device *sdev = to_siw_dev(base_dev);
  86
  87        xa_destroy(&sdev->qp_xa);
  88        xa_destroy(&sdev->mem_xa);
  89}
  90
  91static int siw_create_tx_threads(void)
  92{
  93        int cpu, assigned = 0;
  94
  95        for_each_online_cpu(cpu) {
  96                /* Skip HT cores */
  97                if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
  98                        continue;
  99
 100                siw_tx_thread[cpu] =
 101                        kthread_create(siw_run_sq, (unsigned long *)(long)cpu,
 102                                       "siw_tx/%d", cpu);
 103                if (IS_ERR(siw_tx_thread[cpu])) {
 104                        siw_tx_thread[cpu] = NULL;
 105                        continue;
 106                }
 107                kthread_bind(siw_tx_thread[cpu], cpu);
 108
 109                wake_up_process(siw_tx_thread[cpu]);
 110                assigned++;
 111        }
 112        return assigned;
 113}
 114
 115static int siw_dev_qualified(struct net_device *netdev)
 116{
 117        /*
 118         * Additional hardware support can be added here
 119         * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
 120         * <linux/if_arp.h> for type identifiers.
 121         */
 122        if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
 123            (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
 124                return 1;
 125
 126        return 0;
 127}
 128
 129static DEFINE_PER_CPU(atomic_t, siw_use_cnt); /* per-CPU use count: raised by siw_get_tx_cpu(), dropped by siw_put_tx_cpu() */
 130
    /*
     * Per-NUMA-node CPU sets consulted by siw_get_tx_cpu(). Set up by
     * siw_init_cpulist() and released by siw_destroy_cpulist().
     */
 131static struct {
 132        struct cpumask **tx_valid_cpus; /* one mask per node id, filled with the possible CPUs of that node */
 133        int num_nodes; /* number of entries in tx_valid_cpus */
 134} siw_cpu_info;
 135
 136static int siw_init_cpulist(void)
 137{
 138        int i, num_nodes = nr_node_ids;
 139
 140        memset(siw_tx_thread, 0, sizeof(siw_tx_thread));
 141
 142        siw_cpu_info.num_nodes = num_nodes;
 143
 144        siw_cpu_info.tx_valid_cpus =
 145                kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
 146        if (!siw_cpu_info.tx_valid_cpus) {
 147                siw_cpu_info.num_nodes = 0;
 148                return -ENOMEM;
 149        }
 150        for (i = 0; i < siw_cpu_info.num_nodes; i++) {
 151                siw_cpu_info.tx_valid_cpus[i] =
 152                        kzalloc(sizeof(struct cpumask), GFP_KERNEL);
 153                if (!siw_cpu_info.tx_valid_cpus[i])
 154                        goto out_err;
 155
 156                cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
 157        }
 158        for_each_possible_cpu(i)
 159                cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);
 160
 161        return 0;
 162
 163out_err:
 164        siw_cpu_info.num_nodes = 0;
 165        while (--i >= 0)
 166                kfree(siw_cpu_info.tx_valid_cpus[i]);
 167        kfree(siw_cpu_info.tx_valid_cpus);
 168        siw_cpu_info.tx_valid_cpus = NULL;
 169
 170        return -ENOMEM;
 171}
 172
 173static void siw_destroy_cpulist(void)
 174{
 175        int i = 0;
 176
 177        while (i < siw_cpu_info.num_nodes)
 178                kfree(siw_cpu_info.tx_valid_cpus[i++]);
 179
 180        kfree(siw_cpu_info.tx_valid_cpus);
 181}
 182
 183/*
 184 * Choose CPU with least number of active QP's from NUMA node of
 185 * TX interface.
 186 */
 187int siw_get_tx_cpu(struct siw_device *sdev)
 188{
 189        const struct cpumask *tx_cpumask;
 190        int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;
 191
 192        if (node < 0)
 193                tx_cpumask = cpu_online_mask;
 194        else
 195                tx_cpumask = siw_cpu_info.tx_valid_cpus[node];
 196
 197        num_cpus = cpumask_weight(tx_cpumask);
 198        if (!num_cpus) {
 199                /* no CPU on this NUMA node */
 200                tx_cpumask = cpu_online_mask;
 201                num_cpus = cpumask_weight(tx_cpumask);
 202        }
 203        if (!num_cpus)
 204                goto out;
 205
 206        cpu = cpumask_first(tx_cpumask);
 207
 208        for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
 209             i++, cpu = cpumask_next(cpu, tx_cpumask)) {
 210                int usage;
 211
 212                /* Skip any cores which have no TX thread */
 213                if (!siw_tx_thread[cpu])
 214                        continue;
 215
 216                usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
 217                if (usage <= min_use) {
 218                        tx_cpu = cpu;
 219                        min_use = usage;
 220                }
 221        }
 222        siw_dbg(&sdev->base_dev,
 223                "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);
 224
 225out:
 226        if (tx_cpu >= 0)
 227                atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
 228        else
 229                pr_warn("siw: no tx cpu found\n");
 230
 231        return tx_cpu;
 232}
 233
 234void siw_put_tx_cpu(int cpu)
 235{
 236        atomic_dec(&per_cpu(siw_use_cnt, cpu));
 237}
 238
 239static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
 240{
 241        struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);
 242
 243        if (qp) {
 244                /*
 245                 * siw_qp_id2obj() increments object reference count
 246                 */
 247                siw_qp_put(qp);
 248                return &qp->base_qp;
 249        }
 250        return NULL;
 251}
 252
 253static const struct ib_device_ops siw_device_ops = {
            /*
             * Entry points the siw driver hands to the RDMA core; installed
             * on every siw device by siw_device_create() through
             * ib_set_device_ops().
             */
 254        .owner = THIS_MODULE,
 255        .uverbs_abi_ver = SIW_ABI_VERSION,
 256        .driver_id = RDMA_DRIVER_SIW,
 257
            /* verbs and iWARP connection management callbacks */
 258        .alloc_mr = siw_alloc_mr,
 259        .alloc_pd = siw_alloc_pd,
 260        .alloc_ucontext = siw_alloc_ucontext,
 261        .create_cq = siw_create_cq,
 262        .create_qp = siw_create_qp,
 263        .create_srq = siw_create_srq,
 264        .dealloc_driver = siw_device_cleanup, /* destroys the device's xarrays */
 265        .dealloc_pd = siw_dealloc_pd,
 266        .dealloc_ucontext = siw_dealloc_ucontext,
 267        .dereg_mr = siw_dereg_mr,
 268        .destroy_cq = siw_destroy_cq,
 269        .destroy_qp = siw_destroy_qp,
 270        .destroy_srq = siw_destroy_srq,
 271        .get_dma_mr = siw_get_dma_mr,
 272        .get_port_immutable = siw_get_port_immutable,
 273        .iw_accept = siw_accept,
 274        .iw_add_ref = siw_qp_get_ref,
 275        .iw_connect = siw_connect,
 276        .iw_create_listen = siw_create_listen,
 277        .iw_destroy_listen = siw_destroy_listen,
 278        .iw_get_qp = siw_get_base_qp, /* QP id to ib_qp lookup */
 279        .iw_reject = siw_reject,
 280        .iw_rem_ref = siw_qp_put_ref,
 281        .map_mr_sg = siw_map_mr_sg,
 282        .mmap = siw_mmap,
 283        .mmap_free = siw_mmap_free,
 284        .modify_qp = siw_verbs_modify_qp,
 285        .modify_srq = siw_modify_srq,
 286        .poll_cq = siw_poll_cq,
 287        .post_recv = siw_post_receive,
 288        .post_send = siw_post_send,
 289        .post_srq_recv = siw_post_srq_recv,
 290        .query_device = siw_query_device,
 291        .query_gid = siw_query_gid,
 292        .query_port = siw_query_port,
 293        .query_qp = siw_query_qp,
 294        .query_srq = siw_query_srq,
 295        .req_notify_cq = siw_req_notify_cq,
 296        .reg_user_mr = siw_reg_user_mr,
 297
            /*
             * NOTE(review): presumably tells the RDMA core how large the siw
             * structures embedding each core object are, so that the core
             * can allocate them - confirm against <rdma/ib_verbs.h>.
             */
 298        INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
 299        INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
 300        INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp),
 301        INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
 302        INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
 303};
 304
 305static struct siw_device *siw_device_create(struct net_device *netdev)
 306{
 307        struct siw_device *sdev = NULL;
 308        struct ib_device *base_dev;
 309        int rv;
 310
 311        sdev = ib_alloc_device(siw_device, base_dev);
 312        if (!sdev)
 313                return NULL;
 314
 315        base_dev = &sdev->base_dev;
 316
 317        sdev->netdev = netdev;
 318
 319        if (netdev->type != ARPHRD_LOOPBACK) {
 320                addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
 321                                    netdev->dev_addr);
 322        } else {
 323                /*
 324                 * The loopback device does not have a HW address,
 325                 * but connection mangagement lib expects gid != 0
 326                 */
 327                size_t len = min_t(size_t, strlen(base_dev->name), 6);
 328                char addr[6] = { };
 329
 330                memcpy(addr, base_dev->name, len);
 331                addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
 332                                    addr);
 333        }
 334
 335        base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND);
 336
 337        base_dev->node_type = RDMA_NODE_RNIC;
 338        memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
 339               sizeof(SIW_NODE_DESC_COMMON));
 340
 341        /*
 342         * Current model (one-to-one device association):
 343         * One Softiwarp device per net_device or, equivalently,
 344         * per physical port.
 345         */
 346        base_dev->phys_port_cnt = 1;
 347        base_dev->num_comp_vectors = num_possible_cpus();
 348
 349        xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
 350        xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);
 351
 352        ib_set_device_ops(base_dev, &siw_device_ops);
 353        rv = ib_device_set_netdev(base_dev, netdev, 1);
 354        if (rv)
 355                goto error;
 356
 357        memcpy(base_dev->iw_ifname, netdev->name,
 358               sizeof(base_dev->iw_ifname));
 359
 360        /* Disable TCP port mapping */
 361        base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;
 362
 363        sdev->attrs.max_qp = SIW_MAX_QP;
 364        sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
 365        sdev->attrs.max_ord = SIW_MAX_ORD_QP;
 366        sdev->attrs.max_ird = SIW_MAX_IRD_QP;
 367        sdev->attrs.max_sge = SIW_MAX_SGE;
 368        sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
 369        sdev->attrs.max_cq = SIW_MAX_CQ;
 370        sdev->attrs.max_cqe = SIW_MAX_CQE;
 371        sdev->attrs.max_mr = SIW_MAX_MR;
 372        sdev->attrs.max_pd = SIW_MAX_PD;
 373        sdev->attrs.max_mw = SIW_MAX_MW;
 374        sdev->attrs.max_srq = SIW_MAX_SRQ;
 375        sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
 376        sdev->attrs.max_srq_sge = SIW_MAX_SGE;
 377
 378        INIT_LIST_HEAD(&sdev->cep_list);
 379        INIT_LIST_HEAD(&sdev->qp_list);
 380
 381        atomic_set(&sdev->num_ctx, 0);
 382        atomic_set(&sdev->num_srq, 0);
 383        atomic_set(&sdev->num_qp, 0);
 384        atomic_set(&sdev->num_cq, 0);
 385        atomic_set(&sdev->num_mr, 0);
 386        atomic_set(&sdev->num_pd, 0);
 387
 388        sdev->numa_node = dev_to_node(&netdev->dev);
 389        spin_lock_init(&sdev->lock);
 390
 391        return sdev;
 392error:
 393        ib_dealloc_device(base_dev);
 394
 395        return NULL;
 396}
 397
 398/*
 399 * Network link becomes unavailable. Mark all
 400 * affected QP's accordingly.
 401 */
 402static void siw_netdev_down(struct work_struct *work)
 403{
 404        struct siw_device *sdev =
 405                container_of(work, struct siw_device, netdev_down);
 406
 407        struct siw_qp_attrs qp_attrs;
 408        struct list_head *pos, *tmp;
 409
 410        memset(&qp_attrs, 0, sizeof(qp_attrs));
 411        qp_attrs.state = SIW_QP_STATE_ERROR;
 412
 413        list_for_each_safe(pos, tmp, &sdev->qp_list) {
 414                struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);
 415
 416                down_write(&qp->state_lock);
 417                WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
 418                up_write(&qp->state_lock);
 419        }
 420        ib_device_put(&sdev->base_dev);
 421}
 422
 423static void siw_device_goes_down(struct siw_device *sdev)
 424{
 425        if (ib_device_try_get(&sdev->base_dev)) {
 426                INIT_WORK(&sdev->netdev_down, siw_netdev_down);
 427                schedule_work(&sdev->netdev_down);
 428        }
 429}
 430
 431static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
 432                            void *arg)
 433{
 434        struct net_device *netdev = netdev_notifier_info_to_dev(arg);
 435        struct ib_device *base_dev;
 436        struct siw_device *sdev;
 437
 438        dev_dbg(&netdev->dev, "siw: event %lu\n", event);
 439
 440        if (dev_net(netdev) != &init_net)
 441                return NOTIFY_OK;
 442
 443        base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
 444        if (!base_dev)
 445                return NOTIFY_OK;
 446
 447        sdev = to_siw_dev(base_dev);
 448
 449        switch (event) {
 450        case NETDEV_UP:
 451                sdev->state = IB_PORT_ACTIVE;
 452                siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
 453                break;
 454
 455        case NETDEV_GOING_DOWN:
 456                siw_device_goes_down(sdev);
 457                break;
 458
 459        case NETDEV_DOWN:
 460                sdev->state = IB_PORT_DOWN;
 461                siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
 462                break;
 463
 464        case NETDEV_REGISTER:
 465                /*
 466                 * Device registration now handled only by
 467                 * rdma netlink commands. So it shall be impossible
 468                 * to end up here with a valid siw device.
 469                 */
 470                siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
 471                break;
 472
 473        case NETDEV_UNREGISTER:
 474                ib_unregister_device_queued(&sdev->base_dev);
 475                break;
 476
 477        case NETDEV_CHANGEADDR:
 478                siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
 479                break;
 480        /*
 481         * Todo: Below netdev events are currently not handled.
 482         */
 483        case NETDEV_CHANGEMTU:
 484        case NETDEV_CHANGE:
 485                break;
 486
 487        default:
 488                break;
 489        }
 490        ib_device_put(&sdev->base_dev);
 491
 492        return NOTIFY_OK;
 493}
 494
 495static struct notifier_block siw_netdev_nb = {
            /*
             * Registered by siw_init_module() and unregistered by
             * siw_exit_module(); routes netdevice events to siw.
             */
 496        .notifier_call = siw_netdev_event,
 497};
 498
 499static int siw_newlink(const char *basedev_name, struct net_device *netdev)
 500{
 501        struct ib_device *base_dev;
 502        struct siw_device *sdev = NULL;
 503        int rv = -ENOMEM;
 504
 505        if (!siw_dev_qualified(netdev))
 506                return -EINVAL;
 507
 508        base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
 509        if (base_dev) {
 510                ib_device_put(base_dev);
 511                return -EEXIST;
 512        }
 513        sdev = siw_device_create(netdev);
 514        if (sdev) {
 515                dev_dbg(&netdev->dev, "siw: new device\n");
 516
 517                if (netif_running(netdev) && netif_carrier_ok(netdev))
 518                        sdev->state = IB_PORT_ACTIVE;
 519                else
 520                        sdev->state = IB_PORT_DOWN;
 521
 522                rv = siw_device_register(sdev, basedev_name);
 523                if (rv)
 524                        ib_dealloc_device(&sdev->base_dev);
 525        }
 526        return rv;
 527}
 528
 529static struct rdma_link_ops siw_link_ops = {
            /*
             * Registered with rdma_link_register() in siw_init_module() and
             * removed with rdma_link_unregister() in siw_exit_module().
             */
 530        .type = "siw",
 531        .newlink = siw_newlink, /* creates and registers a siw device on a netdev */
 532};
 533
 534/*
 535 * siw_init_module - Initialize Softiwarp module and register with netdev
 536 *                   subsystem.
 537 */
 538static __init int siw_init_module(void)
 539{
 540        int rv;
 541        int nr_cpu;
 542
 543        if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
 544                pr_info("siw: sendpage threshold too small: %u\n",
 545                        (int)SENDPAGE_THRESH);
 546                rv = -EINVAL;
 547                goto out_error;
 548        }
 549        rv = siw_init_cpulist();
 550        if (rv)
 551                goto out_error;
 552
 553        rv = siw_cm_init();
 554        if (rv)
 555                goto out_error;
 556
 557        if (!siw_create_tx_threads()) {
 558                pr_info("siw: Could not start any TX thread\n");
 559                rv = -ENOMEM;
 560                goto out_error;
 561        }
 562        /*
 563         * Locate CRC32 algorithm. If unsuccessful, fail
 564         * loading siw only, if CRC is required.
 565         */
 566        siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
 567        if (IS_ERR(siw_crypto_shash)) {
 568                pr_info("siw: Loading CRC32c failed: %ld\n",
 569                        PTR_ERR(siw_crypto_shash));
 570                siw_crypto_shash = NULL;
 571                if (mpa_crc_required) {
 572                        rv = -EOPNOTSUPP;
 573                        goto out_error;
 574                }
 575        }
 576        rv = register_netdevice_notifier(&siw_netdev_nb);
 577        if (rv)
 578                goto out_error;
 579
 580        rdma_link_register(&siw_link_ops);
 581
 582        pr_info("SoftiWARP attached\n");
 583        return 0;
 584
 585out_error:
 586        for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
 587                if (siw_tx_thread[nr_cpu]) {
 588                        siw_stop_tx_thread(nr_cpu);
 589                        siw_tx_thread[nr_cpu] = NULL;
 590                }
 591        }
 592        if (siw_crypto_shash)
 593                crypto_free_shash(siw_crypto_shash);
 594
 595        pr_info("SoftIWARP attach failed. Error: %d\n", rv);
 596
 597        siw_cm_exit();
 598        siw_destroy_cpulist();
 599
 600        return rv;
 601}
 602
 603static void __exit siw_exit_module(void)
 604{
 605        int cpu;
 606
 607        for_each_possible_cpu(cpu) {
 608                if (siw_tx_thread[cpu]) {
 609                        siw_stop_tx_thread(cpu);
 610                        siw_tx_thread[cpu] = NULL;
 611                }
 612        }
 613        unregister_netdevice_notifier(&siw_netdev_nb);
 614        rdma_link_unregister(&siw_link_ops);
 615        ib_unregister_driver(RDMA_DRIVER_SIW);
 616
 617        siw_cm_exit();
 618
 619        siw_destroy_cpulist();
 620
 621        if (siw_crypto_shash)
 622                crypto_free_shash(siw_crypto_shash);
 623
 624        pr_info("SoftiWARP detached\n");
 625}
 626
 627module_init(siw_init_module);
 628module_exit(siw_exit_module);
 629
    /*
     * NOTE(review): presumably lets the module be requested by its rdma
     * link type name, matching siw_link_ops.type - confirm against
     * <rdma/rdma_netlink.h>.
     */
 630MODULE_ALIAS_RDMA_LINK("siw");
 631